├── .coveragerc
├── .github
├── ISSUE_TEMPLATE
│ ├── ---bug-report.md
│ ├── ---feature-request.md
│ └── --questions---help.md
├── PULL_REQUEST_TEMPLATE.md
└── workflows
│ └── python-publish.yml
├── .gitignore
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── azure-pipelines.yml
├── ci
└── install.sh
├── conda
└── meta.yaml
├── examples
├── README.md
├── basic_functions.ipynb
├── cytoscape_vis.ipynb
├── example.sdf
└── scaffold_graphs.ipynb
├── img
└── scaffoldgraph.jpg
├── requirements.txt
├── scaffoldgraph
├── __init__.py
├── analysis
│ ├── __init__.py
│ ├── diversity.py
│ ├── enrichment.py
│ ├── frequency.py
│ ├── general.py
│ └── representation.py
├── core
│ ├── __init__.py
│ ├── fragment.py
│ ├── graph.py
│ └── scaffold.py
├── io
│ ├── __init__.py
│ ├── dataframe.py
│ ├── sdf.py
│ ├── smiles.py
│ ├── supplier.py
│ └── tsv.py
├── network.py
├── prioritization
│ ├── __init__.py
│ ├── generic_rules.py
│ ├── original_rules.py
│ ├── prioritization_rules.py
│ ├── prioritization_ruleset.py
│ └── rule_io.py
├── scripts
│ ├── __init__.py
│ ├── generate.py
│ ├── misc.py
│ ├── operations.py
│ └── run.py
├── tree.py
├── utils
│ ├── __init__.py
│ ├── aggregate.py
│ ├── bipartite.py
│ ├── cache.py
│ ├── logging.py
│ ├── misc.py
│ └── subset.py
└── vis
│ ├── __init__.py
│ ├── base.py
│ ├── notebook
│ ├── __init__.py
│ ├── cytoscape.py
│ └── resources
│ │ └── cytoscape.json
│ └── utils.py
├── setup.cfg
├── setup.py
└── tests
├── __init__.py
├── analysis
├── __init__.py
├── test_enrichment.py
├── test_general.py
└── test_representation.py
├── core
├── test_fragment.py
├── test_graph.py
└── test_scaffold.py
├── data
└── test_smiles.smi
├── prioritization
├── __init__.py
├── test_generic_rules.py
├── test_original_rules.py
├── test_prioritization_rules.py
└── test_prioritization_ruleset.py
├── scripts
└── test_scripts.py
├── test_network.py
├── test_tree.py
├── utils
├── __init__.py
├── test_aggregate.py
├── test_bipartite.py
├── test_misc.py
└── test_subset.py
└── vis
├── __init__.py
├── test_notebook.py
└── test_vis_utils.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 | scaffoldgraph/vis/notebook/*,
4 | scaffoldgraph/scripts/misc.py,
5 |
6 | [report]
7 | exclude_lines =
8 | pragma: no cover
9 | logger.*
10 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/---bug-report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "\U0001F41B Bug report"
3 | about: Create a report to help us improve ScaffoldGraph
4 | title: "[BUG]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behaviour:
15 | 1.
16 | 2.
17 | 3.
18 |
19 | **Expected behaviour**
20 | A clear and concise description of what you expected to happen.
21 |
22 | **Desktop (please complete the following information):**
23 | - OS: [e.g. iOS]
24 | - Version [e.g. 22]
25 |
26 | **Additional context**
27 | Add any other context about the problem here.
28 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/---feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "\U0001F680 Feature request"
3 | about: Suggest an idea for this project
4 | title: "[F]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/--questions---help.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "❓ Questions & Help"
3 | about: Start a discussion related to ScaffoldGraph
4 | title: "[Q]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## ❓ Questions & Help
11 |
12 |
13 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | #### Reference Issue
5 |
6 |
7 |
8 | #### What does this implement/fix?
9 |
10 |
11 | #### Any other comments?
12 |
13 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | name: Upload to PyPi
2 |
3 | on:
4 | release:
5 | types: [created]
6 |
7 | jobs:
8 | deploy:
9 |
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | - uses: actions/checkout@v2
14 | - name: Set up Python
15 | uses: actions/setup-python@v2
16 | with:
17 | python-version: '3.x'
18 | - name: Install dependencies
19 | run: |
20 | python -m pip install --upgrade pip
21 | pip install setuptools wheel twine
22 | - name: Build and publish
23 | env:
24 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
25 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
26 | run: |
27 | python setup.py sdist bdist_wheel
28 | twine upload dist/*
29 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # PyCharm project settings
98 | .idea
99 |
100 | # Rope project settings
101 | .ropeproject
102 |
103 | # mkdocs documentation
104 | /site
105 |
106 | # mypy
107 | .mypy_cache/
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6"
4 | - "3.7"
5 | - "3.8"
6 |
7 | install:
8 | - source ci/install.sh
9 |
10 | script:
11 | - pytest
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at oliver.scott.17@ucl.ac.uk. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ## Contributing
2 |
3 | Contributions to ScaffoldGraph will most likely fall into the following categories:
4 |
5 | 1. Implementing a new Feature:
6 | * New Features that fit into the scope of this package will be accepted. If you are unsure about the
7 | idea/design/implementation, feel free to post an issue.
8 | 2. Fixing a Bug:
9 | * Bug fixes are welcomed, please send a Pull Request each time a bug is encountered. When sending a Pull
10 | Request please provide a clear description of the encountered bug. If unsure, feel free to post an issue.
11 |
12 | Please send Pull Requests to:
13 | http://github.com/UCLCheminformatics/scaffoldgraph
14 |
15 | ### Testing
16 |
17 | ScaffoldGraph's tests are located under `tests/`. Run all tests using:
18 |
19 | ```
20 | $ python setup.py test
21 | ```
22 |
23 | or run an individual test: `pytest --no-cov tests/core`
24 |
25 | When contributing new features, please include appropriate test files.
26 |
27 | ### Continuous Integration
28 |
29 | ScaffoldGraph uses Travis CI for continuous integration.
30 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 OliverBScott
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.md
2 | include LICENSE
3 | include setup.py
4 | include requirements.txt
5 | include scaffoldgraph/*.py
6 | include scaffoldgraph/*/*.py
7 | include tests/data/test_smiles.smi
8 | include scaffoldgraph/vis/notebook/resources/*
9 | recursive-include tests *.py
10 |
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | # Python package
2 | # Create and test a Python package on multiple Python versions.
3 | # Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
4 | # https://docs.microsoft.com/azure/devops/pipelines/languages/python
5 |
6 | trigger:
7 | - main
8 |
9 | pool:
10 | vmImage: 'ubuntu-latest'
11 | strategy:
12 | matrix:
13 | Python36:
14 | python.version: '3.6'
15 | Python37:
16 | python.version: '3.7'
17 | Python38:
18 | python.version: '3.8'
19 |
20 | steps:
21 | - bash: echo "##vso[task.prependpath]$CONDA/bin"
22 | displayName: Add conda to PATH
23 |
24 | - bash: |
25 | conda config --set always_yes yes --set changeps1 no
26 | conda config --add channels conda-forge
27 | conda install -q mamba
28 | mamba create -q -n $(conda_env) python=$(python.version)
29 | displayName: Create conda environment
30 |
31 | - bash: |
32 | source activate $(conda_env)
33 | mamba install -q --file ./requirements.txt
34 | python setup.py install
35 | displayName: Install package
36 |
37 | - bash: |
38 | source activate $(conda_env)
39 | mamba install -q pytest pytest-cov
40 | pytest --cov=scaffoldgraph -v -s tests/
41 | displayName: Run tests
--------------------------------------------------------------------------------
/ci/install.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | set -e
4 |
5 | # Retrieve the latest miniconda distribution for linux
6 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
7 |
8 | # Install miniconda
9 | bash miniconda.sh -b -p $HOME/miniconda
10 | export PATH="$HOME/miniconda/bin:$PATH"
11 |
12 | # Configure conda
13 | conda config --set always_yes yes --set changeps1 no
14 | conda update -q conda
15 | conda info -a
16 | conda config --add channels conda-forge
17 | conda create -q -n travis_env python=$TRAVIS_PYTHON_VERSION
18 | source activate travis_env
19 |
20 | # Install
21 | conda install --file $TRAVIS_BUILD_DIR/requirements.txt
22 | python $TRAVIS_BUILD_DIR/setup.py install
23 |
--------------------------------------------------------------------------------
/conda/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set name = "ScaffoldGraph" %}
2 | {% set version = "1.1.2" %}
3 |
4 | package:
5 | name: {{ name|lower }}
6 | version: {{ version }}
7 |
8 | source:
9 | url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz
10 | sha256: 53bb45b59302c4f0fca8b6b380d79a8cd82f954727e22ec08015f4c6f3ea7640
11 |
12 | build:
13 | noarch: python
14 | number: 0
15 | skip: True # [py<35]
16 | script: "{{ PYTHON }} -m pip install . -vv"
17 |
18 | requirements:
19 | build:
20 | - python
21 | - pip
22 | - pytest-runner
23 | run:
24 | - python
25 | - rdkit
26 | - networkx
27 | - matplotlib
28 | - loguru
29 | - tqdm
30 | - scipy >=1.3.1
31 |
32 | test:
33 | imports:
34 | - scaffoldgraph
35 | - scaffoldgraph.network
36 | - scaffoldgraph.tree
37 | - scaffoldgraph.analysis
38 | - scaffoldgraph.core
39 | - scaffoldgraph.prioritization
40 | - scaffoldgraph.utils
41 | - scaffoldgraph.io
42 | commands:
43 | - scaffoldgraph --help
44 |
45 | about:
46 | home: https://github.com/UCLCheminformatics/ScaffoldGraph
47 | license: MIT
48 | license_family: MIT
49 | license_file: LICENSE
50 | summary: 'Cheminformatics package for building scaffold networks and trees.'
51 | description: |
52 | ScaffoldGraph is an open-source cheminformatics library, built using RDKit and NetworkX,
53 | for the generation and analysis of scaffold networks and scaffold trees.
54 | doc_url: https://github.com/UCLCheminformatics/ScaffoldGraph
55 | dev_url: https://github.com/UCLCheminformatics/ScaffoldGraph
56 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # ScaffoldGraph Examples
2 |
3 | These notebooks display basic functionality of the ScaffoldGraph package.
4 |
5 | **Notebooks:**
6 | * basic_functions.ipynb (Basic functionality independent of the NetworkX API)
7 | * scaffold_graphs.ipynb (Creating scaffold networks and trees using the NetworkX API)
8 | * cytoscape_vis.ipynb (Visualizing scaffold graphs using ipycytoscape)
9 |
10 | **Data:**
11 | * example.sdf (A collection of molecules from PubChem)
12 |
--------------------------------------------------------------------------------
/examples/cytoscape_vis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# ScaffoldGraph: Cytoscape Visualization\n",
8 | "\n",
9 | "ScaffoldGraph contains a utility to visualize scaffoldgraphs interactively in Jupyter using the [ipycytoscape](https://github.com/QuantStack/ipycytoscape) module.\n",
10 | "\n",
11 | "To use the utility ipycytoscape must be installed:\n",
12 | "\n",
13 | "`pip install ipycytoscape` or `conda install -c conda-forge ipycytoscape`"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import scaffoldgraph as sg\n",
23 | "import os\n",
24 | "\n",
25 | "# Import the visualization utility.\n",
26 | "from scaffoldgraph.vis.notebook import cytoscape"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "The visualizer can be used for all scaffold graph types. Here we will use the `ScaffoldTree`."
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "data": {
43 | "application/vnd.jupyter.widget-view+json": {
44 | "model_id": "1b3cafb2bea34d4f8c77ecd284a6be91",
45 | "version_major": 2,
46 | "version_minor": 0
47 | },
48 | "text/plain": [
49 | "HBox(children=(HTML(value='ScaffoldTree'), FloatProgress(value=0.0, layout=Layout(flex='2'), max=199.0), HTML(…"
50 | ]
51 | },
52 | "metadata": {},
53 | "output_type": "display_data"
54 | },
55 | {
56 | "name": "stdout",
57 | "output_type": "stream",
58 | "text": [
59 | "\n",
60 | "Type: ScaffoldTree\n",
61 | "Number of molecule nodes: 199\n",
62 | "Number of scaffold nodes: 450\n",
63 | "Number of edges: 584\n",
64 | "Max hierarchy: 6\n",
65 | "Min hierarchy: 1\n",
66 | "\n"
67 | ]
68 | }
69 | ],
70 | "source": [
71 | "sdf_file = os.path.dirname(sg.__file__).replace('scaffoldgraph', 'examples/example.sdf')\n",
72 | "\n",
73 | "# Construct a ScaffoldTree.\n",
74 | "tree = sg.ScaffoldTree.from_sdf(sdf_file, progress=True)\n",
75 | "\n",
76 | "# Print a quick summary.\n",
77 | "print(sg.utils.summary(tree))"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "Initialize the visualizer with the scaffold tree. Note that custom styles can also be specified, allowing customisation of the visualisation. e.g. colouring molecule nodes according to activity. For more information on styles see the [ipycytoscape documentation](https://ipycytoscape.readthedocs.io/en/latest/index.html). Here we will just use the default style provided."
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 3,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "visualizer = cytoscape.CytoscapeVisualizer(tree) # initialize."
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "The visualizer allows the user to draw the whole graph or just a subset of the graph starting from a molecule or scaffold node. Given that the tree can often be very large, visualizing small subsets is often a better idea.\n",
101 | "\n",
102 | "* visualizer.draw (draw the whole graph)\n",
103 | "* visualizer.draw_for_molecule (draw a molecule and its constituent scaffolds)\n",
104 | "* visualizer.draw_for_scaffold (draw a scaffold and its parent and/or children nodes)\n",
105 | "\n",
106 | "When drawing scaffold trees edges are annotated with the rule used during prioritization."
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 4,
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "data": {
116 | "application/vnd.jupyter.widget-view+json": {
117 | "model_id": "ed1b4914099d4a00804fd6df811cab1c",
118 | "version_major": 2,
119 | "version_minor": 0
120 | },
121 | "text/plain": [
122 | "CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'nodeSpacing': 50, 'edgeLengthVal': 50}, cytoscape_style=[{…"
123 | ]
124 | },
125 | "metadata": {},
126 | "output_type": "display_data"
127 | }
128 | ],
129 | "source": [
130 | "# Draw graph for PubChem ID '1201903'.\n",
131 | "widget = visualizer.draw_for_molecule('1201903')\n",
132 | "\n",
133 | "# Widget properties can be modified after construction. \n",
134 | "widget.layout.height = '600px'\n",
135 | "\n",
136 | "# display the widget.\n",
137 | "widget"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "Using the hierarchy 1 scaffold from the last query ('1201903'), let's draw the subgraph originating from this scaffold ('c1ccncc1').\n",
145 | "\n",
146 | "When creating visualizations from a scaffold, there is an additional option `traversal` specifying the direction of subgraph creation {'child', 'parent', 'bidirectional'}. Specifying child will draw the subgraph toward the child molecules and parent will draw towards lower hierarchy scaffolds. Specifying bidirectional will construct the subgraph in both directions. The default traversal is 'child'."
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 5,
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "data": {
156 | "application/vnd.jupyter.widget-view+json": {
157 | "model_id": "ac18776655314fb5af4bb64c530e33a0",
158 | "version_major": 2,
159 | "version_minor": 0
160 | },
161 | "text/plain": [
162 | "CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'nodeSpacing': 50, 'edgeLengthVal': 50}, cytoscape_style=[{…"
163 | ]
164 | },
165 | "metadata": {},
166 | "output_type": "display_data"
167 | }
168 | ],
169 | "source": [
170 | "# Draw graph for scaffold 'c1ccncc1'\n",
171 | "widget = visualizer.draw_for_scaffold('c1ccncc1', traversal='child')\n",
172 | "widget.layout.height = '600px'\n",
173 | "widget"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "The layout can also be customised using the `layout_kwargs` argument passing arguments to the CytoscapeWidget.set_layout function. Here we change the algorithm for constructing the layout to 'breadthfirst'."
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 6,
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "data": {
190 | "application/vnd.jupyter.widget-view+json": {
191 | "model_id": "9b62090602234268a15aa01167f5ae2b",
192 | "version_major": 2,
193 | "version_minor": 0
194 | },
195 | "text/plain": [
196 | "CytoscapeWidget(cytoscape_layout={'name': 'breadthfirst', 'nodeSpacing': 50, 'edgeLengthVal': 50}, cytoscape_s…"
197 | ]
198 | },
199 | "metadata": {},
200 | "output_type": "display_data"
201 | }
202 | ],
203 | "source": [
204 | "# Draw graph for scaffold 'c1ccncc1'\n",
205 | "widget = visualizer.draw_for_scaffold('c1ccncc1', layout_kwargs={'name': 'breadthfirst'})\n",
206 | "widget.layout.height = '600px'\n",
207 | "widget"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "# Further interactivity using ipywidgets \n",
215 | "\n",
216 | "A user also has the ability to add further interactivity to the visualisations using the ipywidgets interact function, an example below shows an interactive visualisation where the user can select the molecule to display using a dropdown box and also update the layout."
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 7,
222 | "metadata": {},
223 | "outputs": [
224 | {
225 | "data": {
226 | "application/vnd.jupyter.widget-view+json": {
227 | "model_id": "bd68bd2e87a74c2a9bf0aa6824f3dad1",
228 | "version_major": 2,
229 | "version_minor": 0
230 | },
231 | "text/plain": [
232 | "interactive(children=(Dropdown(description='molecule_id', options=('1085650', '1085710', '1106125', '1113907',…"
233 | ]
234 | },
235 | "metadata": {},
236 | "output_type": "display_data"
237 | }
238 | ],
239 | "source": [
240 | "from ipywidgets import interact, fixed\n",
241 | "\n",
242 | "\n",
243 | "def draw(visualizer, molecule_id, layout):\n",
244 | " layout_kwargs = {'name': layout}\n",
245 | " widget = visualizer.draw_for_molecule(molecule_id, layout_kwargs)\n",
246 | " widget.layout.height = '600px'\n",
247 | " return widget\n",
248 | " \n",
249 | "\n",
250 | "# Get the list of all molecules in the graph and sort.\n",
251 | "mol_ids = sorted(list(tree.get_molecule_nodes()))\n",
252 | "\n",
253 | "# List of a subset of available layouts in cytoscape.\n",
254 | "layouts = ['dagre', 'breadthfirst', 'cose']\n",
255 | "\n",
256 | "# Now create the visualization.\n",
257 | "interact(\n",
258 | " draw,\n",
259 | " visualizer=fixed(visualizer),\n",
260 | " molecule_id=mol_ids,\n",
261 | " layout=layouts\n",
262 | ");"
263 | ]
264 | },
265 | {
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | "# Style Customization\n",
270 | "\n",
271 | "The user also has the ability to customize various aspects of the widget's style. For example, the nodes can be coloured based on an attribute. The example shows how to colour scaffold nodes based on the 'hierarchy' attribute."
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 8,
277 | "metadata": {},
278 | "outputs": [
279 | {
280 | "data": {
281 | "application/vnd.jupyter.widget-view+json": {
282 | "model_id": "5ff71126406b4fad84a0eaeae7262566",
283 | "version_major": 2,
284 | "version_minor": 0
285 | },
286 | "text/plain": [
287 | "CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'nodeSpacing': 50, 'edgeLengthVal': 50}, cytoscape_style=[{…"
288 | ]
289 | },
290 | "metadata": {},
291 | "output_type": "display_data"
292 | }
293 | ],
294 | "source": [
295 | "from scaffoldgraph.vis import color_scaffold_nodes_by_attribute\n",
296 | "from rdkit.Chem.Draw import rdMolDraw2D\n",
297 | "\n",
298 | "# Add a 'background-color' attribute to each scaffold node.\n",
299 | "# The cmap can either be a string or a matplotlib.cm.Colormap object.\n",
300 | "color_scaffold_nodes_by_attribute(tree, 'hierarchy', cmap='BuPu', label='color')\n",
301 | "\n",
302 | "# Create a style element for the visualizer.\n",
303 | "background_style = {\n",
304 | " \"selector\": \"node[color]\", # select nodes with an attribute 'color'\n",
305 | " \"style\": {\n",
306 | " \"background-color\": \"data(color)\", # set background-color from attribute.\n",
307 | " }\n",
308 | "}\n",
309 | "\n",
310 | "# We can also customize rdkit drawing options (need a transparent background).\n",
311 | "drawOpts = rdMolDraw2D.MolDrawOptions()\n",
312 | "drawOpts.clearBackground = False\n",
313 | "drawOpts.bondLineWidth = 4\n",
314 | "\n",
315 | "# Create a new visualizer with new options.\n",
316 | "vis = cytoscape.CytoscapeVisualizer(\n",
317 | " tree,\n",
318 | " refresh_images=True,\n",
319 | " rd_draw_options=drawOpts,\n",
320 | ")\n",
321 | "\n",
322 | "# Append the style element to the visualizer.\n",
323 | "vis.style.append(background_style)\n",
324 | "\n",
325 | "# Create visualization.\n",
326 | "widget = vis.draw_for_scaffold('c1ccncc1')\n",
327 | "widget.layout.height = '600px'\n",
328 | "widget\n"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": null,
334 | "metadata": {},
335 | "outputs": [],
336 | "source": []
337 | }
338 | ],
339 | "metadata": {
340 | "kernelspec": {
341 | "display_name": "ScaffoldGraph",
342 | "language": "python",
343 | "name": "scaffoldgraph"
344 | },
345 | "language_info": {
346 | "codemirror_mode": {
347 | "name": "ipython",
348 | "version": 3
349 | },
350 | "file_extension": ".py",
351 | "mimetype": "text/x-python",
352 | "name": "python",
353 | "nbconvert_exporter": "python",
354 | "pygments_lexer": "ipython3",
355 | "version": "3.7.6"
356 | }
357 | },
358 | "nbformat": 4,
359 | "nbformat_minor": 4
360 | }
361 |
--------------------------------------------------------------------------------
/img/scaffoldgraph.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCLCheminformatics/ScaffoldGraph/8168d739ca3783d39775fb1553f721b5a7dc437b/img/scaffoldgraph.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | rdkit
2 | networkx
3 | tqdm
4 | loguru
5 | numpy
6 | scipy>=1.3.1
7 | matplotlib>=2.2.2
8 | pytest
--------------------------------------------------------------------------------
/scaffoldgraph/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph
3 | """
4 |
5 | from loguru import logger
6 |
7 | from . import prioritization
8 | from . import utils
9 | from . import vis
10 |
11 | from .core import (
12 | get_next_murcko_fragments,
13 | get_all_murcko_fragments,
14 | get_murcko_scaffold,
15 | get_ring_toplogy_scaffold,
16 | get_ring_connectivity_scaffold,
17 | )
18 |
19 | from .network import ScaffoldNetwork, HierS
20 | from .tree import ScaffoldTree, tree_frags_from_mol
21 |
22 | __version__ = '1.1.2'
23 |
24 |
25 | __all__ = [
26 | '__version__',
27 | 'HierS',
28 | 'ScaffoldNetwork',
29 | 'ScaffoldTree',
30 | 'tree_frags_from_mol',
31 | 'get_next_murcko_fragments',
32 | 'get_all_murcko_fragments',
33 | 'get_murcko_scaffold',
34 | 'get_ring_toplogy_scaffold',
35 | 'get_ring_connectivity_scaffold',
36 | ]
37 |
38 | logger.disable(__name__)
39 |
--------------------------------------------------------------------------------
/scaffoldgraph/analysis/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.analysis
3 |
4 | The analysis package contains functions for analyzing ScaffoldGraphs
5 | """
6 |
7 | from .representation import calc_average_pairwise_similarity, get_over_represented_scaffold_classes
8 | from .enrichment import calc_scaffold_enrichment, compound_set_enrichment
9 | from .general import get_virtual_scaffolds, get_singleton_scaffolds
10 | from .diversity import diversity_pick_for_scaffold_class
11 | from .frequency import cumulative_scaffold_frequency, area_under_curve
12 |
13 |
14 | __all__ = [
15 | 'calc_average_pairwise_similarity',
16 | 'get_over_represented_scaffold_classes',
17 | 'calc_scaffold_enrichment',
18 | 'compound_set_enrichment',
19 | 'get_virtual_scaffolds',
20 | 'get_singleton_scaffolds',
21 | 'diversity_pick_for_scaffold_class',
22 | 'cumulative_scaffold_frequency',
23 | 'area_under_curve'
24 | ]
25 |
--------------------------------------------------------------------------------
/scaffoldgraph/analysis/diversity.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.analysis.diversity
3 | """
4 |
5 | from rdkit.SimDivFilters.rdSimDivPickers import LeaderPicker
6 | from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
7 | from rdkit.Chem import MolFromSmiles
8 |
9 | from functools import partial
10 |
11 |
12 | def _form_dist_func(dist_func, fps):
13 | """function: create a partial dist_func."""
14 | if dist_func.__code__.co_argcount != 3:
15 | raise ValueError('dist_func must have three arguments: i, j, fps')
16 | if dist_func.__code__.co_varnames[2] != 'fps':
17 | raise ValueError('dist_func third argument name must be: fps')
18 | formed_dist_func = partial(dist_func, fps=fps)
19 | return formed_dist_func
20 |
21 |
def _make_diversity_pick(pool, threshold, pick_size, dist_func=None):
    """Run the RDKit LeaderPicker over a pool of fingerprints.

    Parameters
    ----------
    pool : sequence
        Fingerprints to pick from.
    threshold : float
        Distance threshold handed to the picker.
    pick_size : int
        Requested number of picks; capped at the size of the pool.
    dist_func : function, optional
        Custom ``(i, j, fps)`` distance function. When omitted the
        picker's bit vector picking routine is used instead.

    Returns
    -------
    iterable
        The pick as returned by the picker (indices into ``pool``).

    """
    picker = LeaderPicker()
    n_pool = len(pool)
    # Never ask for more items than the pool can provide.
    pick_size = min(pick_size, n_pool)
    if dist_func is not None:
        bound_dist = _form_dist_func(dist_func, pool)
        return picker.LazyPick(bound_dist, n_pool, threshold, pick_size)
    return picker.LazyBitVectorPick(pool, n_pool, threshold, pick_size)
34 |
35 |
36 | def _create_pool(scaffold, graph, radius, bits):
37 | """tuple : create molecule pool (ids, mols, fps)."""
38 | mol_ids, smiles = zip(*graph.get_molecules_for_scaffold(scaffold, 'smiles'))
39 | mols = list(map(MolFromSmiles, smiles))
40 | fps = list(map(lambda x: GetMorganFingerprintAsBitVect(x, radius, nBits=bits), mols))
41 | if len(fps) == 0:
42 | raise ValueError(f'No molecules for scaffold class: {scaffold}')
43 | return mol_ids, mols, fps
44 |
45 |
def diversity_pick_for_scaffold_class(
    scaffold,
    graph,
    threshold=0.65,
    pick_size=0,
    fp_radius=2,
    fp_bits=1024,
    dist_func=None
):
    """
    Select a diverse subset of the molecules belonging to a scaffold
    class, using the RDKit LeaderPicker on Morgan fingerprints.

    Parameters
    ----------
    scaffold : str
        Scaffold class name i.e. scaffold SMILES.
    graph : ScaffoldGraph
        ScaffoldGraph for picking.
    threshold : float, optional
        Stop picking when the distance goes below this value.
        The default is 0.65 i.e. similarity = 0.35.
    pick_size : int, optional
        Number of items to pick from the molecule pool. If
        the pick size is greater than the pool size, the
        pick size will be equal to the size of the pool.
    fp_radius : int, optional
        Radius of Morgan fingerprint. The default is 2.
    fp_bits : int, optional
        Number of bits in the Morgan fingerprint. The
        default is 1024.
    dist_func : function, optional
        A function for calculating distance between a pair
        of fingerprints. The function should take two indices
        (i, j) and a list of fingerprints (fps) and return
        the distance between these points.

    Examples
    --------
    Diversity pick for benzene scaffold.

    >>> ids, mols, fps = diversity_pick_for_scaffold_class('c1ccccc1', graph, pick_size=10)

    Returns
    -------
    tuple ((ids), (mols), (fps))
        A tuple of tuples: the first contains the ids of the picked
        molecules, the second the picked molecules as RDKit Mols and
        the third the fingerprints of the picked molecules.

    Notes
    -----
    If performing diversity picks on a large scale, a custom implementation
    should probably be used where fingerprints can be cached.

    """
    pool_ids, pool_mols, pool_fps = _create_pool(scaffold, graph, fp_radius, fp_bits)
    chosen = _make_diversity_pick(pool_fps, threshold, pick_size, dist_func)
    # Transpose the picked (id, mol, fp) records into three parallel tuples.
    ids, mols, fps = zip(
        *((pool_ids[idx], pool_mols[idx], pool_fps[idx]) for idx in chosen)
    )
    return ids, mols, fps
108 |
--------------------------------------------------------------------------------
/scaffoldgraph/analysis/enrichment.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.analysis.enrichment
3 |
4 | Module contains an implementation of Compound Set Enrichment from the papers:
5 | - Compound Set Enrichment: A Novel Approach to Analysis of Primary HTS Data.
6 | - Mining for bioactive scaffolds with scaffold networks: Improved compound set enrichment from primary screening data.
7 | """
8 |
9 | from networkx import set_node_attributes
10 | from scipy.stats import ks_2samp, binom_test
11 | from loguru import logger
12 |
13 |
def _btp(scaffoldgraph, activity_key, alternative, pd):
    """CSE - binomial test (used in cse functions).

    Parameters
    ----------
    scaffoldgraph : ScaffoldGraph
        Graph providing ``get_molecule_nodes``, ``get_scaffold_nodes`` and
        ``get_molecules_for_scaffold``.
    activity_key : str
        Molecule node attribute holding the binary activity flag.
    alternative : {'two-sided', 'less', 'greater'}
        Alternative hypothesis passed to the binomial test.
    pd : float or None
        Hypothesized probability of success. If None it is estimated as
        (number of active molecules / total molecules) in the graph.

    Returns
    -------
    dict
        ``{scaffold: {'pval': ..., '_active': K, '_total': N}}``.

    Notes
    -----
    A molecule is considered active when ``int(activity) == 1``. The same
    rule is applied to the background rate and to the per-scaffold counts,
    so activity flags stored as e.g. ``'1'`` or ``1.0`` are counted
    consistently in both places.

    """
    result = {}
    active = total = 0
    for _, act in scaffoldgraph.get_molecule_nodes(activity_key):
        if int(act) == 1:
            active += 1
        total += 1
    if pd is None:
        # Guard against an empty graph (no molecules at all).
        pd = active / total if total else 0.0
    logger.debug(f'(BTP) Total: {total}, Active: {active}, pd: {pd}')
    for scaffold in scaffoldgraph.get_scaffold_nodes():
        mols, acts = zip(*scaffoldgraph.get_molecules_for_scaffold(scaffold, activity_key))
        N = len(mols)
        # Same activity rule as used for the background rate above.
        K = sum(1 for act in acts if int(act) == 1)
        pval = binom_test(K, N, pd, alternative=alternative)
        logger.debug(f'(BTP) {scaffold}, {K}, {N}, {pval}')
        result[scaffold] = {'pval': pval, '_active': K, '_total': N}
    return result
31 |
32 |
def _ksp(scaffoldgraph, activity_key, alternative):
    """CSE - Kolmogorov-Smirnov test (used in cse functions).

    The activity values of the molecules of each scaffold are compared
    against the activity values of every molecule in the graph using a
    two-sample KS test.

    Returns
    -------
    dict
        ``{scaffold: {'pval': ..., 'dmax': ..., '_total': N}}``.

    """
    background = [
        activity for _, activity in scaffoldgraph.get_molecule_nodes(activity_key)
    ]
    result = {}
    for scaffold in scaffoldgraph.get_scaffold_nodes():
        members = scaffoldgraph.get_molecules_for_scaffold(scaffold, activity_key)
        mols, acts = zip(*members)
        n_members = len(mols)
        dmax, pval = ks_2samp(acts, background, alternative, 'auto')
        logger.debug(f'(KSP) {scaffold}, {n_members}, {dmax}, {pval}')
        result[scaffold] = dict(pval=pval, dmax=dmax, _total=n_members)
    return result
45 |
46 |
def bonferroni_correction(scaffoldgraph, crit):
    """Compute a bonferroni corrected significance level per hierarchy.

    The critical value is divided by the size reported for each hierarchy
    by ``scaffoldgraph.get_hierarchy_sizes()``.

    Parameters
    ----------
    scaffoldgraph : ScaffoldGraph
        A ScaffoldGraph object to query.
    crit : float
        The critical significance value to correct at each scaffold
        hierarchy.

    Returns
    -------
    dict
        The corrected critical significance value for every scaffold
        hierarchy, in the format {hierarchy: crit}.

    """
    corrected = {}
    for level, size in scaffoldgraph.get_hierarchy_sizes().items():
        corrected[level] = crit / size
    return corrected
67 |
68 |
def calc_scaffold_enrichment(scaffoldgraph, activity, mode='ks', alternative='greater', p=None):
    """
    Calculate scaffold enrichment using the Kolmogorov-Smirnov or binomial test.

    Parameters
    ----------
    scaffoldgraph : ScaffoldGraph
        A ScaffoldGraph object to query.
    activity : str
        A scaffold node attribute key corresponding to an activity value.
        If the test is binomial this value should be a binary attribute
        (0 or 1 / True or False).
    mode : {'ks', 'b'}, optional
        A string specifying the statistical test to perform. 'ks' (or 'k')
        specifies a Kolmogorov-Smirnov test and 'b' or 'binomial' specifies
        a binomial test. The default is 'ks'.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available:
        * 'two-sided'
        * 'less': one-sided
        * 'greater': one-sided
        The default is 'greater'.
    p : float, None, optional
        The hypothesized probability of success. 0 <= p <= 1. Used in binomial mode.
        If not specified p is set automatically (number of active / total compounds).
        The default is None.

    Returns
    -------
    dict
        A dict of dicts in the format {scaffold: {results}} where results is the set
        of results returned by the statistical test and scaffold is a scaffold node
        key corresponding to a scaffold in the ScaffoldGraph object.

    Raises
    ------
    ValueError
        If `mode` does not name an implemented test.

    See Also
    --------
    scaffoldgraph.analysis.enrichment.compound_set_enrichment

    References
    ----------
    .. [1] Varin, T., Schuffenhauer, A., Ertl, P., and Renner, S. (2011). Mining for bioactive scaffolds
           with scaffold networks: Improved compound set enrichment from primary screening data.
           Journal of Chemical Information and Modeling, 51(7), 1528–1538.
    .. [2] Varin, T., Gubler, H., Parker, C., Zhang, J., Raman, P., Ertl, P. and Schuffenhauer, A. (2010)
           Compound Set Enrichment: A Novel Approach to Analysis of Primary HTS Data.
           Journal of Chemical Information and Modeling, 50(12), 2067-2078.

    """
    if mode in ('binomial', 'b'):
        return _btp(scaffoldgraph, activity, alternative, p)
    if mode in ('ks', 'k'):
        return _ksp(scaffoldgraph, activity, alternative)
    raise ValueError(f'scaffold enrichment mode: {mode}, not implemented')
124 |
125 |
def compound_set_enrichment(scaffoldgraph, activity, mode='ks', alternative='greater', crit=0.01, p=None):
    """
    Perform compound set enrichment (CSE), calculating scaffolds enriched for bioactivity.

    The enrichment results are written onto the scaffold nodes, the critical
    value is bonferroni corrected per hierarchy, and every scaffold whose
    p-value lies below the corrected value of its hierarchy is reported.

    Parameters
    ----------
    scaffoldgraph : ScaffoldGraph
        A ScaffoldGraph object to query.
    activity : str
        A scaffold node attribute key corresponding to an activity value.
        If the test is binomial this value should be a binary attribute
        (0 or 1 / True or False).
    mode : {'ks', 'b'}, optional
        A string specifying the statistical test to perform. 'ks' specifies a
        Kolmogorov-Smirnov test and 'b' or 'binomial' specifies a binomial test.
        The default is 'ks'.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available:
        * 'two-sided'
        * 'less': one-sided
        * 'greater': one-sided
        The default is 'greater'.
    crit : float, optional
        The critical significance level. The default is 0.01
    p : float, None, optional
        The hypothesized probability of success. 0 <= p <= 1. Used in binomial mode.
        If not specified p is set automatically (number of active / total compounds).
        The default is None.

    Returns
    -------
    A tuple of 'enriched' scaffold classes in the format: (scaffold, {data}) where data
    is the corresponding node attributes for the returned scaffold. The tuple is
    ordered by ascending p-value.

    Notes
    -----
    P-values are added as node attributes with the key 'pval'.

    References
    ----------
    .. [1] Varin, T., Schuffenhauer, A., Ertl, P., and Renner, S. (2011). Mining for bioactive scaffolds
           with scaffold networks: Improved compound set enrichment from primary screening data.
           Journal of Chemical Information and Modeling, 51(7), 1528–1538.
    .. [2] Varin, T., Gubler, H., Parker, C., Zhang, J., Raman, P., Ertl, P. and Schuffenhauer, A. (2010)
           Compound Set Enrichment: A Novel Approach to Analysis of Primary HTS Data.
           Journal of Chemical Information and Modeling, 50(12), 2067-2078.

    """
    enrichment = calc_scaffold_enrichment(scaffoldgraph, activity, mode, alternative, p)
    set_node_attributes(scaffoldgraph, enrichment)
    cutoffs = bonferroni_correction(scaffoldgraph, crit)
    enriched = [
        (scaffold, attrs)
        for scaffold, attrs in scaffoldgraph.get_scaffold_nodes(True)
        if attrs['pval'] < cutoffs[attrs['hierarchy']]
    ]
    enriched.sort(key=lambda item: item[1]['pval'])
    return tuple(enriched)
182 |
--------------------------------------------------------------------------------
/scaffoldgraph/analysis/frequency.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.analysis.frequency
3 |
4 | """
5 | import numpy as np
6 |
7 |
def cumulative_scaffold_frequency(
    scaffoldgraph,
    hierarchy=-1,
    norm_hierarchy=False,
    frequency_key=None
):
    """Calculate cumulative scaffold frequency distributions (CSF) from
    a scaffold graph.

    Parameters
    ----------
    scaffoldgraph : ScaffoldGraph
        A ScaffoldGraph object to query.
    hierarchy : int
        The scaffold hierarchy to consider. If -1 then the CSF is
        calculated for murcko scaffolds rather than a scaffold
        hierarchy. The default is -1 (murcko scaffolds).
    norm_hierarchy : bool
        Normalise the CSF by the number of molecules represented
        by the considered hierarchy rather than the total molecules
        in the graph. If False then compound representation in the CSF
        may be below 100%. The default is False (normalise by total).
    frequency_key : str, None, optional
        If scaffold frequency exists as an attribute of the graph, set
        this key to avoid re-calculation of scaffold frequencies.

    Examples
    --------
    Create a CSF plot (CSFP) for murcko scaffolds

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots()
    >>> x, y = cumulative_scaffold_frequency(tree, hierarchy=-1)
    >>> ax.plot(x, y, label='Murcko CSF')
    >>> ax.set_xlabel('Percentage of scaffolds')
    >>> ax.set_ylabel('Percentage of molecules')
    >>> ax.legend()
    >>> fig.show()

    Calculate P_50, the percentage of scaffolds that represent 50% of compounds.

    >>> import numpy as np
    >>> p50 = np.interp(0.5, y, x)

    Returns
    -------
    tuple
        A tuple containing the cumulative percentage of scaffolds and
        the cumulative scaffold frequency as a percentage of molecules.
        Can be used to plot a CSF plot (x, y). Both arrays are empty if
        no scaffolds are found. If exactly one scaffold is found, the
        scaffold axis holds the single value 1.0.

    Raises
    ------
    ValueError
        If `hierarchy` is neither -1 nor a hierarchy present in the graph.

    Notes
    -----
    Cumulative scaffold frequency should be used with scaffold tree
    structures.

    """
    if not frequency_key:
        scaffoldgraph.add_scaffold_molecule_count()
        frequency_key = 'count'
    if hierarchy == -1:  # murcko scaffolds
        h = _get_murcko_frequency(scaffoldgraph)
    elif hierarchy in scaffoldgraph.get_hierarchy_sizes():  # hierarchies
        h = scaffoldgraph.get_scaffolds_in_hierarchy(hierarchy, frequency_key)
    else:  # hierarchy does not exist
        raise ValueError(f'Invalid hierarchy: {hierarchy}')
    # Most frequent scaffolds first.
    sh = sorted(h, key=lambda x: x[1], reverse=True)
    n_scaffolds = len(sh)
    if n_scaffolds == 0:
        # Nothing to accumulate; avoids indexing into an empty cumsum below.
        return np.array([], dtype=float), np.array([], dtype=float)
    if n_scaffolds == 1:
        # (n - 1) would be zero here, producing 0 / 0 = NaN. A single
        # scaffold is 100% of the scaffolds under consideration.
        sf = np.ones(1)
    else:
        sf = np.arange(n_scaffolds) / (n_scaffolds - 1)
    cumsum = np.cumsum([x[1] for x in sh])
    if norm_hierarchy is True and hierarchy > 0:
        mf = cumsum / cumsum[-1]
    else:  # normalize by total molecules in graph
        mf = cumsum / scaffoldgraph.num_molecule_nodes
    return sf, mf
82 |
83 |
84 | def _get_murcko_frequency(scaffoldgraph):
85 | """Get frequencies for murcko scaffolds."""
86 | g = scaffoldgraph
87 | mols = g.get_molecule_nodes()
88 | m = list({next(g.predecessors(x)) for x in mols})
89 | f = [len([x for x in g.successors(x) if g.nodes[x]['type'] == 'molecule']) for x in m]
90 | return list(zip(m, f))
91 |
92 |
def area_under_curve(x, y):
    """Calculate area under the curve using the trapezoidal rule.

    Parameters
    ----------
    x : np.ndarray, shape (n, )
        Array of x coordinates, must be monotonic increasing or decreasing
    y : np.ndarray, shape (n, )
        Array of y coordinates

    Returns
    -------
    area : float
        Area under the curve (AUC). The sign is corrected for decreasing
        x, so the result is the same whichever direction x runs in.

    Raises
    ------
    ValueError
        If x and y differ in shape, are not one dimensional, contain
        fewer than two points, or if x is not monotonic.

    """
    x = np.asanyarray(x)
    y = np.asanyarray(y)
    if x.shape != y.shape:
        raise ValueError(
            'Input arrays are expected to contain the same '
            f'number of points, x.shape: {x.shape}, '
            f'y.shape: {y.shape}'
        )
    # Check dimensionality before indexing into the shape so that 0-d
    # input produces the intended ValueError instead of an IndexError.
    if len(x.shape) != 1:
        raise ValueError(
            f'Expected 1d arrays for x and y, got '
            f'shape: {x.shape}'
        )
    if x.shape[0] < 2:
        raise ValueError(
            'At least two points are required to calculate'
            f' area under the curve, got shape: {x.shape}'
        )
    direction = 1
    dx = np.diff(x)
    if np.any(dx < 0):
        if np.all(dx <= 0):
            # Monotonic decreasing x: integrate and flip the sign.
            direction = -1
        else:
            raise ValueError(
                'x is neither monotonic increasing or '
                'decreasing'
            )
    # np.trapz was renamed to np.trapezoid in NumPy 2.0; prefer the new
    # name when it exists and fall back for older NumPy versions.
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    area = direction * integrate(y, x)
    return area
138 |
--------------------------------------------------------------------------------
/scaffoldgraph/analysis/general.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.analysis.general
3 |
4 | Module contains general functions for scaffold analysis
5 | """
6 |
7 |
def get_virtual_scaffolds(scaffoldgraph, data=False, default=None):
    """Get 'virtual' scaffolds within a scaffold graph.

    Virtual scaffolds represent scaffolds that are not directly obtained from
    any molecule of the collection, but generated by the pruning process.
    Virtual scaffolds may provide promising starting points for the synthesis
    or acquisition of compounds complementing the current collection.

    In terms of the graph, a scaffold is reported when none of its
    successors is a node of type 'molecule'.

    Parameters
    ----------
    scaffoldgraph : ScaffoldGraph
        A ScaffoldGraph object to query
    data : str, bool, optional
        The scaffold node attribute returned in 2-tuple (n, ddict[data]).
        If True, return entire node attribute dict as (n, ddict).
        If False, return just the nodes n. The default is False.
    default : value, bool, optional
        Value used for nodes that don't have the requested attribute.
        Only relevant if data is not True or False.

    Returns
    -------
    list
        A list of scaffold node keys (or 2-tuples, depending on `data`)
        corresponding to virtual scaffolds.

    """
    virtual = []
    for scaffold, attrs in scaffoldgraph.get_scaffold_nodes(True):
        n_molecules = sum(
            1 for child in scaffoldgraph.successors(scaffold)
            if scaffoldgraph.nodes[child].get('type') == 'molecule'
        )
        if n_molecules != 0:
            continue
        if data is True:
            entry = (scaffold, attrs)
        elif data is False:
            entry = scaffold
        else:
            entry = (scaffold, attrs.get(data, default))
        virtual.append(entry)
    return virtual
48 |
49 |
def get_singleton_scaffolds(scaffoldgraph, data=False, default=None):
    """Get singleton scaffolds within a scaffold graph.

    Singleton scaffolds represent scaffolds that are direct members of only
    one compound in the current collection.

    In terms of the graph, a scaffold is reported when exactly one of its
    successors is a node of type 'molecule'.

    Parameters
    ----------
    scaffoldgraph : ScaffoldGraph
        A ScaffoldGraph object to query
    data : str, bool, optional
        The scaffold node attribute returned in 2-tuple (n, ddict[data]).
        If True, return entire node attribute dict as (n, ddict).
        If False, return just the nodes n. The default is False.
    default : value, bool, optional
        Value used for nodes that don't have the requested attribute.
        Only relevant if data is not True or False.

    Returns
    -------
    list
        A list of scaffold node keys (or 2-tuples, depending on `data`)
        corresponding to singleton scaffolds.

    """
    singletons = []
    for scaffold, attrs in scaffoldgraph.get_scaffold_nodes(True):
        n_molecules = sum(
            1 for child in scaffoldgraph.successors(scaffold)
            if scaffoldgraph.nodes[child].get('type') == 'molecule'
        )
        if n_molecules != 1:
            continue
        if data is True:
            entry = (scaffold, attrs)
        elif data is False:
            entry = scaffold
        else:
            entry = (scaffold, attrs.get(data, default))
        singletons.append(entry)
    return singletons
88 |
--------------------------------------------------------------------------------
/scaffoldgraph/analysis/representation.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.analysis.representation
3 |
4 | Module contains an adaption of the Automated Identification of Over-Represented
5 | Scaffold Classes in HTS Data method from the paper: 'HierS: Hierarchical Scaffold Clustering
6 | Using Topological Chemical Graphs'
7 | """
8 |
9 | from networkx import set_node_attributes
10 | from itertools import combinations
11 |
12 | from rdkit import DataStructs
13 | from rdkit import Chem
14 |
15 | from scaffoldgraph.utils.cache import Cache
16 |
17 |
class MolecularSimilarityCache(object):
    """A cache for speeding up repeated molecular similarity computations.

    Two caches are kept: one mapping molecule ids to fingerprints and one
    mapping (sorted) pairs of molecule ids to similarity values. The object
    can be used as a context manager, in which case both caches are emptied
    on exit.
    """

    __slots__ = ('_fp_func', '_sim_func', '_fp_cache', '_sim_cache')

    def __init__(self, fp_func=None, sim_func=None, fp_cache_maxsize=None, sim_cache_maxsize=None):
        """
        Parameters
        ----------
        fp_func : callable, optional
            A function calculating a molecular fingerprint from an rdkit Mol object.
            If None the function is set to ``rdkit.Chem.RDKFingerprint``.
        sim_func : callable, optional
            A function calculating the similarity between two fingerprints as returned
            by `fp_func`. If None the function is set to ``rdkit.DataStructs.TanimotoSimilarity``
        fp_cache_maxsize : int, optional
            Set the maximum number of fingerprints cached. If None the cache is unbounded.
        sim_cache_maxsize : int, optional
            Set the maximum number of similarity values cached. If None the cache is unbounded.

        """
        self._fp_func = fp_func or Chem.RDKFingerprint
        assert callable(self._fp_func), 'fp_func must be callable or None'
        self._sim_func = sim_func or DataStructs.TanimotoSimilarity
        assert callable(self._sim_func), 'sim_func must be callable or None'
        self._fp_cache = Cache(fp_cache_maxsize)
        self._sim_cache = Cache(sim_cache_maxsize)

    @property
    def fp_func(self):
        """callable: The fingerprinting function

        Assigning a new fingerprinting function empties both the
        fingerprint cache and the similarity cache.
        """
        return self._fp_func

    @fp_func.setter
    def fp_func(self, fp_func):
        self._fp_func = fp_func
        # Cached fingerprints (and similarities built on them) are stale.
        self.clear()

    @property
    def sim_func(self):
        """callable: The molecular similarity function

        Assigning a new similarity function empties the similarity cache.
        """
        return self._sim_func

    @sim_func.setter
    def sim_func(self, sim_func):
        self._sim_func = sim_func
        # Fingerprints stay valid, only the similarity values are stale.
        self.clear_sim_cache()

    def get_fingerprint(self, mol_node):
        """Return the fingerprint of a molecule node, computing it on a cache miss.

        Parameters
        ----------
        mol_node : tuple
            A molecule node from a ScaffoldGraph where the first entry is the
            molecule ID and the second is a dictionary of node attributes
            (the 'smiles' attribute is read on a cache miss).

        Returns
        -------
        object
            A molecular fingerprint.

        """
        fp_cache = self._fp_cache
        mol_id = mol_node[0]
        if mol_id in fp_cache:
            return fp_cache[mol_id]
        smiles = mol_node[1]['smiles']
        return fp_cache.setdefault(mol_id, self._calc_fp(smiles))

    def _calc_fp(self, smiles):
        """Parse a SMILES string and apply the fingerprinting function."""
        return self._fp_func(Chem.MolFromSmiles(smiles))

    def get_similarity(self, mol_node_1, mol_node_2):
        """Return the similarity of two molecule nodes, computing it on a cache miss.

        The cache key is the sorted pair of molecule ids, so the order of
        the two arguments does not matter for cache lookups.

        Parameters
        ----------
        mol_node_1 : tuple
            A molecule node from a ScaffoldGraph where the first entry is the
            molecule ID and the second is a dictionary of node attributes.
        mol_node_2 : tuple
            A molecule node from a ScaffoldGraph where the first entry is the
            molecule ID and the second is a dictionary of node attributes.

        Returns
        -------
        float
            A molecular similarity score.

        """
        sim_cache = self._sim_cache
        pair_key = tuple(sorted([mol_node_1[0], mol_node_2[0]]))
        if pair_key in sim_cache:
            return sim_cache[pair_key]
        first_fp = self.get_fingerprint(mol_node_1)
        second_fp = self.get_fingerprint(mol_node_2)
        return sim_cache.setdefault(pair_key, self.sim_func(first_fp, second_fp))

    def clear_fp_cache(self):
        """Empty the fingerprint cache."""
        self._fp_cache.clear()

    def clear_sim_cache(self):
        """Empty the similarity cache."""
        self._sim_cache.clear()

    def clear(self):
        """Empty both the fingerprint and similarity caches."""
        self.clear_fp_cache()
        self.clear_sim_cache()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.clear()

    def __repr__(self):
        return f'{self.__class__.__name__}({self._fp_cache!r}, {self._sim_cache!r})'
151 |
152 |
def calc_average_pairwise_similarity(scaffoldgraph, fp_func=None, sim_func=None, skip_levels=None,
                                     fp_cache_maxsize=None, sim_cache_maxsize=None):

    """Calculate average pairwise similarity for each scaffold in a ScaffoldGraph.

    Average Pairwise Similarity (APS) is a simple method for approximating the overall topological
    similarity between compounds in a given scaffold class. The APS coefficient can also be used
    as a metric to gauge scaffold over-representation in a set of compounds as described in the
    HierS paper.

    Notes
    -----
    The metric used in the HierS implementation is called APT (Average Pairwise Tanimoto).
    In this implementation it is known as 'APS', as the function enables the user to specify
    similarity metrics other than Tanimoto using the `sim_func` argument.

    Parameters
    ----------
    scaffoldgraph : ScaffoldGraph
    fp_func : callable, None, optional
        A callable returning a molecular fingerprint from an RDKit Mol object.
        If None the fingerprint is an RDKFingerprint with default parameters.
    sim_func : callable, None, optional
        A callable returning a similarity value (float) for a pair of fingerprint objects
        calculated by `fp_func`. If None the default metric is Tanimoto.
    skip_levels : iterable, None, optional
        Skip any scaffolds in hierarchy levels specified. No similarities are
        calculated for these scaffolds; their aps and membership are set to 0.
    fp_cache_maxsize : int, optional
        Set the maximum number of fingerprints cached. If None the cache is unbounded.
    sim_cache_maxsize : int, optional
        Set the maximum number of similarity values cached. If None the cache is unbounded.

    Returns
    -------
    dict
        A dict of dicts in the format {scaffold: {members, aps}} where members is the
        number of molecules in the scaffold cluster and aps is the average pairwise
        similarity of the molecules in the cluster.

    See Also
    --------
    scaffoldgraph.analysis.representation.get_over_represented_scaffold_classes

    """
    aps_dict = {}
    cache_args = (fp_func, sim_func, fp_cache_maxsize, sim_cache_maxsize)
    # Materialise once: allows any iterable (including one-shot iterators)
    # and gives constant time membership tests inside the loop.
    skipped = frozenset(skip_levels) if skip_levels else frozenset()

    with MolecularSimilarityCache(*cache_args) as cache:
        for scaffold, data in scaffoldgraph.get_scaffold_nodes(True):
            aps_data = aps_dict.setdefault(scaffold, {})

            if skipped and data['hierarchy'] in skipped:
                aps_data['members'] = 0
                aps_data['aps'] = 0.0
                # Without this the scaffold would still be processed below
                # and the zeroed values overwritten.
                continue

            m_nodes = scaffoldgraph.get_molecules_for_scaffold(scaffold, data=True)
            n_members = len(m_nodes)
            aps_data['members'] = n_members

            # If only 1 member (or less in case of disconnect) set aps to 0.0
            if n_members <= 1:
                aps_data['aps'] = 0.0
                continue

            pw_sims = [cache.get_similarity(i, j) for i, j in combinations(m_nodes, 2)]
            aps_data['aps'] = sum(pw_sims) / len(pw_sims)

    return aps_dict
224 |
225 |
def get_over_represented_scaffold_classes(scaffoldgraph, threshold=0.80, member_cutoff=None,
                                          skip_aps=False, **kwargs):

    """Returns scaffolds that are potentially over-represented in the dataset.

    This method is an adaptation of the method described in the HierS paper for
    automated identification of over-represented scaffold classes in HTS data.

    The algorithm first builds a list of all scaffolds exceeding the user-defined
    similarity threshold which is subsequently ordered by ascending scaffold hierarchy (HierS
    used molecular weight to sort, but using hierarchy makes sense as it is pre-calculated
    during construction). Each scaffold (above hierarchy 1) is then inspected to see whether
    one of its parent scaffolds is also in the list. If so the scaffold is removed, because
    all of the compounds that have membership in such scaffolds are already accounted for
    by the higher ranking (i.e. lower hierarchy) scaffold.

    The HierS paper uses three defined similarity thresholds (APS) in three categories:

        loose  = 0.75
        medium = 0.80
        strict = 0.85

    Parameters
    ----------
    scaffoldgraph : ScaffoldGraph
    threshold : float, optional
        Similarity threshold used to define potential over-represented scaffolds.
        The default is 0.80 (i.e. medium)
    member_cutoff : int, None, optional
        If set, scaffolds with fewer than `member_cutoff` member molecules are not
        considered to be over-represented (not significant). The default is None.
    skip_aps : bool, optional
        If True the function assumes that the APS has already been calculated and 'members' and
        'aps' are scaffold node attributes (i.e. use if running the same function more than
        once with different thresholds). The default is False.
    **kwargs :
        Arguments for the calc_average_pairwise_similarity function (calculating the APS metric).

    Returns
    -------
    tuple
        A tuple of (scaffold, data) 2-tuples ordered by ascending hierarchy, where data
        is the node attribute dict of the scaffold.

    References
    ----------
    .. [1] Wilkens, S., Janes, J., and Su, A. (2005). HierS: Hierarchical Scaffold Clustering
           Using Topological Chemical Graphs. Journal of Medicinal Chemistry, 48(9), 3182-3193.

    """
    if skip_aps is False:
        aps = calc_average_pairwise_similarity(scaffoldgraph, **kwargs)
        set_node_attributes(scaffoldgraph, aps)
        aps.clear()

    # Candidates: APS above the threshold and (if requested) enough members.
    candidates = []
    for scaffold, attrs in scaffoldgraph.get_scaffold_nodes(data=True):
        if not attrs.get('aps', 0) > threshold:
            continue
        if not member_cutoff or attrs.get('members') >= member_cutoff:
            candidates.append((scaffold, attrs))
    candidates.sort(key=lambda item: item[1].get('hierarchy'))
    candidate_keys = {scaffold for scaffold, _ in candidates}

    selected = []
    for scaffold, attrs in candidates:
        if attrs.get('hierarchy', 1) != 1:
            parents = [p in candidate_keys for p in scaffoldgraph.get_parent_scaffolds(scaffold)]
            if any(parents):
                # Already represented by an over-represented parent scaffold.
                continue
        selected.append((scaffold, attrs))
    return tuple(selected)
292 |
--------------------------------------------------------------------------------
/scaffoldgraph/core/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.core
3 |
4 | The core package contains core functionality for building ScaffoldGraphs.
5 | """
6 |
7 | from .fragment import (
8 | MurckoRingFragmenter,
9 | MurckoRingSystemFragmenter,
10 | get_all_murcko_fragments,
11 | get_next_murcko_fragments,
12 | get_murcko_scaffold,
13 | get_ring_toplogy_scaffold,
14 | get_ring_connectivity_scaffold
15 | )
16 |
17 | from .graph import ScaffoldGraph
18 | from .scaffold import Scaffold
19 |
20 | __all__ = [
21 | 'ScaffoldGraph',
22 | 'Scaffold',
23 | 'MurckoRingFragmenter',
24 | 'MurckoRingSystemFragmenter',
25 | 'get_all_murcko_fragments',
26 | 'get_next_murcko_fragments',
27 | 'get_murcko_scaffold',
28 | 'get_ring_toplogy_scaffold',
29 | 'get_ring_connectivity_scaffold',
30 | ]
31 |
--------------------------------------------------------------------------------
/scaffoldgraph/io/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.io
3 |
4 | Contains functions for reading molecules from various formats.
5 | - SMILES
6 | - SDF
7 | - DataFrame
8 |
9 | Contains functions for writing ScaffoldGraphs to various formats.
10 | - SDF
11 | - TSV
12 | """
13 |
14 | from .dataframe import read_dataframe
15 | from .sdf import read_sdf
16 | from .smiles import read_smiles_file
17 |
18 | __all__ = ['read_sdf', 'read_smiles_file', 'read_dataframe']
19 |
--------------------------------------------------------------------------------
/scaffoldgraph/io/dataframe.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.io.dataframe
3 |
4 | Contains functions for reading molecules from pandas dataframes.
5 | """
6 |
7 | from rdkit.Chem import MolFromSmiles, Mol
8 | from loguru import logger
9 |
10 |
class DataFrameMolSupplier(object):
    """Iterator supplying rdkit Mols built from the rows of a pandas DataFrame.

    Each call to ``next`` consumes one row. Rows that cannot be turned into
    a molecule are reported through the logger and yielded as ``None``.
    """

    def __init__(self, df, smiles_column, name_column, data_cols=None):
        """Initialize DataFrameMolSupplier.

        Parameters
        ----------
        df : pandas.DataFrame
            Dataframe to read molecules from.
        smiles_column : str
            Key of column containing SMILES strings or rdkit Mol objects.
        name_column : str
            Key of column containing molecule name strings.
        data_cols : list, optional
            A list of column keys containing data to retain
            in molecule graph nodes. The default is None.

        """
        self.data_cols = data_cols
        columns = [df[smiles_column].values, df[name_column].values]
        if data_cols is not None:
            # Third element of every row tuple: the values of the data columns.
            columns.append(df[data_cols].values)
        self.supplier = zip(*columns)
        self.n = len(df[smiles_column])
        self.cursor = 1  # 1-based row counter used in warning messages

    def __iter__(self):
        return self

    def __next__(self):
        """Return the Mol for the next row, or None if it cannot be parsed."""
        values = next(self.supplier)
        entry = values[0]
        try:
            mol = entry if isinstance(entry, Mol) else MolFromSmiles(entry)
            # MolFromSmiles returns None for invalid SMILES, in which case the
            # SetProp call below raises AttributeError and the row is skipped.
            mol.SetProp('_Name', str(values[1]))
            if self.data_cols is not None:
                for key, value in zip(self.data_cols, values[2]):
                    mol.SetProp(str(key), str(value))
        except (AttributeError, TypeError):
            # TypeError covers non-string cells (e.g. NaN for a missing
            # SMILES), which rdkit rejects with a TypeError-derived error.
            logger.warning('Molecule {} : {} could not be parsed'.format(
                self.cursor, values[0]
            ))
            mol = None

        self.cursor += 1
        return mol

    def __len__(self):
        return self.n
71 |
72 |
def read_dataframe(df, smiles_column, name_column, data_columns=None):
    """Create a molecule supplier over the rows of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding one molecule per row.
    smiles_column : str
        Key of the column holding SMILES strings or rdkit Mol objects.
    name_column : str
        Key of the column holding molecule names.
    data_columns : list, optional
        Keys of columns whose values should be retained
        in molecule graph nodes. The default is None.

    Returns
    -------
    DataFrameMolSupplier

    """
    supplier = DataFrameMolSupplier(
        df,
        smiles_column,
        name_column,
        data_cols=data_columns,
    )
    return supplier
94 |
--------------------------------------------------------------------------------
/scaffoldgraph/io/sdf.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.io.sdf
3 |
4 | Contains functions for reading and writing from/to SDF.
5 | """
6 |
7 | from rdkit.Chem import ForwardSDMolSupplier, SDWriter, MolFromSmiles
8 |
9 | from .supplier import MolSupplier, EnumeratedMolSupplier
10 |
11 |
def read_sdf(sdf_file, requires_length=False):
    """Wrap an open SDF in a molecule supplier.

    Parameters
    ----------
    sdf_file : file-like object
        An open SDF.
    requires_length : bool, optional
        If True the file is scanned once to count its records,
        rewound to the start, and an enumerated supplier (one
        that knows its length, e.g. for progress monitoring) is
        returned. The default is False.

    Returns
    -------
    MolSupplier or EnumeratedMolSupplier

    """
    mol_reader = ForwardSDMolSupplier(sdf_file)
    if requires_length:
        n_mols = sdf_count(sdf_file)
        # Counting consumed the file object; rewind before handing it over.
        sdf_file.seek(0)
        return EnumeratedMolSupplier(mol_reader, n_mols)
    return MolSupplier(mol_reader)
35 |
36 |
def write_sdf_file(scaffold_graph, output_file):
    """Write an SDF file from a ScaffoldGraph.

    All scaffolds in the scaffoldgraph are written to the
    SDF, while molecules are ignored. Scaffolds are sorted
    in ascending order according to their hierarchy level.
    Scaffolds whose SMILES cannot be parsed are skipped.

    The output follows the standard SDF specification with
    the added property fields:

        TITLE field: scaffold ID
        SUBSCAFFOLDS field: list of sub-scaffold IDs
        HIERARCHY field: hierarchy level of scaffold
        SMILES field: scaffold canonical SMILES

    Parameters
    ----------
    scaffold_graph : scaffoldgraph.core.ScaffoldGraph
        ScaffoldGraph to be written to an SDF.
    output_file : str
        Filepath to an output file.

    """
    sorted_scaffolds = sorted(
        scaffold_graph.get_scaffold_nodes(data=True),
        key=lambda x: x[1]['hierarchy'],
    )
    # Scaffold ID == position in the hierarchy-sorted list.
    mapping = {
        scaffold: idx for idx, (scaffold, _) in enumerate(sorted_scaffolds)
    }
    writer = SDWriter(output_file)
    try:
        for scaffold, data in sorted_scaffolds:
            molecule = MolFromSmiles(scaffold)
            if molecule is None:
                continue
            subscaffolds = scaffold_graph.predecessors(scaffold)
            # SetProp only accepts string values, so the integer ID is converted.
            molecule.SetProp('_Name', str(mapping[scaffold]))
            # Node attributes use the lower-case 'hierarchy' key (see sort key).
            molecule.SetIntProp('HIERARCHY', int(data['hierarchy']))
            molecule.SetProp('SMILES', scaffold)
            molecule.SetProp(
                'SUBSCAFFOLDS',
                ', '.join(str(mapping[s]) for s in subscaffolds),
            )
            writer.write(molecule)
    finally:
        # Always flush and release the output file, even if writing fails.
        writer.close()
74 |
75 |
def sdf_count(file_obj):
    """Count the number of molecules in an SDF file.

    Counts the number of times '$$$$' occurs at the start of lines
    in the file. Works for file objects opened in either binary or
    text mode. The file object is consumed by the count; callers that
    need to read it again must rewind it.

    Parameters
    ----------
    file_obj : file-like object
        An open SDF yielding ``bytes`` or ``str`` lines.

    Returns
    -------
    int
        The number of molecules in the file.

    """
    count = 0
    for line in file_obj:
        # Pick the delimiter type matching the line type so that text-mode
        # files are counted correctly instead of always yielding zero.
        if isinstance(line, (bytes, bytearray)):
            delimiter = b'$$$$'
        else:
            delimiter = '$$$$'
        if line.startswith(delimiter):
            count += 1
    return count
93 |
--------------------------------------------------------------------------------
/scaffoldgraph/io/smiles.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.io.smiles
3 |
4 | Contains functions for reading molecules from SMILES files.
5 | """
6 |
7 | from rdkit.Chem import SmilesMolSupplier
8 |
9 | from .supplier import EnumeratedMolSupplier, MolSupplier
10 |
11 |
def read_smiles_file(smiles_file, delimiter=' ', smiles_column=0,
                     name_column=1, header=False, requires_length=False):
    """Create a molecule supplier for a SMILES file.

    Parameters
    ----------
    smiles_file : str
        File path to a SMILES file.
    delimiter : str, optional
        Delimiter used in SMILES file. The default is ' '.
    smiles_column : int, optional
        SMILES column index. The default is 0.
    name_column : int, optional
        Molecule name/ID column index. The default is 1.
    header : bool, optional
        Whether the SMILES file contains a header.
        The default is False.
    requires_length : bool, optional
        If True the lines of the file are counted up front and an
        enumerated supplier is returned, i.e. when monitoring
        progress. The default is False.

    Returns
    -------
    MolSupplier or EnumeratedMolSupplier

    """
    # Positional arguments forwarded unchanged to rdkit's SmilesMolSupplier.
    supplier_args = (
        smiles_file, delimiter, smiles_column, name_column, header, True
    )

    if requires_length is False:
        return MolSupplier(SmilesMolSupplier(*supplier_args))

    n_lines = smiles_count(smiles_file)
    # A header line is not a molecule record.
    n_mols = n_lines - 1 if header is True else n_lines
    return EnumeratedMolSupplier(SmilesMolSupplier(*supplier_args), n_mols)
58 |
59 |
def smiles_count(smiles_file):
    """int : Return the number of lines in a SMILES file.

    The file is read in binary chunks and newline characters are counted.
    A final line that is not terminated by a newline is counted as well,
    and an empty file has zero lines.

    Parameters
    ----------
    smiles_file : str
        File path to a SMILES file.

    """
    lines = 0
    buf_size = 1024 * 1024
    last_byte = b'\n'  # treat an empty file as "properly terminated"
    # The context manager guarantees the handle is closed on any error.
    with open(smiles_file, 'rb') as f:
        buf = f.read(buf_size)
        while buf:
            lines += buf.count(b'\n')
            last_byte = buf[-1:]
            buf = f.read(buf_size)
    if last_byte != b'\n':
        # Trailing record without a newline terminator.
        lines += 1
    return lines
72 |
--------------------------------------------------------------------------------
/scaffoldgraph/io/supplier.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.io.supplier
3 |
4 | Contains utilities for io within scaffoldgraph.
5 | """
6 |
7 | from loguru import logger
8 |
9 |
class MolSupplier(object):
    """A wrapper for rdkit Mol suppliers.

    Provides logging for molecule parsing errors in
    a way that is compatible with scaffoldgraphs
    logging system.

    Notes
    -----
    The supplier can be used with any iterable python object
    containing or supplying rdkit Mol objects (an rdkit supplier,
    a generator, a list, ...).

    See Also
    --------
    EnumeratedMolSupplier

    """
    def __init__(self, supplier):
        """Initialize a MolSupplier.

        Parameters
        ----------
        supplier : iterable
            An rdkit Mol Supplier.

        """
        self.supplier = supplier
        # Obtain an iterator explicitly so that plain iterables (e.g. lists),
        # which do not support next() themselves, can be wrapped as well.
        self._iterator = iter(supplier)
        self.cursor = 1  # 1-based position used in warning messages

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next item, logging a warning when it is None."""
        mol = next(self._iterator)
        if mol is None:
            logger.warning('Molecule {} could not be parsed'.format(
                self.cursor
            ))
        self.cursor += 1
        return mol
50 |
51 |
class EnumeratedMolSupplier(MolSupplier):
    """A MolSupplier that also reports how many molecules it holds.

    The known length makes the supplier usable with progress monitoring.

    Attributes
    ----------
    n : int
        The length of the supplier

    Notes
    -----
    Technically the supplier can be used with any iterable python object
    containing or supplying rdkit Mol objects.

    See Also
    --------
    MolSupplier

    """
    def __init__(self, supplier, length):
        """Initialize an EnumeratedMolSupplier.

        Parameters
        ----------
        supplier : iterable
            An rdkit Mol Supplier.
        length : int
            Number of Mols in the supplier.

        """
        super().__init__(supplier)
        self.n = length

    def __len__(self):
        # The length is whatever the caller supplied; it is not re-counted.
        return self.n
88 |
--------------------------------------------------------------------------------
/scaffoldgraph/io/tsv.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.io.tsv
3 |
4 | Contains functions for writing to TSV files.
5 | """
6 |
7 | import csv
8 |
9 |
def write_tsv(scaffold_graph, output_file, write_ids=False):
    """Write a ScaffoldGraph to a file in TSV format.

    Used by scaffoldgraphs CLI utility. One row is written per scaffold,
    in ascending order of hierarchy level.

    Parameters
    ----------
    scaffold_graph : scaffoldgraph.core.ScaffoldGraph
        A scaffold graph to write to a file.
    output_file : str
        Path to output file.
    write_ids : bool, optional
        If True, write the fields {'ID', 'HIERARCHY', 'SMILES',
        'SUBSCAFFOLDS'} else write the fields {'HIERARCHY',
        'SMILES', 'SUBSCAFFOLDS', 'MOLECULES', 'ANNOTATIONS'}.
        The aggregate CLI function uses write_ids=True, while
        the generation utilities use write_ids=False. The default
        is False.

    """
    sorted_scaffolds = sorted(
        scaffold_graph.get_scaffold_nodes(data=True),
        key=lambda x: x[1]['hierarchy'],
    )

    if write_ids:
        field_names = ['ID', 'HIERARCHY', 'SMILES', 'SUBSCAFFOLDS']
        # Scaffold ID == position in the hierarchy-sorted list.
        mapping = {
            node: idx for idx, (node, _) in enumerate(sorted_scaffolds)
        }
    else:
        field_names = ['HIERARCHY', 'SMILES', 'SUBSCAFFOLDS', 'MOLECULES', 'ANNOTATIONS']
        mapping = None

    # newline='' lets the csv module control line endings itself, avoiding
    # doubled carriage returns on platforms that translate '\n'.
    with open(output_file, 'w', newline='') as output:

        writer = csv.DictWriter(output, delimiter='\t', fieldnames=field_names)
        writer.writeheader()

        for node, data in sorted_scaffolds:
            line = dict.fromkeys(field_names)
            line['SMILES'] = node
            line['HIERARCHY'] = data['hierarchy']

            subscaffolds = list(scaffold_graph.predecessors(node))
            if write_ids:
                line['SUBSCAFFOLDS'] = ', '.join([str(mapping[s]) for s in subscaffolds])
                line['ID'] = str(mapping[node])
            else:
                line['SUBSCAFFOLDS'] = ', '.join(subscaffolds)
                molecules, annotations = [], set()
                for successor in scaffold_graph.successors(node):
                    try:
                        if scaffold_graph.nodes[successor]['type'] == 'molecule':
                            molecules.append(successor)
                            edge = scaffold_graph.edges[(node, successor)]
                            annotations.add(edge['annotation'])
                    except KeyError:
                        # Nodes without a type / edges without an annotation
                        # are tolerated; a molecule already recorded is kept.
                        continue
                line['MOLECULES'] = ', '.join(molecules)
                # Sorted so that the output does not depend on set ordering.
                line['ANNOTATIONS'] = ', '.join(sorted(annotations))

            writer.writerow(line)
70 |
--------------------------------------------------------------------------------
/scaffoldgraph/network.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.network
3 | """
4 |
5 | from .core import MurckoRingFragmenter, MurckoRingSystemFragmenter
6 | from .core import ScaffoldGraph
7 |
8 |
class ScaffoldNetwork(ScaffoldGraph):
    """
    Class representing a scaffold network.

    A scaffold network explores scaffold-space by iteratively removing
    every available ring, so that all possible sub-scaffolds of the input
    molecules are generated. The result is a directed acyclic graph of
    molecular scaffolds.

    Examples
    --------
    Build a ScaffoldNetwork from a SMILES file.

    >>> import scaffoldgraph as sg
    >>> network = sg.ScaffoldNetwork.from_smiles_file('my_file.smi', progress=True)
    >>> network.num_scaffold_nodes
    100

    Build a ScaffoldNetwork from an SDF.

    >>> network = sg.ScaffoldNetwork.from_sdf('my_file.sdf', progress=True)

    For a zipped SDF:

    >>> network = sg.ScaffoldNetwork.from_sdf('my_file.sdf.gz', zipped=True)

    List the scaffold nodes:

    >>> list(network.get_scaffold_nodes())
    ['O=C(OCOC(=O)c1cccc2ncn(Cc3ccccc3)c12)OC1CCCCC1',
     'O=C(OCOC(=O)c1cccc2nc[nH]c12)OC1CCCCC1',
     ...]

    List the scaffold nodes together with their attributes:

    >>> list(network.get_scaffold_nodes(data=True))
    [('O=C(OCOC(=O)c1cccc2ncn(Cc3ccccc3)c12)OC1CCCCC1', {'type': 'scaffold', 'hierarchy': 4}),
     ('O=C(OCOC(=O)c1cccc2nc[nH]c12)OC1CCCCC1', {'type': 'scaffold', 'hierarchy': 3}),
     ...]

    List the molecule nodes (pass data=True for attributes):

    >>> list(network.get_molecule_nodes())
    ['DB00006',
     'DB00007',
     'DB00014',
     ...]


    References
    ----------
    .. [1] Varin, T., Schuffenhauer, A., Ertl, P., and Renner, S. (2011). Mining for bioactive
           scaffolds with scaffold networks: Improved compound set enrichment from primary screening data.
           Journal of Chemical Information and Modeling, 51(7), 1528–1538.

    See Also
    --------
    ScaffoldGraph
    ScaffoldTree
    HierS

    """
    def __init__(self, graph=None, **kwargs):
        """Initialize a ScaffoldNetwork.

        Parameters
        ----------
        graph : input graph, optional
            Data to initialize graph. If None (default) an empty
            graph is created. The data can be any format that is supported
            by the ``to_networkx_graph()`` function, currently including
            edge list, dict of dicts, dict of lists, NetworkX graph,
            NumPy matrix or 2d ndarray, SciPy sparse matrix,
            or PyGraphviz graph. This argument is passed to the networkx
            DiGraph constructor.
        **kwargs :
            Accepted for signature compatibility; not used by this
            constructor.

        """
        fragmenter = MurckoRingFragmenter()
        super().__init__(graph, fragmenter, 'network')

    def _hierarchy_constructor(self, child):
        """Recursively link ``child`` to every parent scaffold it fragments into."""
        for parent in self.fragmenter.fragment(child):
            if not parent:
                # Falsy fragments are ignored.
                continue
            is_new = parent not in self.nodes
            if is_new:
                self.add_scaffold_node(parent)
            self.add_scaffold_edge(parent, child)
            # Only previously unseen scaffolds with more than one ring
            # are fragmented further.
            if is_new and parent.rings.count > 1:
                self._hierarchy_constructor(parent)
97 |
98 |
class HierS(ScaffoldGraph):
    """
    Class representing a HierS type scaffold network.

    Scaffold-space is explored by iteratively removing available rings,
    generating all possible sub-scaffolds without dissecting fused
    ring-systems.

    Notes
    -----
    A HierS type network differs from a conventional scaffold network in how
    it is constructed: when fragmenting molecules the HierS constructor does
    not attempt to break fused ring systems.

    Examples
    --------
    Build a HierS network from a SMILES file.

    >>> import scaffoldgraph as sg
    >>> network = sg.HierS.from_smiles_file('my_file.smi', progress=True)
    >>> network.num_scaffold_nodes
    92

    Build a HierS network from an SDF.

    >>> network = sg.HierS.from_sdf('my_file.sdf', progress=True)

    For a zipped SDF:

    >>> network = sg.HierS.from_sdf('my_file.sdf.gz', zipped=True)

    List the scaffold nodes:

    >>> list(network.get_scaffold_nodes())
    ['O=C(OCOC(=O)c1cccc2ncn(Cc3ccccc3)c12)OC1CCCCC1',
     'O=C(OCOC(=O)c1cccc2nc[nH]c12)OC1CCCCC1',
     ...]

    List the scaffold nodes together with their attributes:

    >>> list(network.get_scaffold_nodes(data=True))
    [('O=C(OCOC(=O)c1cccc2ncn(Cc3ccccc3)c12)OC1CCCCC1', {'type': 'scaffold', 'hierarchy': 4}),
     ('O=C(OCOC(=O)c1cccc2nc[nH]c12)OC1CCCCC1', {'type': 'scaffold', 'hierarchy': 3}),
     ...]

    List the molecule nodes (pass data=True for attributes):

    >>> list(network.get_molecule_nodes())
    ['DB00006',
     'DB00007',
     'DB00014',
     ...]

    References
    ----------
    .. [1] Wilkens, S., Janes, J., and Su, A. (2005). HierS: Hierarchical Scaffold Clustering
           Using Topological Chemical Graphs. Journal of Medicinal Chemistry, 48(9), 3182-3193.

    See Also
    --------
    ScaffoldGraph
    ScaffoldNetwork
    ScaffoldTree

    """
    def __init__(self, graph=None, **kwargs):
        """Initialize a HierS network.

        Parameters
        ----------
        graph : input graph, optional
            Data to initialize graph. If None (default) an empty
            graph is created. The data can be any format that is supported
            by the ``to_networkx_graph()`` function, currently including
            edge list, dict of dicts, dict of lists, NetworkX graph,
            NumPy matrix or 2d ndarray, SciPy sparse matrix,
            or PyGraphviz graph. This argument is passed to the networkx
            DiGraph constructor.
        **kwargs :
            Accepted for signature compatibility; not used by this
            constructor.

        """
        fragmenter = MurckoRingSystemFragmenter()
        super().__init__(graph, fragmenter, 'hiers')

    def _hierarchy_constructor(self, child):
        """Recursively link ``child`` to every parent scaffold it fragments into."""
        for parent in self.fragmenter.fragment(child):
            if not parent:
                # Falsy fragments are ignored.
                continue
            is_new = parent not in self.nodes
            if is_new:
                self.add_scaffold_node(parent)
            self.add_scaffold_edge(parent, child)
            # Only previously unseen scaffolds with more than one ring system
            # are fragmented further.
            if is_new and parent.ring_systems.count > 1:
                self._hierarchy_constructor(parent)
190 |
--------------------------------------------------------------------------------
/scaffoldgraph/prioritization/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.prioritization
3 |
4 | Contains functions for scaffold prioritization.
5 | """
6 |
7 | from .original_rules import original_ruleset
8 | from .prioritization_ruleset import ScaffoldRuleSet
9 | from .prioritization_rules import BaseScaffoldFilterRule, ScaffoldFilterRule, \
10 | ScaffoldMinFilterRule, ScaffoldMaxFilterRule
11 | from .generic_rules import *
12 |
13 |
14 | __all__ = [
15 | 'BaseScaffoldFilterRule',
16 | 'ScaffoldFilterRule',
17 | 'ScaffoldMinFilterRule',
18 | 'ScaffoldMaxFilterRule',
19 | 'ScaffoldRuleSet',
20 | 'original_ruleset',
21 | ]
22 |
--------------------------------------------------------------------------------
/scaffoldgraph/prioritization/original_rules.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.prioritization.original_rules
3 |
4 | Implements rules from the paper:
5 | 'The Scaffold Tree − Visualization of the Scaffold Universe by Hierarchical Scaffold Classification'
6 | """
7 |
8 | from itertools import chain
9 |
10 | from rdkit.Chem import MolFromSmarts
11 |
12 | from scaffoldgraph.prioritization.prioritization_ruleset import ScaffoldRuleSet
13 | from scaffoldgraph.core.fragment import collect_linker_atoms
14 |
15 | from .prioritization_rules import *
16 |
17 |
18 | __all__ = [
19 | 'OriginalRule01',
20 | 'OriginalRule02',
21 | 'OriginalRule03',
22 | 'OriginalRule04',
23 | 'OriginalRule05',
24 | 'OriginalRule06',
25 | 'OriginalRule07',
26 | 'OriginalRule08',
27 | 'OriginalRule09a',
28 | 'OriginalRule09b',
29 | 'OriginalRule09c',
30 | 'OriginalRule10',
31 | 'OriginalRule11',
32 | 'OriginalRule12',
33 | 'OriginalRule13',
34 | 'original_ruleset',
35 | ]
36 |
37 |
class OriginalRule01(ScaffoldFilterRule):
    """Remove heterocycles of size 3 first.

    The condition holds for a parent when the ring removed to produce it
    has three atoms, exactly one of which is neither hydrogen nor carbon.
    """

    def condition(self, child, parent):
        ring = child.rings[parent.removed_ring_idx]
        hetero_count = sum(
            1 for atom in ring.atoms if atom.GetAtomicNum() not in (1, 6)
        )
        return ring.size == 3 and hetero_count == 1

    @property
    def name(self):
        return 'original rule 01'
50 |
51 |
class OriginalRule02(ScaffoldFilterRule):
    """Do not remove rings with >= 12 atoms if there are smaller rings to remove.

    The condition holds when the removed ring has fewer than 12 atoms.
    """

    def condition(self, child, parent):
        ring_size = child.rings[parent.removed_ring_idx].size
        return ring_size < 12

    @property
    def name(self):
        return 'original rule 02'
62 |
63 |
class OriginalRule03(ScaffoldMinFilterRule):
    """Choose the parent scaffold with the smallest number of acyclic linker bonds.

    The property is the number of substructure matches of the acyclic
    linker SMARTS pattern in the parent scaffold.
    """

    # Compiled once at class creation and shared by all instances.
    acyc_linker_smarts = MolFromSmarts('*!@!=!#*')

    def get_property(self, child, parent):
        linker_matches = parent.mol.GetSubstructMatches(self.acyc_linker_smarts)
        n_linker_bonds = len(linker_matches)
        return n_linker_bonds

    @property
    def name(self):
        return 'original rule 03'
76 |
77 |
class OriginalRule04(ScaffoldMaxFilterRule):
    """Retain bridged rings, spiro rings and nonlinear fusion patterns with preference.

    The property is the absolute difference between the number of ring
    bonds shared between rings and (number of rings - 1).
    """

    def get_property(self, child, parent):
        ring_count = parent.rings.count
        ring_bonds = list(chain.from_iterable(parent.rings.bond_rings))
        # Bonds listed more than once belong to more than one ring.
        shared_bonds = len(ring_bonds) - len(set(ring_bonds))
        delta = shared_bonds - (ring_count - 1)
        return abs(delta)

    @property
    def name(self):
        return 'original rule 04'
90 |
91 |
class OriginalRule05(ScaffoldFilterRule):
    """Bridged ring systems retained with preference over spiro rings.

    Parents with a positive signed delta (shared ring bonds minus
    (number of rings - 1)) satisfy the condition.
    """

    def condition(self, child, parent):
        ring_count = parent.rings.count
        ring_bonds = list(chain.from_iterable(parent.rings.bond_rings))
        # Bonds listed more than once belong to more than one ring.
        shared_bonds = len(ring_bonds) - len(set(ring_bonds))
        signed_delta = shared_bonds - (ring_count - 1)
        return signed_delta >= 1

    @property
    def name(self):
        return 'original rule 05'
106 |
107 |
class OriginalRule06(ScaffoldFilterRule):
    """Remove rings of size 3, 5 and 6 first.

    The condition holds when the removed ring has 3, 5 or 6 atoms.
    """

    def condition(self, child, parent):
        removed_ring = child.rings[parent.removed_ring_idx]
        return removed_ring.size in (3, 5, 6)

    @property
    def name(self):
        return 'original rule 06'
118 |
119 |
class OriginalRule07(BaseScaffoldFilterRule):
    """A fully aromatic ring system must not be dissected in a way that the
    resulting system is not aromatic anymore.

    UNIMPLEMENTED
        The rule is a pass-through: every parent is returned unchanged.
        It is tricky to implement and should probably be handled during the
        fragmentation process, although for efficiency it might be better to
        ignore it. rdkit seems to catch many of these cases in the partial
        sanitization as no attempt is made to change atom types when this
        event occurs. (SNG also skips this step).
    """

    def filter(self, child, parents):
        # Intentionally no filtering; see class docstring.
        return parents

    @property
    def name(self):
        return 'original rule 07'
137 |
138 |
class OriginalRule08(ScaffoldMinFilterRule):
    """Remove rings with the least hetero atoms first.

    The property is the number of atoms in the removed ring that are
    neither hydrogen nor carbon.
    """

    def get_property(self, child, parent):
        ring = child.rings[parent.removed_ring_idx]
        return sum(
            1 for atom in ring.atoms if atom.GetAtomicNum() not in (1, 6)
        )

    @property
    def name(self):
        return 'original rule 08'
150 |
151 |
class OriginalRule09a(ScaffoldMinFilterRule):
    """Remove scaffolds with least nitrogen atoms in deleted ring.

    The property is the number of nitrogen atoms (atomic number 7)
    in the removed ring.
    """

    def get_property(self, child, parent):
        ring = child.rings[parent.removed_ring_idx]
        return sum(1 for atom in ring.atoms if atom.GetAtomicNum() == 7)

    @property
    def name(self):
        return 'original rule 09a'
163 |
164 |
class OriginalRule09b(ScaffoldMinFilterRule):
    """Remove scaffolds with least oxygen atoms in deleted ring.

    The property is the number of oxygen atoms (atomic number 8)
    in the removed ring.
    """

    def get_property(self, child, parent):
        ring = child.rings[parent.removed_ring_idx]
        return sum(1 for atom in ring.atoms if atom.GetAtomicNum() == 8)

    @property
    def name(self):
        return 'original rule 09b'
176 |
177 |
class OriginalRule09c(ScaffoldMinFilterRule):
    """Remove scaffolds with least sulphur atoms in deleted ring.

    The property is the number of sulphur atoms (atomic number 16)
    in the removed ring.
    """

    def get_property(self, child, parent):
        ring = child.rings[parent.removed_ring_idx]
        return sum(1 for atom in ring.atoms if atom.GetAtomicNum() == 16)

    @property
    def name(self):
        return 'original rule 09c'
189 |
190 |
class OriginalRule10(ScaffoldMinFilterRule):
    """Smaller rings are removed first.

    The property is the size of the removed ring.
    """

    def get_property(self, child, parent):
        removed_ring = child.rings[parent.removed_ring_idx]
        return removed_ring.size

    @property
    def name(self):
        return 'original rule 10'
200 |
201 |
class OriginalRule11(ScaffoldFilterRule):
    """Retain non-aromatic rings with preference.

    The condition holds when every bond of the removed ring is aromatic,
    i.e. when an aromatic ring was removed.
    """

    def condition(self, child, parent):
        ring = child.rings[parent.removed_ring_idx]
        for bond in ring.bonds:
            if not bond.GetIsAromatic():
                return False
        return True

    @property
    def name(self):
        return 'original rule 11'
212 |
213 |
class OriginalRule12(ScaffoldFilterRule):
    """Remove rings first where the linker is attached to a ring hetero atom
    at either end of the linker.

    The condition holds when at least one of the atoms collected from the
    removed ring's attachment points is neither hydrogen nor carbon.
    """

    def condition(self, child, parent):
        linker_atoms = set()       # filled in by collect_linker_atoms
        ring_attachments = set()   # atom indices returned by the helper
        ring = child.rings[parent.removed_ring_idx]
        for attachment in ring.get_attachment_points():
            start_atom = child.mol.GetAtomWithIdx(attachment)
            ring_attachments.update(
                collect_linker_atoms(start_atom, linker_atoms, False)
            )
        return any(
            child.atoms[idx].GetAtomicNum() not in (1, 6)
            for idx in ring_attachments
        )

    @property
    def name(self):
        return 'original rule 12'
229 |
230 |
class OriginalRule13(BaseScaffoldFilterRule):
    """Tie-breaker rule (alphabetical).

    Returns a single-element list holding the parent whose SMILES string
    sorts first.
    """

    def filter(self, child, parents):
        ordered = sorted(parents, key=lambda parent: parent.smiles)
        first = ordered[0]
        return [first]

    @property
    def name(self):
        return 'original rule 13'
240 |
241 |
def _make_original_rules():
    """list: Build one instance of every original rule, in application order."""
    rule_classes = (
        OriginalRule01,
        OriginalRule02,
        OriginalRule03,
        OriginalRule04,
        OriginalRule05,
        OriginalRule06,
        OriginalRule07,
        OriginalRule08,
        OriginalRule09a,
        OriginalRule09b,
        OriginalRule09c,
        OriginalRule10,
        OriginalRule11,
        OriginalRule12,
        OriginalRule13,
    )
    return [rule_cls() for rule_cls in rule_classes]
262 |
263 |
# This is the ruleset used by the original scaffold tree publication.
# It is built once at import time from fresh instances of OriginalRule01-13.
original_ruleset = ScaffoldRuleSet(_make_original_rules(), name='Original Rules')
266 |
--------------------------------------------------------------------------------
/scaffoldgraph/prioritization/prioritization_rules.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.prioritization.prioritization_rules
3 |
4 | Implements abstract rules for scaffold prioritization when constructing scaffold trees.
5 | """
6 |
7 | from abc import ABCMeta, abstractmethod
8 | from itertools import compress
9 |
10 | __all__ = [
11 | 'BaseScaffoldFilterRule',
12 | 'ScaffoldFilterRule',
13 | 'ScaffoldMaxFilterRule',
14 | 'ScaffoldMinFilterRule',
15 | ]
16 |
17 |
class BaseScaffoldFilterRule(metaclass=ABCMeta):
    """Abstract base class for scaffold prioritization rules.

    Every scaffold filter rule should derive from this class and provide
    both the ``filter`` method and the ``name`` property. Instances are
    callable: calling a rule is the same as calling its ``filter`` method.

    """
    @abstractmethod
    def filter(self, child, parents):
        """Filter a set of input scaffolds (parents).

        Implementations apply their rule to the candidate parent scaffolds
        and return the ones that remain.

        A rule may use properties of the parent scaffolds themselves or of
        the child scaffold from which they were obtained, e.g. properties
        of the ring that was removed:

            # Get index of removed ring from a parent scaffold
            >>> removed_ring_idx = parents[0].removed_ring_idx

            # Get the removed ring from the child scaffold
            >>> removed_ring = child.rings[removed_ring_idx]

            # calculate property (i.e. number of carbon atoms in ring)
            >>> prop = [a.GetAtomicNum() for a in removed_ring.atoms].count(6)

        Parameters
        ----------
        child : scaffoldgraph.core.Scaffold
            The child scaffold from which the parent scaffolds were obtained.
        parents : iterable
            An iterable of all parent scaffolds generated by a fragmenter.

        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def name(self):
        """Return the name of the filter rule.

        Concrete rules must override this property with a descriptive name;
        it is what ``str(rule)`` returns.

        """
        raise NotImplementedError()

    def __call__(self, child, parents):
        # Calling the rule delegates to filter().
        return self.filter(child, parents)

    def __str__(self):
        return str(self.name)

    def __repr__(self):
        return '<{} at {}>'.format(type(self).__name__, hex(id(self)))
77 |
78 |
class ScaffoldFilterRule(BaseScaffoldFilterRule):
    """
    Abstract base class for prioritization rules expressed as a
    True/False test on each parent scaffold.

    Subclasses implement ``condition``, returning a boolean for a given
    child/parent pair, and the ``name`` property. ``filter`` keeps the
    parents for which ``condition`` is truthy.

    Example
    -------
    >>> class MyRule(ScaffoldFilterRule):
    ...
    ...     def condition(self, child, parent):
    ...         if parent ... :
    ...             return True
    ...         return False
    ...
    ...     @property
    ...     def name(self):
    ...         return 'my conditional rule'

    """
    def filter(self, child, parents):
        """Return the parents that satisfy ``condition``.

        Parameters
        ----------
        child : scaffoldgraph.core.Scaffold
            The child scaffold from which the parent scaffolds were obtained.
        parents : iterable
            An iterable of all parent scaffolds generated by a fragmenter.

        Returns
        -------
        list
            Parents for which ``condition(child, parent)`` is truthy,
            in their original order.

        """
        retained = []
        for candidate in parents:
            if self.condition(child, candidate):
                retained.append(candidate)
        return retained

    @abstractmethod
    def condition(self, child, parent):
        """Decide whether a parent scaffold should be retained.

        Must be implemented by subclasses.

        Parameters
        ----------
        child : scaffoldgraph.core.Scaffold
            The child scaffold from which the parent scaffolds were obtained.
        parent : scaffoldgraph.core.Scaffold
            A parent scaffold.

        """
        raise NotImplementedError()
131 |
132 |
class ScaffoldMinFilterRule(BaseScaffoldFilterRule):
    """
    Abstract base class for defining rules for scaffold prioritization
    based on a minimum property value.

    Subclasses should implement the ``get_property`` method, where a property
    value is returned for a particular input scaffold. Scaffolds with a
    property value equal to the minimum property value will be retained.
    Subclasses should also implement the ``name`` property.

    Example
    -------
    >>> class MyRule(ScaffoldMinFilterRule):
    ...
    ...     def get_property(self, child, parent):
    ...         prop = get_some_property(parent)
    ...         return prop
    ...
    ...     @property
    ...     def name(self):
    ...         return 'my min conditional rule'

    """
    def filter(self, child, parents):
        """Filter a set of parent scaffolds using a minimum property value.

        Parameters
        ----------
        child : scaffoldgraph.core.Scaffold
            The child scaffold from which the parent scaffolds were obtained.
        parents : iterable
            An iterable of all parent scaffolds generated by a fragmenter.

        Returns
        -------
        list
            Parents whose property equals the minimum over all parents,
            in their original order. Empty if ``parents`` is empty.

        """
        # Materialize once: ``parents`` is walked twice below, which would
        # silently yield nothing for a one-shot iterator.
        candidates = list(parents)
        if not candidates:
            return []  # nothing to rank; min() would raise on empty input
        props = [self.get_property(child, s) for s in candidates]
        min_val = min(props)
        return [s for s, p in zip(candidates, props) if p == min_val]

    @abstractmethod
    def get_property(self, child, parent):
        """Return a property value for a child/parent scaffold.

        Subclasses should implement this method.

        Parameters
        ----------
        child : scaffoldgraph.core.Scaffold
            The child scaffold from which the parent scaffolds were obtained.
        parent : scaffoldgraph.core.Scaffold
            A parent scaffold.

        """
        raise NotImplementedError()
186 |
187 |
class ScaffoldMaxFilterRule(BaseScaffoldFilterRule):
    """Abstract base class for defining rules for scaffold prioritization
    based on a maximum property value.

    Subclasses should implement the ``get_property`` method, where a property
    value is returned for a particular input scaffold. Scaffolds with a
    property value equal to the maximum property value will be retained.
    Subclasses should also implement the ``name`` property.

    Example
    -------
    >>> class MyRule(ScaffoldMaxFilterRule):
    ...
    ...     def get_property(self, child, parent):
    ...         prop = get_some_property(parent)
    ...         return prop
    ...
    ...     @property
    ...     def name(self):
    ...         return 'my max conditional rule'

    """
    def filter(self, child, parents):
        """Filter a set of parent scaffolds using a maximum property value.

        Parameters
        ----------
        child : scaffoldgraph.core.Scaffold
            The child scaffold from which the parent scaffolds were obtained.
        parents : iterable
            An iterable of all parent scaffolds generated by a fragmenter.

        Returns
        -------
        list
            Parents whose property equals the maximum over all parents,
            in their original order. Empty if ``parents`` is empty.

        """
        # Materialize once: ``parents`` is walked twice below, which would
        # silently yield nothing for a one-shot iterator.
        candidates = list(parents)
        if not candidates:
            return []  # nothing to rank; max() would raise on empty input
        props = [self.get_property(child, s) for s in candidates]
        max_val = max(props)
        return [s for s, p in zip(candidates, props) if p == max_val]

    @abstractmethod
    def get_property(self, child, parent):
        """Return a property value for a child/parent scaffold.

        Subclasses should implement this method.

        Parameters
        ----------
        child : scaffoldgraph.core.Scaffold
            The child scaffold from which the parent scaffolds were obtained.
        parent : scaffoldgraph.core.Scaffold
            A parent scaffold.

        """
        raise NotImplementedError()
240 |
--------------------------------------------------------------------------------
/scaffoldgraph/prioritization/prioritization_ruleset.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.prioritization.prioritization_ruleset
3 |
4 | Implements a ruleset for scaffold prioritization when constructing scaffold trees.
5 | """
6 |
7 | from .prioritization_rules import BaseScaffoldFilterRule
8 |
9 |
class ScaffoldRuleSet(object):
    """
    Class defining a set of rules used for scaffold prioritization.

    Rules added to the rule set must subclass the BaseScaffoldFilterRule.
    Rules are applied in the order they are stored; the set supports
    ``len()``, indexing, item assignment and calling (an alias for
    ``filter_scaffolds``).

    """
    def __init__(self, rules=None, name=None):
        """
        Initialize a rule set with an iterable of rules and an
        optional name.

        Parameters
        ----------
        rules : iterable, optional
            An iterable of rules. The default is None.
        name : str, optional
            Name of rule set. The default is None, in which case
            the name 'ScaffoldRuleSet' is used.

        Raises
        ------
        TypeError
            Raised if any supplied rule is not a valid filter rule.

        """
        self._rules = []
        if rules is not None:
            for rule in rules:
                self.add_rule(rule)
        self.name = name if name else 'ScaffoldRuleSet'

    def __call__(self, child, parents):
        return self.filter_scaffolds(child, parents)

    @property
    def rules(self):
        """list : Return rules as a list."""
        return self._rules

    def filter_scaffolds(self, child, parents):
        """Filter a set of parent scaffolds using the defined rules.

        Method is called internally by scaffold graph constructors.
        __call__ is an alias for this function.

        The rules are applied in order to the current set of candidates.
        A rule that returns an empty (falsy) result is ignored. As soon as
        one candidate remains it is returned, with its
        ``prioritization_rule`` attribute set to the name of the rule that
        singled it out ('last remaining' if only one parent was supplied).

        Parameters
        ----------
        child : scaffoldgraph.core.Scaffold
            Child scaffold.
        parents : iterable
            Parent scaffolds. The supplied container is not modified.

        Returns
        -------
        parent : scaffoldgraph.core.Scaffold
            The scaffold retained after filtering.

        Raises
        ------
        ValueError
            Raised if the ruleset contains no rules.
        ValueError
            Raised if the iterable of parent scaffolds
            is empty.
        ValueError
            Raised if more than one scaffold is left after
            all of the filter rules are evaluated. The RuleSet
            may require a tie-breaker rule.

        """
        if len(self) == 0:
            raise ValueError('No rules defined in rule set')
        # Work on a private copy so any iterable is accepted and the
        # caller's container is never mutated.
        remaining = list(parents)
        if not remaining:
            raise ValueError('No parent scaffolds supplied to filter')
        if len(remaining) == 1:
            parent = remaining[0]
            parent.prioritization_rule = 'last remaining'
            return parent
        for rule in self._rules:
            filtered = rule.filter(child, remaining)
            if filtered:
                # A rule that rejects everything is skipped rather than
                # allowed to empty the candidate set.
                remaining = list(filtered)
            if len(remaining) == 1:
                parent = remaining[0]
                parent.prioritization_rule = rule.name
                return parent
        raise ValueError('Filter error, more than one remaining scaffold '
                         'after filter rules applied. Rule set may require '
                         'a tie-breaker rule')

    def add_rule(self, rule):
        """Appends a rule to the ruleset.

        Parameters
        ----------
        rule : BaseScaffoldFilterRule
            Scaffold filter rule with base class ``BaseScaffoldFilterRule``.

        Raises
        ------
        TypeError
            Raised if ``rule`` is not a valid filter rule.

        """
        if not self.check_valid_rule(rule):
            raise TypeError('rule must be a subclass of BaseScaffoldRule')
        self._rules.append(rule)

    def insert_rule(self, rule, index):
        """Inserts a rule into the ruleset at supplied index.

        Parameters
        ----------
        rule : BaseScaffoldFilterRule
            Scaffold filter rule with base class ``BaseScaffoldFilterRule``.
        index : int
            Position in list to insert rule.

        Raises
        ------
        TypeError
            Raised if ``rule`` is not a valid filter rule.

        """
        if not self.check_valid_rule(rule):
            raise TypeError('rule must be a subclass of BaseScaffoldRule')
        self._rules.insert(index, rule)

    def delete_rule(self, index):
        """Deletes a rule from the ruleset at supplied index.

        Parameters
        ----------
        index : int
            Position in list to delete rule.

        """
        del self._rules[index]

    @classmethod
    def from_rule_file(cls, filename, name=None):
        """Create a scaffold rule set from a rule set file.

        A rule set file is a text file specifying the names of
        rules to include in the ruleset separated by new lines.
        The rule names must belong to either the original set
        or the generic set. The name of the rule corresponds to
        the class name of the desired rule. i.e. for OriginalRule01
        the file should contain the string OriginalRule01 followed
        by a new line. When including generic rules, min or max
        can be specified by including min or max after the name
        separated by an underscore. i.e. RRPNumHetAtoms_min.
        For Rules which contain further arguments, these can be
        appended to the name with underscores. i.e.
        RRPRingSizeX_max_6. In this case the rule will prioritize
        scaffolds where the removed rings size is equal to 6.

        Parameters
        ----------
        filename : str
            File name of the rule set file.
        name : str, optional
            Name to assign rule set.

        See Also
        --------
        scaffoldgraph.prioritization.original_rules
        scaffoldgraph.prioritization.generic_rules

        """
        # Imported here rather than at module level; presumably to avoid a
        # circular import with the rule modules -- TODO confirm.
        from .rule_io import read_rule_file
        rules = read_rule_file(filename)
        return cls(rules, name)

    @staticmethod
    def check_valid_rule(rule):
        """bool : Returns True if rule is a valid scaffold filter rule."""
        return isinstance(rule, BaseScaffoldFilterRule)

    def __getitem__(self, index):
        return self._rules[index]

    def __setitem__(self, index, rule):
        # Validate first and only raise for invalid rules; a valid rule
        # replaces the entry at ``index``.
        if not self.check_valid_rule(rule):
            raise TypeError('rule must be a subclass of BaseScaffoldRule')
        self._rules[index] = rule

    def __len__(self):
        return len(self._rules)

    def __repr__(self):
        return '<{_cls} at {address}>'.format(
            _cls=self.__class__.__name__,
            address=hex(id(self))
        )
193 |
--------------------------------------------------------------------------------
/scaffoldgraph/prioritization/rule_io.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.prioritization.rule_io
3 |
4 | Functions for reading prioritization rulesets from a file.
5 | Can be used to specify custom rulesets when using the CLI.
6 | """
7 |
8 | from scaffoldgraph.prioritization.original_rules import *
9 | from scaffoldgraph.prioritization.generic_rules import *
10 |
11 |
# Lookup table from lower-cased rule name to rule class. ``read_rule_file``
# lower-cases the name read from the rule file before looking it up here, so
# rule names in a rule file are matched case-insensitively.
rule_name_to_class = {
    'scpabsdelta': SCPAbsDelta,
    'scpdelta': SCPDelta,
    'scpnumlinkerbonds': SCPNumLinkerBonds,
    'scpnumaromaticrings': SCPNumAromaticRings,
    'scpnumhetatoms': SCPNumHetAtoms,
    'scpnumnatoms': SCPNumNAtoms,
    'scpnumoatoms': SCPNumOAtoms,
    'scpnumsatoms': SCPNumSAtoms,
    'scpnumxatoms': SCPNumXAtoms,
    'rrpringsize': RRPRingSize,
    'rrplinkerlength': RRPLinkerLength,
    'rrphetatomlinked': RRPHetAtomLinked,
    'rrplinkerlengthx': RRPLinkerLengthX,
    'rrpnumhetatoms': RRPNumHetAtoms,
    'rrpnumnatoms': RRPNumNAtoms,
    'rrpnumoatoms': RRPNumOAtoms,
    'rrpnumsatoms': RRPNumSAtoms,
    'rrpnumxatoms': RRPNumXAtoms,
    'rrpringsizex': RRPRingSizeX,
    'rspabsdelta': RSPAbsDelta,
    'rspdelta': RSPDelta,
    'rspnumaromaticrings': RSPNumAromaticRings,
    'rspnumhetatoms': RSPNumHetAtoms,
    'rspnumnatoms': RSPNumNAtoms,
    'rspnumoatoms': RSPNumOAtoms,
    'rspnumsatoms': RSPNumSAtoms,
    'rspnumxatoms': RSPNumXAtoms,
    'tiebreaker': Tiebreaker,
    'originalrule01': OriginalRule01,
    'originalrule02': OriginalRule02,
    'originalrule03': OriginalRule03,
    'originalrule04': OriginalRule04,
    'originalrule05': OriginalRule05,
    'originalrule06': OriginalRule06,
    'originalrule07': OriginalRule07,
    'originalrule08': OriginalRule08,
    'originalrule09a': OriginalRule09a,
    'originalrule09b': OriginalRule09b,
    'originalrule09c': OriginalRule09c,
    'originalrule10': OriginalRule10,
    'originalrule11': OriginalRule11,
    'originalrule12': OriginalRule12,
    'originalrule13': OriginalRule13,
}
57 |
58 |
def read_rule_file(filename):
    """Read rules from a file.

    Each non-blank line names one rule, optionally followed by
    underscore-separated arguments (e.g. ``RRPNumHetAtoms_min`` or
    ``RRPRingSizeX_max_6``). The rule name is matched case-insensitively
    against ``rule_name_to_class``. The first argument is passed to the
    rule class as a string; any further arguments are converted to ``int``.
    Blank lines are ignored.

    Parameters
    ----------
    filename : str
        Name of rule file.

    Returns
    -------
    list
        list of rule objects, in file order.

    Raises
    ------
    ValueError
        Raised if any of the rules defined in the
        rule file are not implemented, or if an extra
        argument cannot be converted to an integer.

    """
    rules = []
    with open(filename, 'r') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # ''.split('_') yields [''], never an empty list, so blank
                # lines must be detected before splitting.
                continue
            tokens = entry.split('_')
            rule_name = tokens[0]
            rule_cls = rule_name_to_class.get(rule_name.lower())
            if rule_cls is None:
                raise ValueError(f'Rule {rule_name} is not defined')
            if len(tokens) > 2:
                # name_mode_arg1_arg2...: arguments after the mode are ints
                rule = rule_cls(tokens[1], *map(int, tokens[2:]))
            else:
                rule = rule_cls(*tokens[1:])
            rules.append(rule)
    return rules
96 |
--------------------------------------------------------------------------------
/scaffoldgraph/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.scripts
3 |
4 | scaffoldgraph CLI utility
5 | """
6 |
--------------------------------------------------------------------------------
/scaffoldgraph/scripts/generate.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.scripts.generate
3 | """
4 |
5 | import datetime
6 | import time
7 |
8 | from loguru import logger
9 |
10 | from scaffoldgraph import ScaffoldNetwork, ScaffoldTree, HierS
11 | from scaffoldgraph.prioritization import ScaffoldRuleSet
12 | from scaffoldgraph.io import tsv
13 |
14 | from .misc import file_format
15 |
# Banner printed by ``generate_cli`` before graph generation starts (skipped
# when ``args.silent`` is set). Placeholders are filled via ``str.format``.
start_message = """
Running ScaffoldGraph ({command}) Generation with options:
Input file: {input}
Output file: {output}
Maximum rings: {max_r}
Flatten isotopes: {isotope}
Keep largest Fragment: {fragment}
Discharge & Deradicalize: {discharge}
"""

# Summary printed by ``generate_cli`` once the graph has been written (skipped
# when ``args.silent`` is set). Placeholders are filled via ``str.format``.
stop_message = """
ScaffoldGraph Generation Complete:
Molecules written: {molecules}
Scaffolds written: {scaffolds}
Molecules filtered: {filtered}
Linear molecules: {linear}
Time elapsed: {time}

Output saved @ {output}
"""
36 |
37 |
def _get_graph_cls(name):
    """Map a CLI command name to its scaffold graph class.

    Recognised names are 'network', 'tree' and 'hiers'; any other
    value raises a ValueError.
    """
    if name == 'network':
        return ScaffoldNetwork
    if name == 'tree':
        return ScaffoldTree
    if name == 'hiers':
        return HierS
    raise ValueError(f'scaffold graph type: {name} not known')
49 |
50 |
def _maybe_ruleset(args):
    """Build a ScaffoldRuleSet from the CLI ``ruleset`` argument, if given.

    Returns None when ``args`` has no ``ruleset`` entry or it is None;
    otherwise the rule set is loaded from the named rule file.
    """
    if 'ruleset' not in args or args.ruleset is None:
        return None
    rule_file = args.ruleset
    return ScaffoldRuleSet.from_rule_file(rule_file)
58 |
59 |
def generate_cli(args):
    """Run scaffoldgraph generation for CLI utility.

    Builds the graph type selected by ``args.command`` from the SDF or
    SMILES file at ``args.input``, writes it as TSV to ``args.output`` and,
    unless ``args.silent`` is set, prints a start banner and a summary.

    Raises
    ------
    ValueError
        Raised if the input file format is not SDF or SMILES.
    """
    graph_cls = _get_graph_cls(args.command)
    graph_name = graph_cls.__name__
    ruleset = _maybe_ruleset(args)

    if not args.silent:
        banner = start_message.format(
            command=graph_name,
            input=args.input,
            output=args.output,
            max_r=args.max_rings,
            isotope=args.flatten_isotopes,
            fragment=args.keep_largest_fragment,
            discharge=args.discharge_and_deradicalize,
        )
        print(banner)

    logger.info(f'Generating {graph_name} Graph...')
    fmt, zipped = file_format(args.input)
    start = time.time()

    # Pick the reader for the detected format; only the SDF reader is
    # told whether the input is compressed.
    if fmt == 'SDF':
        reader = graph_cls.from_sdf
        format_options = {'zipped': zipped}
    elif fmt == 'SMI':
        reader = graph_cls.from_smiles_file
        format_options = {}
    else:
        raise ValueError('input file format is not currently supported')

    sg = reader(
        args.input,
        ring_cutoff=args.max_rings,
        progress=args.silent is False,
        flatten_isotopes=args.flatten_isotopes,
        keep_largest_fragment=args.keep_largest_fragment,
        discharge_and_deradicalize=args.discharge_and_deradicalize,
        prioritization_rules=ruleset,
        **format_options
    )

    tsv.write_tsv(sg, args.output, write_ids=False)
    logger.info(f'{graph_name} Graph Generation Complete...')
    elapsed = datetime.timedelta(seconds=round(time.time() - start))
    filtered = sg.graph['num_filtered']
    linear = sg.graph['num_linear']

    if not args.silent:
        summary = stop_message.format(
            molecules=sg.num_molecule_nodes,
            scaffolds=sg.num_scaffold_nodes,
            filtered=filtered,
            linear=linear,
            time=elapsed,
            output=args.output
        )
        print(summary)
124 |
--------------------------------------------------------------------------------
/scaffoldgraph/scripts/misc.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.scripts.misc
3 | """
4 |
5 | import logging
6 | import os
7 |
8 | import tqdm
9 |
10 |
class TqdmHandler(logging.Handler):
    """Logging handler that emits records through ``tqdm.tqdm.write``.

    Used by the CLI so log lines are written via tqdm.
    """

    def __init__(self, level=logging.NOTSET):
        super().__init__(level)

    def emit(self, record):
        """Format ``record`` and write it with ``tqdm.tqdm.write``."""
        try:
            text = self.format(record)
            tqdm.tqdm.write(text)
            self.flush()
        except Exception:
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so they still propagate to the caller untouched.
            self.handleError(record)
26 |
27 |
def file_format(path):
    """Determine an input file format from a path.

    The format is inferred from the file extension, compared
    case-insensitively. A ``.gz``/``.gzip`` suffix is stripped and the
    remaining extension is inspected.

    Parameters
    ----------
    path : str
        Path (or file name) of the input file.

    Returns
    -------
    tuple
        ``(format, zipped)`` where ``format`` is 'SDF', 'SMI' or None
        if the format is not recognised, and ``zipped`` is True only
        for a recognised format with a gzip suffix.

    """
    split_path, extension = os.path.splitext(path)
    # Extensions are matched case-insensitively so that e.g. 'mols.SDF'
    # is recognised as well as 'mols.sdf'.
    extension = extension.lower()
    if extension == '.sdf':
        return 'SDF', False
    if extension == '.smi':
        return 'SMI', False
    if extension in ('.gz', '.gzip'):
        inner_format, _ = file_format(split_path)
        if inner_format is not None:
            return inner_format, True
    return None, False
43 |
--------------------------------------------------------------------------------
/scaffoldgraph/scripts/run.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.scripts.run
3 |
4 | Module defines the CLI utility for ScaffoldGraph.
5 | """
6 |
7 | import argparse
8 | import logging
9 | import sys
10 |
11 | from loguru import logger
12 |
13 | from scaffoldgraph import __version__
14 | from .generate import generate_cli
15 | from .misc import TqdmHandler
16 | from .operations import select_cli, aggregate_cli
17 |
# Program title including the installed package version.
title = f"ScaffoldGraph {__version__}"
# Description passed to the top-level argument parser.
desc = "Generate Scaffold Networks and Scaffold Trees."

# Log line format, passed as the 'format' entry of the handler
# configuration below.
tqdm_format = "scaffold-graph: "
tqdm_format += "{time:HH:mm:ss} "
tqdm_format += "{process} "
tqdm_format += "{level}: "
tqdm_format += "{message}"

# Handler configuration given to ``logger.configure``. ``configure_logger``
# mutates the 'level' entry and the sink's level in place.
tqdm_handler = {
    'sink': TqdmHandler(logging.NOTSET),
    'format': tqdm_format,
    'level': 'INFO'
}

usage = 'scaffoldgraph []'
34 |
35 |
def configure_logger(verbosity):
    """Point the scaffoldgraph CLI logger at the tqdm handler.

    Parameters
    ----------
    verbosity : int
        Output verbosity, from 0 (least verbose, 'CRITICAL') to
        4 (most verbose, 'DEBUG'). Any other value selects the
        maximum verbosity.

    """
    logger.enable('scaffoldgraph')

    # Position in this tuple is the verbosity value it corresponds to.
    level_names = ('CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG')
    selected = 'DEBUG'  # fallback for values outside 0..4
    for value, level_name in enumerate(level_names):
        if verbosity == value:
            selected = level_name
            break

    # Keep the stdlib handler and the handler configuration in step.
    tqdm_handler['sink'].level = getattr(logging, selected)
    tqdm_handler['level'] = selected

    config = {'handlers': [tqdm_handler]}
    logger.configure(**config)
71 |
72 |
def parent_parser():
    """Build the parser holding options shared by every scaffoldgraph command.

    Provides ``-v/--verbosity`` (int 0-4, default 3) and ``-s/--silent``.
    Help is disabled so the parser can be used as a parent parser.
    """
    common = argparse.ArgumentParser(add_help=False)
    verbosity_levels = [0, 1, 2, 3, 4]
    common.add_argument(
        '-v', '--verbosity',
        metavar='',
        type=int,
        default=3,
        choices=verbosity_levels,
        help='set logger verbosity [0, 1, 2, 3, 4] (default: 3)',
    )
    common.add_argument(
        '-s', '--silent',
        action='store_true',
        help='silence console output (default: False)',
    )
    return common
80 |
81 |
def generate_parent_parser():
    """Creates a parent parser for generate commands (Network, Tree, HierS).

    Defines the positional ``input`` and ``output`` arguments and the
    scaffold initialization options (``--max-rings``, ``--flatten-isotopes``,
    ``--keep_largest_fragment``, ``--discharge-and-deradicalize``). Help is
    disabled so the parser can be used as a parent parser.

    Returns
    -------
    argparse.ArgumentParser

    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('input', help='input file (SDF, SMILES)')
    parser.add_argument('output', help='output file path')
    parser.add_argument('--max-rings', '-m', type=int, default=10, metavar='',
                        help='ignore molecules with # rings > (default: 10)')
    parser.add_argument('--flatten-isotopes', '-i', action='store_true',
                        help='remove specific isotopes when initializing the scaffold')
    # NOTE: this flag uses underscores, unlike its hyphenated siblings. The
    # spelling is kept as-is because changing it would break existing
    # command lines.
    parser.add_argument('--keep_largest_fragment', '-f', action='store_true',
                        help='when encountering molecules containing disconnected fragments initialize'
                             ' the scaffold from only the largest disconnected fragment')
    parser.add_argument('--discharge-and-deradicalize', '-d', action='store_true',
                        help='remove charges and radicals when initializing the scaffold')
    return parser
97 |
98 |
def scaffoldgraph_args():
    """Defines CLI utility for ScaffoldGraph.

    Builds the top-level ``scaffoldgraph`` parser with one sub-command per
    operation: ``network``, ``hiers``, ``tree``, ``select`` and
    ``aggregate``. The chosen sub-command name is stored in ``command`` and
    each sub-parser sets a ``func`` default naming the function that
    implements it.

    Returns
    -------
    argparse.ArgumentParser
        The configured parser.

    """
    parser = argparse.ArgumentParser('scaffoldgraph', description=desc)
    parser.add_argument('--version', action='version', version=__version__)
    subparsers = parser.add_subparsers(title='command', dest='command')

    # network (generate a scaffold network from a SMILES or SDF file)
    network_parser = subparsers.add_parser('network', description='Generate a scaffold network',
                                           parents=[generate_parent_parser(), parent_parser()])
    network_parser.set_defaults(func=generate_cli)

    # HierS (generate a HierS scaffold network from a SMILES or SDF file)
    hiers_parser = subparsers.add_parser('hiers', description='Generate a HierS type scaffold network',
                                         parents=[generate_parent_parser(), parent_parser()])
    hiers_parser.set_defaults(func=generate_cli)

    # tree (generate a scaffold tree from a SMILES or SDF file)
    tree_parser = subparsers.add_parser('tree', description='Generate a scaffold tree',
                                        parents=[generate_parent_parser(), parent_parser()])
    tree_parser.add_argument('-r', '--ruleset', help='supply a ruleset file for custom scaffold prioritization',
                             metavar='')
    tree_parser.set_defaults(func=generate_cli)

    # select (select a subgraph of a scaffold graph using a molecular query)
    select_parser = subparsers.add_parser('select', description='Select subgraph from a molecular query.',
                                          parents=[parent_parser()])
    select_parser.add_argument('input_graph', help='input aggregated graph file')
    select_parser.add_argument('input_query', help='input query file (SDF, SMILES)')
    select_parser.add_argument('output', help='output file path')
    select_parser.add_argument('-d', '--sdf', help='write output as an SDF', action='store_true')
    select_parser.set_defaults(func=select_cli)

    # aggregate (Aggregate intermediate scaffold graph files (TSV or PICKLE))
    aggregate_parser = subparsers.add_parser('aggregate', description='Aggregate scaffold graphs',
                                             parents=[parent_parser()])
    aggregate_parser.add_argument('input', nargs='+', help='input file(s) (TSV)')
    aggregate_parser.add_argument('output', help='output file path')
    # NOTE: the two help strings below continue across a source line with a
    # backslash inside the literal, so the text on the continuation line
    # (including any leading whitespace) is part of the string.
    aggregate_parser.add_argument('-m', '--map-mols', help='map molecule IDs from input to scaffold IDs, \
and place result in given file', metavar='')
    aggregate_parser.add_argument('-a', '--map-annotations', help='map scaffold IDs to annotations, \
and place result in given file', metavar='')
    aggregate_parser.add_argument('-d', '--sdf', help='write output as an SDF', action='store_true')
    aggregate_parser.set_defaults(func=aggregate_cli)

    return parser
144 |
145 |
def scaffoldgraph_main():
    """Run the CLI utility for ScaffoldGraph.

    Parses the command line (showing help when no arguments are given),
    configures logging, and runs the selected command. Errors raised by
    the command are logged at CRITICAL level instead of propagating, and
    an exit message is always logged.
    """
    parser = scaffoldgraph_args()
    cli_args = sys.argv[1:]
    # No arguments at all: behave as if '-h' had been supplied.
    args = parser.parse_args(None if cli_args else ['-h'])
    configure_logger(args.verbosity)
    try:
        args.func(args)
    except FileNotFoundError as err:
        logger.critical(f'Input file not found: {err.filename}')
    except (ValueError, RuntimeError, MemoryError) as err:
        logger.critical(err)
    except KeyboardInterrupt:
        logger.critical('scaffoldgraph process interrupted from keyboard')
    except Exception as err:
        logger.critical(f'Unknown error: {err}')
    finally:
        logger.info('Exiting scaffoldgraph...')
167 |
--------------------------------------------------------------------------------
/scaffoldgraph/tree.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.tree
3 | """
4 |
5 | from rdkit.Chem import rdmolops
6 |
7 | from .core import ScaffoldGraph, Scaffold, MurckoRingFragmenter
8 | from .core.fragment import get_murcko_scaffold
9 | from .prioritization import original_ruleset
10 | from .utils import suppress_rdlogger
11 |
12 |
class ScaffoldTree(ScaffoldGraph):
    """
    Class representing a scaffold tree.

    Explore scaffold-space through the iterative removal of the least-characteristic
    ring from a molecular scaffold. The output is a tree of molecular scaffolds.

    Examples
    --------
    Create a ScaffoldTree from a SMILES file.

    >>> import scaffoldgraph as sg
    >>> tree = sg.ScaffoldTree.from_smiles_file('my_file.smi', progress=True)
    >>> tree.num_scaffold_nodes
    75

    Create a ScaffoldTree from an SDF.

    >>> tree = sg.ScaffoldTree.from_sdf('my_file.sdf', progress=True)

    If the SDF is zipped:

    >>> tree = sg.ScaffoldTree.from_sdf('my_file.sdf.gz', zipped=True)

    Get scaffold nodes:

    >>> list(tree.get_scaffold_nodes())
    ['O=C(OCOC(=O)c1cccc2ncn(Cc3ccccc3)c12)OC1CCCCC1',
     'O=C(OCOC(=O)c1cccc2nc[nH]c12)OC1CCCCC1',
     ...]

    Include node attributes:

    >>> list(tree.get_scaffold_nodes(data=True))
    [('O=C(OCOC(=O)c1cccc2ncn(Cc3ccccc3)c12)OC1CCCCC1', {'type': 'scaffold', 'hierarchy': 4}),
     ('O=C(OCOC(=O)c1cccc2nc[nH]c12)OC1CCCCC1', {'type': 'scaffold', 'hierarchy': 3}),
     ...]

    Get molecule nodes (use data=True to get attributes):

    >>> list(tree.get_molecule_nodes())
    ['DB00006',
     'DB00007',
     'DB00014',
     ...]

    References
    ----------
    .. [1] Schuffenhauer, A., Ertl, P., Roggo, S., Wetzel, S., Koch, M. A., and Waldmann, H. (2007).
           The scaffold tree visualization of the scaffold universe by hierarchical scaffold classification.
           Journal of Chemical Information and Modeling, 47(1), 47–58. PMID: 17238248.

    See Also
    --------
    ScaffoldGraph
    ScaffoldNetwork
    HierS

    """
    def __init__(self, graph=None, prioritization_rules=None, **kwargs):
        """Initialize a ScaffoldTree.

        Parameters
        ----------
        graph : input graph, optional
            Data to initialize graph. If None (default) an empty
            graph is created. The data can be any format that is supported
            by the ``to_networkx_graph()`` function, currently including
            edge list, dict of dicts, dict of lists, NetworkX graph,
            NumPy matrix or 2d ndarray, SciPy sparse matrix,
            or PyGraphviz graph. This argument is passed to the networkx
            DiGraph constructor.
        prioritization_rules : ScaffoldRuleSet
            Ruleset for prioritizing parent scaffolds during tree
            construction. If not supplied (or falsy) the original
            ruleset is used.

        """
        # NOTE(review): ``**kwargs`` is accepted but not forwarded to the
        # base class here -- confirm this is intentional.
        # The positional True passed to MurckoRingFragmenter is presumably
        # ``use_scheme_4`` (as spelled out in ``tree_frags_from_mol``) --
        # TODO confirm.
        super(ScaffoldTree, self).__init__(graph, MurckoRingFragmenter(True), 'tree')
        self.rules = prioritization_rules if prioritization_rules else original_ruleset

    def _hierarchy_constructor(self, child):
        """Recursively add the prioritized parent of ``child`` to the tree.

        The child is fragmented, a single parent is selected with the
        prioritization rules, and a parent -> child edge annotated with the
        selecting rule is added. The method recurses into a newly added
        parent while that parent has more than one ring.
        """
        # Drop falsy fragments returned by the fragmenter.
        parents = [p for p in self.fragmenter.fragment(child) if p]
        if not parents:
            return
        parent = self.rules(child, parents)
        if not parent:
            return
        deletion_rule = parent.prioritization_rule
        if parent in self.nodes:
            # Parent already present: only the edge is added, no recursion.
            self.add_scaffold_edge(parent, child, rule=deletion_rule)
        else:
            self.add_scaffold_node(parent)
            self.add_scaffold_edge(parent, child, rule=deletion_rule)
            if parent.rings.count > 1:
                self._hierarchy_constructor(parent)

    @property
    def prioritization_rules(self):
        """ScaffoldRuleSet : Return the prioritization ruleset used."""
        return self.rules
113 |
114 |
@suppress_rdlogger()
def tree_frags_from_mol(mol, prioritization_rules=None):
    """Build the scaffold tree branch of a single molecule without networkx.

    Starting from the Murcko scaffold of ``mol`` (stereochemistry removed),
    the scaffold is repeatedly fragmented and a single parent is chosen with
    the prioritization rules, until no parents are produced or the chosen
    parent has at most one ring.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        rdkit molecule for processing.
    prioritization_rules : ScaffoldRuleSet, optional
        rules for prioritizing parent scaffolds. If
        not supplied the original rules are used.
        The default is None.

    Returns
    -------
    parents
        An ordered list of rdkit Mols representing a scaffold tree,
        beginning with the scaffold of the input molecule.

    Examples
    --------
    Generating scaffold tree fragments:

    >>> from rdkit import Chem
    >>> smiles = 'Cc1[nH]cnc1Cn1cccc(-c2ccccc2O)c1=O'
    >>> molecule = Chem.MolFromSmiles(smiles)
    >>> frags = tree_frags_from_mol(molecule)

    """
    scaffold = Scaffold(get_murcko_scaffold(mol))
    rdmolops.RemoveStereochemistry(scaffold.mol)
    fragmenter = MurckoRingFragmenter(use_scheme_4=True)
    rules = prioritization_rules if prioritization_rules else original_ruleset

    lineage = [scaffold]
    current = scaffold
    while True:
        candidates = [p for p in fragmenter.fragment(current) if p]
        if not candidates:
            break
        current = rules(current, candidates)
        lineage.append(current)
        # Stop once the selected parent no longer has more than one ring.
        if not current.rings.count > 1:
            break

    return [entry.mol for entry in lineage]
161 |
--------------------------------------------------------------------------------
/scaffoldgraph/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.utils
3 | """
4 |
5 | from .misc import canonize_smiles, summary
6 | from .aggregate import aggregate
7 | from .logging import suppress_rdlogger
8 |
9 | __all__ = [
10 | 'canonize_smiles',
11 | 'aggregate',
12 | 'summary',
13 | 'suppress_rdlogger',
14 | ]
15 |
--------------------------------------------------------------------------------
/scaffoldgraph/utils/aggregate.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.utils.aggregate
3 |
4 | Functions for aggregating ScaffoldGraphs.
5 | """
6 |
7 | import warnings
8 |
9 | from networkx import Graph, compose
10 |
11 |
def aggregate(list_of_graphs):
    """Aggregate a list of graphs into one graph object.

    Graphs within the list must be a subclass of a networkx Graph object.

    Parameters
    ----------
    list_of_graphs : list
        A list of scaffold graphs (ScaffoldGraph) for aggregation.

    Returns
    -------
    ScaffoldGraph
        The graph obtained by successively composing every entry with
        ``networkx.compose``. A single-entry list returns that entry
        unchanged.

    Raises
    ------
    ValueError:
        raises if an empty list is provided, instead of a list of graphs.
    ValueError:
        raises if any entry in the list (including the first) is not a
        subclass of nx.Graph.

    Examples
    --------
    >>> g1 = sg.ScaffoldNetwork.from_sdf('g1.sdf')
    >>> print(g1.number_of_nodes())
    100
    >>> g2 = sg.ScaffoldNetwork.from_sdf('g2.sdf')
    >>> print(g2.number_of_nodes())
    50
    >>> g3 = sg.ScaffoldNetwork.from_sdf('g3.sdf')
    >>> print(g3.number_of_nodes())
    200
    >>> list_of_graphs = [g1, g2, g3]
    >>> aggregated_graph = aggregate(list_of_graphs)
    >>> print(aggregated_graph.number_of_nodes())
    325

    Notes
    -----
    The user is not prevented from aggregating multiple graphs of
    differing types, although this may lead to undesired behaviour.
    (i.e. aggregating a tree and a network is possible); a warning is
    emitted in that case.

    Based on nx.compose_all:
    .. _Compose-all: https://networkx.github.io/documentation/stable/reference/algorithms/
                     generated/networkx.algorithms.operators.all.compose_all.html

    """
    if not list_of_graphs:
        raise ValueError('Cannot apply aggregate to an empty list')
    graphs = iter(list_of_graphs)
    C = next(graphs)
    # The first entry seeds the result, so it must be validated too;
    # otherwise a lone non-graph entry would be returned unchanged.
    if not isinstance(C, Graph):
        raise ValueError('Can only aggregate graph type objects')
    graph_type = type(C)
    for H in graphs:
        if not isinstance(H, Graph):
            raise ValueError('Can only aggregate graph type objects')
        if graph_type != type(H):
            warnings.warn('Attempting to aggregate graphs of different types '
                          f'({graph_type} & {type(H)}) '
                          'could result in undesired behaviour')
        C = compose(C, H)
    return C
76 |
--------------------------------------------------------------------------------
/scaffoldgraph/utils/bipartite.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.utils.bipartite
3 |
4 | Defines functions for creating bipartite graphs from scaffold graphs.
5 | """
6 |
7 | from scaffoldgraph.core import ScaffoldGraph
8 |
9 |
def make_bipartite_graph(graph):
    """Collapse a scaffold hierarchy into a bipartite representation.

    Scaffold --> Molecule

    Every scaffold node is linked directly to the molecules reported
    for it by ``graph.get_molecules_for_scaffold``; scaffold-to-scaffold
    edges are not copied. The output is an instance of the input
    graph's class.

    Parameters
    ----------
    graph : sg.core.ScaffoldGraph
        A scaffold graph template for producing a bipartite
        graph.

    Returns
    -------
    sg.core.ScaffoldGraph
        Bipartite scaffoldgraph where the scaffold hierarchy
        has been collapsed.

    Raises
    ------
    ValueError
        If `graph` is not a ScaffoldGraph.

    """
    if not isinstance(graph, ScaffoldGraph):
        raise ValueError(f'{graph} must be a ScaffoldGraph')
    bipartite = type(graph)(None)
    for scaffold, scaffold_attrs in graph.get_scaffold_nodes(True):
        bipartite.add_node(scaffold, **scaffold_attrs)
        members = graph.get_molecules_for_scaffold(scaffold, True)
        for molecule, molecule_attrs in members:
            # A molecule may belong to several scaffolds; add it once.
            if not bipartite.molecule_in_graph(molecule):
                bipartite.add_node(molecule, **molecule_attrs)
            bipartite.add_edge(scaffold, molecule)
    return bipartite
41 |
--------------------------------------------------------------------------------
/scaffoldgraph/utils/cache.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.utils.cache
3 | """
4 |
5 | from collections import OrderedDict
6 | from operator import eq as _eq
7 |
8 |
class Cache(OrderedDict):
    """A basic implementation of an LRU cache using OrderedDict.

    Reading or writing a key marks it as most recently used; when the
    size limit is exceeded the least recently used key is evicted.

    Adapted (slightly) from the collections ``OrderedDict``
    documentation.

    .. _collections OrderedDict Documentation:
        https://docs.python.org/3/library/collections.html#collections.OrderedDict

    """
    def __init__(self, maxsize=None, *args, **kwargs):
        """
        Parameters
        ----------
        maxsize : int, None, optional
            Set the maximum size of the cache, if None (or any other
            falsy value, e.g. 0) the cache has no size limitation.
            The default is None.
        *args
            Variable length argument list.
            Passed to OrderedDict.
        **kwargs
            Arbitrary keyword arguments.
            Passed to OrderedDict.

        """
        # Set before delegating: __setitem__ reads `maxsize`.
        self._maxsize = maxsize
        super().__init__(*args, **kwargs)

    @property
    def maxsize(self):
        """int: The maximum size of the cache."""
        return self._maxsize

    def __getitem__(self, key):
        value = super().__getitem__(key)
        self.move_to_end(key)  # a read refreshes recency
        return value

    def __setitem__(self, key, value):
        # Overwriting an existing key must refresh its recency as well,
        # otherwise a just-updated entry could be the next one evicted.
        if key in self:
            self.move_to_end(key)
        super().__setitem__(key, value)
        if self.maxsize and len(self) > self.maxsize:
            oldest = next(iter(self))
            del self[oldest]

    def __eq__(self, other):
        # Order-sensitive against another Cache, plain dict equality
        # against anything else.
        if isinstance(other, Cache):
            return dict.__eq__(self, other) and all(map(_eq, self, other))
        return dict.__eq__(self, other)

    def __repr__(self):
        return '{}(maxsize={})'.format(
            self.__class__.__name__,
            self.maxsize
        )
63 |
--------------------------------------------------------------------------------
/scaffoldgraph/utils/logging.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.utils.logging
3 |
4 | Utilities for dealing with rdkit logging.
5 | """
6 |
7 | import functools
8 | import warnings
9 |
10 | from rdkit import __version__ as rdversion
11 | from rdkit import RDLogger, rdBase
12 |
13 |
# Status used by reset_rdlogger(): every rdApp channel enabled.
DEFAULT_RDLOGGER_STATUS = {
    'rdApp.debug': True,
    'rdApp.info': True,
    'rdApp.warning': True,
    'rdApp.error': True
}

# Status used by set_rdlogger_quiet(): only the error channel stays on.
QUIET_RDLOGGER_STATUS = {
    'rdApp.debug': False,
    'rdApp.info': False,
    'rdApp.warning': False,
    'rdApp.error': True
}

# Returned when the logger status cannot be queried. This is an alias of
# (the same dict object as) DEFAULT_RDLOGGER_STATUS, not a copy.
UNKNOWN_RDLOGGER_STATUS = DEFAULT_RDLOGGER_STATUS
29 |
30 |
def get_rdlogger_status():
    """Return the status of the rdlogger.

    Returns
    -------
    dict
        Mapping of log level name (e.g. ``'rdApp.info'``) to a bool
        that is True when the level is reported as ``'enabled'``.
        For rdkit versions older than '2020.09.01' a warning is issued
        and a copy of ``UNKNOWN_RDLOGGER_STATUS`` is returned.

    """
    if rdversion < '2020.09.01':
        warnings.warn('Failed to get status of rdlogger')
        # Hand out a copy so callers cannot mutate the shared default.
        return dict(UNKNOWN_RDLOGGER_STATUS)
    status_dict = {}
    for entry in rdBase.LogStatus().splitlines():
        # partition() tolerates blank or unexpected lines instead of
        # failing on tuple unpacking.
        level, separator, state = entry.partition(':')
        if not separator:
            continue
        status_dict[level.strip()] = state.strip() == 'enabled'
    return status_dict
41 |
42 |
def set_rdlogger_status(status_dict):
    """Set the state of the rdlogger.

    Parameters
    ----------
    status_dict : dict
        Mapping of log level name to state. A level is enabled only
        when its state is exactly ``True``; any other value disables it.

    """
    for level, state in status_dict.items():
        toggle = rdBase.EnableLog if state is True else rdBase.DisableLog
        toggle(level)
50 |
51 |
def set_rdlogger_quiet():
    """Set the rdlogger to quiet status.

    Applies ``QUIET_RDLOGGER_STATUS`` through ``set_rdlogger_status``.
    """
    set_rdlogger_status(QUIET_RDLOGGER_STATUS)
55 |
56 |
def reset_rdlogger():
    """Reset the rdlogger status to default.

    Applies ``DEFAULT_RDLOGGER_STATUS`` through ``set_rdlogger_status``.
    """
    set_rdlogger_status(DEFAULT_RDLOGGER_STATUS)
60 |
61 |
def set_rdlogger_debug_status(status):
    """Enable (``True``) or disable the ``rdApp.debug`` log channel."""
    debug_only = {'rdApp.debug': status}
    set_rdlogger_status(debug_only)
65 |
66 |
def set_rdlogger_info_status(status):
    """Enable (``True``) or disable the ``rdApp.info`` log channel."""
    info_only = {'rdApp.info': status}
    set_rdlogger_status(info_only)
70 |
71 |
def set_rdlogger_warning_status(status):
    """Enable (``True``) or disable the ``rdApp.warning`` log channel."""
    warning_only = {'rdApp.warning': status}
    set_rdlogger_status(warning_only)
75 |
76 |
def set_rdlogger_error_status(status):
    """Enable (``True``) or disable the ``rdApp.error`` log channel."""
    error_only = {'rdApp.error': status}
    set_rdlogger_status(error_only)
80 |
81 |
def suppress_rdlogger(
    suppress_info=True,
    suppress_warning=True,
    suppress_error=True,
    suppress_debug=True
):
    """Decorator for controlling the output level of the rdkit logger.

    Useful for suppressing the output of noisy functions related to
    the rdkit logger. The previous status of the logger is restored
    after the function has been executed, whether it returns normally
    or raises.

    Parameters
    ----------
    suppress_info : bool, optional
        Suppress logs from rdApp.info. The default is True.
    suppress_warning : bool, optional
        Suppress logs from rdApp.warning. The default is True.
    suppress_error : bool, optional
        Suppress logs from rdApp.error. The default is True.
    suppress_debug : bool, optional
        Suppress logs from rdApp.debug. The default is True.

    Returns
    -------
    decorator : function

    Notes
    -----
    The prior state of the logger can only be restored in the newer
    versions of rdkit (>= '2020.09.01'). In previous versions the
    logger status is reset to its default state.

    """
    altered_status = {
        'rdApp.info': not suppress_info,
        'rdApp.warning': not suppress_warning,
        'rdApp.error': not suppress_error,
        'rdApp.debug': not suppress_debug,
    }

    def decorator(func):
        @functools.wraps(func)
        def wrap_suppress(*args, **kwargs):
            # rdkit version compatibility: older versions cannot report
            # the current status, so fall back to the defaults (checked
            # here to avoid the warning get_rdlogger_status would emit).
            if rdversion >= '2020.09.01':
                prior_status = get_rdlogger_status()
            else:
                prior_status = DEFAULT_RDLOGGER_STATUS
            set_rdlogger_status(altered_status)
            try:
                return func(*args, **kwargs)
            finally:
                # Runs on success and on any exception, so the logger
                # never stays in the altered state.
                set_rdlogger_status(prior_status)
        return wrap_suppress
    return decorator
139 |
--------------------------------------------------------------------------------
/scaffoldgraph/utils/misc.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.utils.misc
3 |
4 | Defines miscellaneous functions used within scaffoldgraph.
5 | """
6 |
7 | import networkx as nx
8 |
9 | from rdkit import Chem
10 |
11 |
def canonize_smiles(smiles, failsafe=True):
    """Return the canonical form of a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string to canonize.
    failsafe : bool
        If True and the SMILES fails to parse, the input SMILES is
        returned unchanged. If False the unparsed result (None) is
        still handed to ``Chem.MolToSmiles``.

    Returns
    -------
    str
        The canonical SMILES representation.

    """
    parsed = Chem.MolFromSmiles(smiles)
    if failsafe and parsed is None:
        return smiles
    return Chem.MolToSmiles(parsed)
34 |
35 |
def _networkx_summary(graph, n=None):
    """Private: summarize a plain NetworkX graph or one of its nodes.

    Delegates to ``nx.info`` when the installed networkx still provides
    it; otherwise builds a comparable summary locally.
    """
    legacy_info = getattr(nx, 'info', None)
    if legacy_info is not None:
        return legacy_info(graph, n)
    if n is None:
        return str(graph)
    if n not in graph:
        raise ValueError(f"node {n} not in graph")
    info = f"Node {n} has the following properties:\n"
    info += f"Degree: {graph.degree(n)}\n"
    info += "Neighbors: "
    info += " ".join(str(nbr) for nbr in graph.neighbors(n))
    return info


def summary(graph, n=None):
    """Return a summary of information for the graph or a single node n.

    Parameters
    ----------
    graph : sg.core.ScaffoldGraph or NetworkX graph
        A graph object which can either be a ScaffoldGraph graph or a NetworkX
        graph object.
    n : any hashable, optional
        A node in the graph. The default is None.

    Returns
    -------
    info : str
        A string containing the summary.

    Raises
    ------
    ValueError
        If n is not in the graph.

    """
    # Imported locally, as in the original code, rather than at module level.
    from scaffoldgraph.core import ScaffoldGraph
    if not isinstance(graph, ScaffoldGraph):
        # nx.info is not available in every networkx release, so the
        # helper falls back to a local implementation when it is absent.
        return _networkx_summary(graph, n)
    info = ""
    if n is None:
        type_name = [type(graph).__name__]
        info += f"Type: {','.join(type_name)}\n"
        info += f"Number of molecule nodes: {graph.num_molecule_nodes}\n"
        info += f"Number of scaffold nodes: {graph.num_scaffold_nodes}\n"
        info += f"Number of edges: {graph.number_of_edges()}\n"
        info += f"Max hierarchy: {graph.max_hierarchy()}\n"
        info += f"Min hierarchy: {graph.min_hierarchy()}\n"
    else:
        if graph.molecule_in_graph(n):
            info += f"Node {n} has the following properties:\n"
            info += "Type: molecule\n"
            info += f"SMILES: {graph.nodes[n].get('smiles')}\n"
            info += f"Degree: {graph.degree(n)}\n"
            info += "Parent scaffolds: "
            info += " ".join(str(s) for s in graph.predecessors(n))
        elif graph.scaffold_in_graph(n):
            # Scaffold nodes are looked up by their canonical SMILES.
            key = canonize_smiles(n)
            info += f"Node {key} has the following properties:\n"
            info += "Type: scaffold\n"
            info += f"Hierarchy: {graph.nodes[key].get('hierarchy')}\n"
            info += f"Degree: {graph.degree(key)}\n"
            info += "Parent scaffolds: "
            info += " ".join(str(s) for s in graph.get_parent_scaffolds(key, max_levels=1))
            info += "\n"
            info += "Child scaffolds: "
            info += " ".join(str(s) for s in graph.get_child_scaffolds(key, max_levels=1))
            info += "\n"
            info += "Child molecules: "
            info += " ".join(
                str(s) for s in graph.successors(key) if graph.nodes[s].get('type') == 'molecule'
            )
        else:
            raise ValueError(f"node {n} not in graph")
    return info
97 |
--------------------------------------------------------------------------------
/scaffoldgraph/utils/subset.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.utils.subset
3 |
4 | """
5 | from networkx.algorithms.traversal import bfs_tree
6 | from collections import defaultdict
7 |
8 |
def split_graph_by_molecule_attribute(graph, key, default=None):
    """Split a scaffold graph into subgraphs based on unique molecule attributes.

    Molecule nodes are first grouped by the value of the requested
    attribute. For each group a subgraph is then built from the nodes
    reached by a reversed breadth-first search started at every
    molecule in the group.

    The returned subgraphs are graph views: changes to the graph
    structure are ruled out by the view, but changes to node attributes
    are reflected in the original graph. To prevent this behaviour use:
    subgraph.copy()

    Parameters
    ----------
    graph : sg.core.ScaffoldGraph
        A scaffold graph to split.
    key : str
        The key for the molecule node attribute used to split the graph
        into subgraphs.
    default : value, bool, optional
        Value used for nodes that don't have the requested attribute.

    Returns
    -------
    splits : dict
        A dictionary with keys representing unique node attributes and
        values representing the constructed subgraphs.

    Raises
    ------
    ValueError
        If `key` is a boolean.

    """
    if isinstance(key, bool):
        raise ValueError('Attribute key cannot be a boolean type')
    splits = defaultdict(list)
    for molecule, value in graph.get_molecule_nodes(key, default):
        splits[value].append(molecule)
    splits.default_factory = None  # Not really required
    for value in list(splits):
        reached = set()
        for molecule in splits[value]:
            reached.update(bfs_tree(graph, molecule, reverse=True))
        # Replace the list of molecules by the subgraph view.
        splits[value] = graph.subgraph(reached)
    return splits
51 |
--------------------------------------------------------------------------------
/scaffoldgraph/vis/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.vis
3 | """
4 |
5 | from .utils import (
6 | embed_node_mol_images,
7 | remove_node_mol_images,
8 | color_scaffold_nodes_by_attribute,
9 | color_molecule_nodes_by_attribute,
10 | add_root_node,
11 | remove_root_node,
12 | )
13 |
14 | __all__ = [
15 | 'embed_node_mol_images',
16 | 'remove_node_mol_images',
17 | 'color_scaffold_nodes_by_attribute',
18 | 'color_molecule_nodes_by_attribute',
19 | 'add_root_node',
20 | 'remove_root_node',
21 | ]
22 |
--------------------------------------------------------------------------------
/scaffoldgraph/vis/base.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.vis.base
3 | """
4 |
5 | import networkx as nx
6 |
7 | from abc import ABC
8 |
9 | from scaffoldgraph.core import ScaffoldGraph
10 | from scaffoldgraph.utils import canonize_smiles
11 |
12 | from .utils import remove_node_mol_images
13 |
14 |
class Visualizer(ABC):
    """Base class for ScaffoldGraph visualizers.

    A Visualizer contains functions for creating visualizations
    of ScaffoldGraphs.

    See Also
    --------
    scaffoldgraph.vis.notebook.cytoscape.CytoscapeVisualizer

    """
    def __init__(self, graph, requires_tree=False, refresh_images=False):
        """Initialize the visualizer.

        Parameters
        ----------
        graph : ScaffoldGraph
            ScaffoldGraph to visualize
        requires_tree : bool, optional
            Whether the visualizer requires a tree
            structure to create a visualization.
        refresh_images: bool, optional
            If True remove all embedded images from the
            input graph and regenerate when required.
            The default is False.

        Raises
        ------
        ValueError
            If `graph` fails validation (see ``_validate_graph``).

        """
        # Flags must be set first: _validate_graph reads both of them.
        self._requires_tree = requires_tree
        self._refresh = refresh_images
        self._graph = self._validate_graph(graph)

    @property
    def graph(self):
        """ScaffoldGraph: return the graph associated with the visualizer."""
        return self._graph

    @graph.setter
    def graph(self, graph):
        self._graph = self._validate_graph(graph)

    def _validate_graph(self, graph):
        """Private: Validate a graph is suitable for visualizer.

        Raises ValueError when `graph` is not a ScaffoldGraph, or when a
        tree is required and the graph is neither a tree nor a forest.
        When image refreshing is enabled, embedded node images are
        removed from `graph` in place. Returns the validated graph.
        """
        if not isinstance(graph, ScaffoldGraph):
            raise ValueError(
                f'{graph} must be a subclass of ScaffoldGraph'
            )
        if self._requires_tree:
            # Reject only graphs that are neither a tree nor a forest.
            if not (nx.is_tree(graph) or nx.is_forest(graph)):
                msg = '{} requires a tree/forest structured graph'
                raise ValueError(msg.format(self.__class__.__name__))
        if self._refresh is True:
            remove_node_mol_images(graph)
        return graph

    def _subgraph_from_mol(self, molecule):
        """Private: Select a subgraph starting at a molecule node.

        Parameters
        ----------
        molecule : str
            Molecule node identifier.

        Returns
        -------
        subgraph : ScaffoldGraph
            A subgraph starting at `molecule`.

        Raises
        ------
        ValueError
            If `molecule` is not a molecule node of the graph.

        """
        G = self._graph
        if not G.molecule_in_graph(molecule):
            raise ValueError(f'molecule: {molecule} not in graph {G}')
        scaffolds = list(G.get_scaffolds_for_molecule(molecule))
        subgraph = G.subgraph([molecule] + scaffolds)
        return subgraph

    def _subgraph_from_scf(self, scaffold, traversal):
        """Private: Select a subgraph starting at a scaffold node.

        Parameters
        ----------
        scaffold : str
            Scaffold node identifier.
        traversal : str {'parent', 'child', 'bidirectional'}
            The direction of traversal to create the subgraph.
            If 'bidirectional' both directions are considered.

        Returns
        -------
        subgraph : ScaffoldGraph
            A subgraph starting at `scaffold`.

        Raises
        ------
        ValueError
            If the scaffold is not in the graph or `traversal` is not
            one of the supported values.

        """
        G = self._graph
        query = canonize_smiles(scaffold)
        if not G.scaffold_in_graph(query):
            raise ValueError(f'scaffold: {query} not in graph {G}')
        if traversal == 'parent':
            nodes = list(G.get_parent_scaffolds(query))
        elif traversal == 'child':
            nodes = list(nx.descendants(G, query))
        elif traversal == 'bidirectional':
            # Copy before extending so the list handed back by the
            # graph is never mutated in place.
            nodes = list(G.get_parent_scaffolds(query))
            nodes.extend(nx.descendants(G, query))
        else:
            msg = 'traversal must be one of {child, parent, bidirectional}'
            raise ValueError(msg)
        subgraph = G.subgraph([query] + nodes)
        return subgraph

    def __repr__(self):
        return '<{_cls} at {address}>'.format(
            _cls=self.__class__.__name__,
            address=hex(id(self))
        )
130 |
--------------------------------------------------------------------------------
/scaffoldgraph/vis/notebook/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.vis.notebook
3 | """
4 |
--------------------------------------------------------------------------------
/scaffoldgraph/vis/notebook/cytoscape.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.vis.notebook.cytoscape
3 | """
4 |
5 | import warnings
6 | import json
7 |
8 | from pathlib import Path
9 |
10 | from scaffoldgraph.vis.base import Visualizer
11 | from scaffoldgraph.vis.utils import embed_node_mol_images
12 |
13 | try:
14 | import ipycytoscape as cy
15 | _cytoscape_available = True
16 | except ImportError:
17 | _cytoscape_available = False
18 | warnings.warn(
19 | 'ipycytoscape could not be imported and is required '
20 | 'for generating cytoscape based visualizations.'
21 | )
22 |
# Location of the bundled JSON style sheet, resolved relative to this
# module; read when no explicit style is supplied to the visualizer.
DEFAULT_STYLE = Path(__file__).parent.resolve() / 'resources' / 'cytoscape.json'

# Base layout options; CytoscapeVisualizer._draw copies these and then
# overlays any caller-supplied layout_kwargs.
DEFAULT_LAYOUT = {
    'name': 'dagre',
    'nodeSpacing': 50,
    'edgeLengthVal': 50
}
30 |
31 |
def read_style_file(path):
    """Read a JSON style file (cytoscape).

    Parameters
    ----------
    path : str or path-like
        File path to style file.

    Returns
    -------
    style
        The decoded JSON content of the file (a list of style dicts
        for the bundled default style).

    """
    # Decode explicitly as UTF-8 rather than relying on the
    # platform/locale default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        style = json.load(f)
    return style
49 |
50 |
class CytoscapeVisualizer(Visualizer):
    """Class for creating visualizations using ipycytoscape.

    This visualizer renders scaffoldgraphs as interactive
    networks using cytoscape. The visualizer is flexible,
    allowing users to customize the output by defining the
    style and layout options.

    Notes
    -----
    The visualizer is intended to be used within a jupyter notebook.

    ipycytoscape must be installed to use this feature.

    The code for this feature was inspired/adapted from:
    .. _Blogpost: https://iwatobipen.wordpress.com/2020/03/30/draw-scaffold-tree
                  -as-network-with-molecular-image-rdkit-cytoscape/

    Examples
    --------
    Create a visualization for a whole graph.

    >>> from scaffoldgraph.vis.notebook import cytoscape
    >>> import scaffoldgraph as sg
    >>> tree = sg.ScaffoldTree.from_sdf('my_sdf.sdf')
    >>> visualizer = cytoscape.CytoscapeVisualizer(tree)
    >>> visualizer.draw()

    Use a different layout.

    >>> visualizer.draw(layout_kwargs={'name': 'breadthfirst'})

    Draw a subgraph starting from a molecule node.

    >>> visualizer.draw_for_molecule('CHEMBL1997663')

    Draw a subgraph starting from a scaffold node.

    >>> visualizer.draw_for_scaffold('c1ccc(CNc2ccccc2)cc1')

    """
    def __init__(
        self,
        graph,
        style=None,
        refresh_images=False,
        rd_draw_options=None,
        mol_img_size=(350, 300),
    ):
        """Initialize the cytoscape visualizer.

        Parameters
        ----------
        graph : ScaffoldGraph
            A ScaffoldGraph object to draw.
        style : list, optional
            A list of dicts specifying the style to pass
            to the cytoscape widget, for more details
            see the ipycytoscape documentation. If None
            (or empty) the default style file is loaded;
            it can be updated after initialization.
        refresh_images: bool, optional
            If True remove all embedded images from the
            input graph and regenerate when required.
            The default is False.
        rd_draw_options: rdkit.Chem.Draw.rdMolDraw2D.MolDrawOptions, optional
            Specify options for molecule drawing. Requires a
            `MolDrawOptions` object or `None`.
            The default is None.
        mol_img_size: tuple, optional
            Specify the size of the node images. Format is
            `(width, height)`. Note that if changed from
            default the style will have to be updated.
            The default is `(350, 300)`.

        """
        super().__init__(
            graph,
            requires_tree=False,
            refresh_images=refresh_images,
        )
        self._drawopts = rd_draw_options
        self._img_size = mol_img_size
        self._style = style or read_style_file(DEFAULT_STYLE)

    @property
    def style(self):
        """list : returns the cytoscape style associated."""
        return self._style

    @style.setter
    def style(self, style):
        # NOTE(review): validation via assert is skipped under ``-O``;
        # kept as-is so the raised exception type does not change.
        assert isinstance(style, list),\
            'style must be a list object'
        self._style = style

    @staticmethod
    def _cytoscape_validate():
        """Private: raise RuntimeError if ipycytoscape failed to import."""
        if not _cytoscape_available:
            raise RuntimeError('ipycytoscape is not available')

    def _draw(self, subgraph, layout_kwargs):
        """Private: create the cytoscape widget from a subgraph.

        Node images are embedded into `subgraph` before the widget is
        built; the layout is DEFAULT_LAYOUT overlaid with
        `layout_kwargs` when supplied.
        """
        if subgraph.number_of_nodes() >= 100:
            warnings.warn('graphs with > 100 nodes may be slow to render')
        embed_node_mol_images(
            subgraph,
            size=self._img_size,
            draw_options=self._drawopts,
        )
        layout_options = dict(DEFAULT_LAYOUT)
        if layout_kwargs:
            layout_options.update(layout_kwargs)
        cyto_widget = cy.CytoscapeWidget()
        cyto_widget.set_style(self._style)
        cyto_widget.set_layout(**layout_options)
        cyto_widget.graph.add_graph_from_networkx(
            subgraph, directed=True
        )
        return cyto_widget

    def draw(self, layout_kwargs=None):
        """Draw the entire scaffoldgraph.

        Parameters
        ----------
        layout_kwargs : dict, optional
            arguments to pass to the CytoscapeWidget.set_layout
            function.

        Returns
        -------
        widget : ipycytoscape.CytoscapeWidget

        Raises
        ------
        RuntimeError
            If ipycytoscape is not available.

        """
        self._cytoscape_validate()
        return self._draw(self._graph, layout_kwargs)

    def draw_for_molecule(self, molecule_id, layout_kwargs=None):
        """Draw subgraph starting from a query molecule.

        Parameters
        ----------
        molecule_id : str
            Molecule node identifier.
        layout_kwargs : dict, optional
            arguments to pass to the CytoscapeWidget.set_layout
            function.

        Returns
        -------
        widget : ipycytoscape.CytoscapeWidget

        Raises
        ------
        RuntimeError
            If ipycytoscape is not available.

        """
        self._cytoscape_validate()
        molecule_subgraph = self._subgraph_from_mol(molecule_id)
        return self._draw(molecule_subgraph, layout_kwargs)

    def draw_for_scaffold(self, scaffold_id, traversal='child', layout_kwargs=None):
        """Draw subgraph starting from a query scaffold.

        Parameters
        ----------
        scaffold_id : str
            Scaffold node identifier.
        traversal : str {'parent', 'child', 'bidirectional'}
            The direction of traversal to create the subgraph.
            If 'bidirectional' both directions are considered.
            The default is 'child'.
        layout_kwargs : dict, optional
            arguments to pass to the CytoscapeWidget.set_layout
            function.

        Returns
        -------
        widget : ipycytoscape.CytoscapeWidget

        Raises
        ------
        RuntimeError
            If ipycytoscape is not available.

        """
        self._cytoscape_validate()
        scaffold_subgraph = self._subgraph_from_scf(scaffold_id, traversal)
        return self._draw(scaffold_subgraph, layout_kwargs)
233 |
--------------------------------------------------------------------------------
/scaffoldgraph/vis/notebook/resources/cytoscape.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "selector": "node",
4 | "style": {
5 | "text-valign": "top",
6 | "color": "#2E56AD",
7 | "font-family": "arial",
8 | "font-size": "40px",
9 | "shape": "rectangle",
10 | "width": 350,
11 | "height": 300,
12 | "background-color": "#EEEEEE",
13 | "background-fit": "contain",
14 | "background-image": "data(img)",
15 | "text-outline-width": 2,
16 | "text-outline-color": "white"
17 | }
18 | },
19 | {
20 | "selector": "node[type='molecule']",
21 | "style": {
22 | "content": "data(id)",
23 | "background-color": "#EEEEEE"
24 | }
25 | },
26 | {
27 | "selector": "edge",
28 | "style": {
29 | "width": 6,
30 | "line-color": "#9dbaea",
31 | "target-arrow-shape": "triangle",
32 | "target-arrow-color": "#9dbaea",
33 | "curve-style": "bezier"
34 | }
35 | },
36 | {
37 | "selector": "edge[rule]",
38 | "style": {
39 | "content": "data(rule)",
40 | "color": "#2E56AD",
41 | "font-family": "arial",
42 | "font-size": "30px",
43 | "text-outline-width": 2,
44 | "text-outline-color": "white"
45 | }
46 | }
47 | ]
--------------------------------------------------------------------------------
/scaffoldgraph/vis/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph.vis.utils
3 | """
4 |
5 | import matplotlib.pyplot as plt
6 | import matplotlib as mpl
7 |
8 | from rdkit.Chem.Draw import rdMolDraw2D
9 | from rdkit import Chem
10 |
11 | from loguru import logger
12 | from urllib import parse
13 |
14 |
def _maybe_kekulize(mol):
    """Private: try to kekulize `mol` in place and return it.

    A kekulization failure is logged as a warning (with the molecule's
    SMILES) and the molecule is returned regardless.
    """
    try:
        Chem.Kekulize(mol)
    except Chem.KekulizeException:
        failed_smiles = Chem.MolToSmiles(mol)
        logger.warning(f'Failed to kekulize mol: {failed_smiles}')
    return mol
23 |
24 |
def smiles_to_svg(smiles, size=(350, 300), draw_options=None):
    """Render a SMILES string as SVG text.

    Parameters
    ----------
    smiles : str
        SMILES to create SVG image.
    size : tuple, optional
        Size of image, the default is (350, 300).
    draw_options : rdMolDraw2D.MolDrawOptions
        Options to pass to the drawer.

    Returns
    -------
    svg : str
        SVG text for molecule, or an empty string when the SMILES
        cannot be parsed.

    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return ''
    parsed = _maybe_kekulize(parsed)
    svg_drawer = rdMolDraw2D.MolDraw2DSVG(*size)
    if draw_options:
        svg_drawer.SetDrawOptions(draw_options)
    rdMolDraw2D.PrepareAndDrawMolecule(svg_drawer, parsed)
    svg_drawer.FinishDrawing()
    svg_text = svg_drawer.GetDrawingText()
    return svg_text
53 |
54 |
def smiles_to_image(smiles, size=(350, 300), draw_options=None):
    """Create an SVG image from a SMILES string (ready for HTML).

    Parameters
    ----------
    smiles : str
        SMILES to create SVG image.
    size : tuple, optional
        Size of image, the default is (350, 300).
    draw_options : rdMolDraw2D.MolDrawOptions
        Options to pass to the drawer.

    Returns
    -------
    svg : str
        A ``data:image/svg+xml`` URI holding the percent-encoded SVG
        text produced by ``smiles_to_svg``.

    """
    encoded_svg = parse.quote(smiles_to_svg(smiles, size, draw_options), safe='')
    return 'data:image/svg+xml;charset=utf-8,' + encoded_svg
77 |
78 |
def embed_node_mol_images(graph, size=(350, 300), draw_options=None, skip_existing=True):
    """Embed molecule images into a graph (in-place).

    Every node receives an 'img' attribute. Nodes of type 'scaffold'
    are drawn from the node key, nodes of type 'molecule' from their
    'smiles' attribute, and every other node gets an empty string.

    Parameters
    ----------
    graph : ScaffoldGraph
        Input ScaffoldGraph.
    size : tuple, optional
        Size of image, the default is (350, 300).
    draw_options : rdMolDraw2D.MolDrawOptions, optional
        Options to pass to the drawer.
    skip_existing : bool
        Skip a node if it already holds a non-empty 'img' attribute.
        The default is True.

    """
    for node, attrs in graph.nodes(data=True):
        if skip_existing and attrs.get('img'):
            continue

        node_type = attrs.get('type')
        if node_type == 'scaffold':
            smiles = node  # scaffold nodes are keyed by their SMILES
        elif node_type == 'molecule':
            smiles = attrs['smiles']
        else:
            attrs['img'] = ''
            continue
        attrs['img'] = smiles_to_image(smiles, size, draw_options)
108 |
109 |
def remove_node_mol_images(graph):
    """Remove embedded images from a graph (in-place).

    Deletes the 'img' attribute from every node that has one.

    Parameters
    ----------
    graph : ScaffoldGraph
        Input ScaffoldGraph

    """
    for _, attrs in graph.nodes(data=True):
        attrs.pop('img', None)
121 |
122 |
def rgba_to_hex(scalar_mappable, value):
    """str: hex colour (alpha dropped) that ``scalar_mappable`` assigns to ``value``."""
    return mpl.colors.to_hex(scalar_mappable.to_rgba(value), keep_alpha=False)
128 |
129 |
def cmap_to_scalar_mappable(cmap, vmin, vmax):
    """Wrap a matplotlib Colormap in a ScalarMappable.

    Parameters
    ----------
    cmap : matplotlib.colors.Colormap
        Colormap to wrap.
    vmin : float
        Minimum value for normalization.
    vmax : float
        Maximum value for normalization.

    Returns
    -------
    matplotlib.cm.ScalarMappable
        Mappable using ``Normalize(vmin, vmax)`` and ``cmap``.

    """
    return mpl.cm.ScalarMappable(
        norm=mpl.colors.Normalize(vmin, vmax),
        cmap=cmap,
    )
149 |
150 |
def color_nodes_by_attribute(graph, attribute, cmap, node_type, label='color'):
    """
    Add an attribute to nodes in a ScaffoldGraph containing a color hex code,
    calculated from a particular node attribute and a matplotlib cmap. The
    operation is performed in-place.

    Can be used for adding colors to ScaffoldGraph visualizations.

    Nodes of the requested type whose attribute is missing (or None) receive
    the neutral color '#EEEEEE'. A value of zero is a valid value and is
    colored through the cmap like any other number. If the graph holds no
    node of the requested type nothing is changed.

    Parameters
    ----------
    graph : ScaffoldGraph
        Input ScaffoldGraph
    attribute : str
        Key for the attribute from which to calculate a color.
    cmap : str or matplotlib.colors.Colormap
        A matplotlib cmap or name of a cmap e.g. 'BuPu' for
        calculating a nodes colour.
    node_type : str
        The type of node to process e.g. 'scaffold' / 'molecule'
    label : str, optional
        The attribute label to use for storing the color.
        The default is 'color'.

    Raises
    ------
    ValueError
        If cmap is neither a string nor a matplotlib Colormap.

    """
    # Cmap may be a string or a Colormap
    if isinstance(cmap, str):
        cmap = plt.get_cmap(cmap)
    elif not isinstance(cmap, mpl.colors.Colormap):
        raise ValueError('cmap must be a string or a matplotlib Colormap')

    neutral = '#EEEEEE'  # Neutral default for nodes without a value.

    # Materialise once: the nodes are needed both for the value range
    # and for the assignment pass.
    nodes = list(graph._get_nodes_with_type(node_type, True, None))

    # Get attribute range, ignoring nodes without a value.
    values = [
        float(data[attribute]) for _, data in nodes
        if data.get(attribute, None) is not None
    ]
    scalar_mappable = None
    if values:
        scalar_mappable = cmap_to_scalar_mappable(cmap, min(values), max(values))

    # Assign colors to each node.
    for _, data in nodes:
        raw = data.get(attribute, None)
        if raw is None:
            # Only a missing value is neutral; 0 / 0.0 must be colored.
            data[label] = neutral
        else:
            # scalar_mappable is never None here: a non-None value
            # implies that `values` was non-empty.
            data[label] = rgba_to_hex(scalar_mappable, float(raw))
197 |
198 |
def color_scaffold_nodes_by_attribute(graph, attribute, cmap, label='color'):
    """
    Add an attribute to scaffold nodes in a ScaffoldGraph containing a color hex code,
    calculated from a particular scaffold node attribute and a matplotlib cmap. The
    operation is performed in-place.

    Convenience wrapper around ``color_nodes_by_attribute`` with the node
    type fixed to 'scaffold'.

    Parameters
    ----------
    graph : ScaffoldGraph
        Input ScaffoldGraph
    attribute : str
        Key for the attribute from which to calculate a color.
    cmap : str or matplotlib.colors.Colormap
        A matplotlib cmap or name of a cmap e.g. 'BuPu' for
        calculating a nodes colour.
    label : str, optional
        The attribute label to use for storing the color.
        The default is 'color'.

    See Also
    --------
    color_molecule_nodes_by_attribute

    """
    color_nodes_by_attribute(
        graph,
        attribute,
        cmap,
        node_type='scaffold',
        label=label,
    )
226 |
227 |
def color_molecule_nodes_by_attribute(graph, attribute, cmap, label='color'):
    """
    Add an attribute to molecule nodes in a ScaffoldGraph containing a color hex code,
    calculated from a particular molecule node attribute and a matplotlib cmap. The
    operation is performed in-place.

    Convenience wrapper around ``color_nodes_by_attribute`` with the node
    type fixed to 'molecule'.

    Parameters
    ----------
    graph : ScaffoldGraph
        Input ScaffoldGraph
    attribute : str
        Key for the attribute from which to calculate a color.
    cmap : str or matplotlib.colors.Colormap
        A matplotlib cmap or name of a cmap e.g. 'BuPu' for
        calculating a nodes colour.
    label : str, optional
        The attribute label to use for storing the color.
        The default is 'color'.

    See Also
    --------
    color_scaffold_nodes_by_attribute

    """
    color_nodes_by_attribute(
        graph,
        attribute,
        cmap,
        node_type='molecule',
        label=label,
    )
255 |
256 |
def add_root_node(graph):
    """Add a root node to a scaffoldgraph (in-place).

    A node 'root' (type='root', hierarchy=0) is added and connected,
    with edges of type 2, to every other node that has no incoming edge.

    Parameters
    ----------
    graph : ScaffoldGraph
        Graph to add root node.

    """
    root = 'root'
    graph.add_node(root, type='root', hierarchy=0)

    # Collect the edges first so the graph is not mutated while its
    # in-degree view is being read.
    new_edges = []
    for node, degree in graph.in_degree:
        if degree == 0 and node != root:
            new_edges.append((root, node))
    graph.add_edges_from(new_edges, type=2)
269 |
270 |
def remove_root_node(graph):
    """Remove the 'root' node from a scaffoldgraph, if it is present.

    Parameters
    ----------
    graph : ScaffoldGraph
        Graph from which to remove root node.

    """
    if 'root' not in graph:
        return
    graph.remove_node('root')
282 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file=README.md
3 | license_files=LICENSE
4 |
5 | [options.extras_require]
6 | vis=ipycytoscape>=1.2.0
7 | rdkit=rdkit-pypi
8 |
9 | [aliases]
10 | test=pytest
11 |
12 | [tool:pytest]
13 | markers =
14 | slow: marks tests as slow (deselect with '-m "not slow"')
15 | serial
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""
scaffoldgraph setup.py
"""

from setuptools import setup, find_packages
from pathlib import Path
import re

url = 'https://github.com/UCLCheminformatics/scaffoldgraph'

# Implicit concatenation rather than a backslash continuation, so that the
# indentation of the second source line can never leak into the text.
description = (
    'ScaffoldGraph is an open-source cheminformatics library, built using RDKit and '
    'NetworkX for generating scaffold networks and scaffold trees.'
)

root = Path(__file__).parent.resolve()

# The version is read from the package source rather than imported.
init_path = root / 'scaffoldgraph' / '__init__.py'
with init_path.open('r', encoding='utf8') as f:
    __version__ = re.findall("__version__ = '(.*)'", f.read())[0]

# Build install_requires from requirements.txt, leaving out rdkit.
# Blank lines and comments are skipped, and the rdkit entry is matched by
# distribution name so a pinned ('rdkit>=...') or absent entry does not
# abort the build the way list.remove('rdkit') would.
requires_path = root / 'requirements.txt'
with requires_path.open('r', encoding='utf8') as f:
    requirement_lines = [line.strip() for line in f]

_name_pattern = re.compile(r'[A-Za-z0-9][A-Za-z0-9._-]*')
install_requires = []
for requirement in requirement_lines:
    if not requirement or requirement.startswith('#'):
        continue
    name_match = _name_pattern.match(requirement)
    if name_match and name_match.group(0).lower() == 'rdkit':
        continue
    install_requires.append(requirement)

readme_path = root / 'README.md'
with readme_path.open('r', encoding='utf-8') as f:
    long_description = f.read()

setup_requires = ['pytest-runner']
tests_require = ['pytest', 'pytest-cov']

entry_points = {
    'console_scripts': [
        'scaffoldgraph = scaffoldgraph.scripts.run:scaffoldgraph_main',
    ]
}

setup(
    name='ScaffoldGraph',
    version=__version__,
    description=description,
    long_description=long_description,
    long_description_content_type='text/markdown',
    author='Oliver Scott',
    author_email='oliver.scott.17@ucl.ac.uk',
    url=url,
    download_url='{}/archive/{}.tar.gz'.format(url, __version__),
    license='MIT',
    keywords=[
        'rdkit',
        'networkx',
        'cheminformatics',
        'scaffolds',
        'scaffold tree',
        'scaffold network'
    ],
    classifiers=[
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Operating System :: OS Independent',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
        'Topic :: Scientific/Engineering :: Chemistry',
    ],
    python_requires='>=3.6',
    install_requires=install_requires,
    setup_requires=setup_requires,
    tests_require=tests_require,
    entry_points=entry_points,
    include_package_data=True,
    packages=find_packages(
        exclude=['tests.*', 'tests']
    ),
)
75 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests
3 | """
4 |
5 | import os
6 |
7 | import pytest
8 | from rdkit import Chem
9 | from rdkit import rdBase
10 |
11 | rdBase.DisableLog('rdApp.*')
12 |
13 |
def test_root_dir():
    """Return the absolute path of the directory that contains this file."""
    this_file = os.path.abspath(__file__)
    return os.path.dirname(this_file)
16 |
17 |
@pytest.fixture(name='sdf_file')
def mock_sdf(tmp_path):
    """Write a two-molecule SDF file below ``tmp_path`` and return its path as a str."""
    d = tmp_path / "test_data"
    # tmp_path is the same directory for every fixture requested by one
    # test, so "test_data" may already have been created by another fixture.
    d.mkdir(exist_ok=True)
    p = d / "test.sdf"
    writer = Chem.SDWriter(str(p))
    writer.write(Chem.MolFromSmiles('CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3'))
    writer.write(Chem.MolFromSmiles('CCC1=CC2=C(S1)N(C(=O)CN=C2C3=CC=CC=C3Cl)C'))
    writer.close()
    return str(p)
28 |
29 |
@pytest.fixture(name='sdf_file_2')
def mock_sdf_2(tmp_path):
    """Write a second two-molecule SDF file below ``tmp_path`` and return its path as a str."""
    d = tmp_path / "test_data"
    # The directory may already exist when another fixture of the same test
    # created it. exist_ok=True tolerates that, but still raises if the path
    # exists and is not a directory.
    d.mkdir(exist_ok=True)
    p = d / "test_2.sdf"
    writer = Chem.SDWriter(str(p))
    writer.write(Chem.MolFromSmiles('C1C(=O)NC2=C(C=C(C=C2)Br)C(=N1)C3=CC=CC=N3'))
    writer.write(Chem.MolFromSmiles('CC1=NN(C2=C1C(=NCC(=O)N2C)C3=CC=CC=C3F)C'))
    writer.close()
    return str(p)
43 |
44 |
@pytest.fixture(name='smiles_file')
def mock_smiles_file(tmp_path):
    """Write a two-molecule SMILES file below ``tmp_path`` and return its path as a str."""
    d = tmp_path / "test_data"
    # tmp_path is the same directory for every fixture requested by one
    # test, so "test_data" may already have been created by another fixture.
    d.mkdir(exist_ok=True)
    p = d / "test.smi"
    writer = Chem.SmilesWriter(str(p))
    writer.write(Chem.MolFromSmiles('CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3'))
    writer.write(Chem.MolFromSmiles('CCC1=CC2=C(S1)N(C(=O)CN=C2C3=CC=CC=C3Cl)C'))
    writer.close()
    return str(p)
55 |
56 |
def canon(smiles):
    """Round-trip a SMILES string through RDKit so that expected values follow
    whatever canonical form the installed RDKit produces."""
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol)
60 |
--------------------------------------------------------------------------------
/tests/analysis/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.analysis
3 | """
4 |
--------------------------------------------------------------------------------
/tests/analysis/test_enrichment.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.analysis.test_enrichment
3 | """
4 |
5 | import pytest
6 | import networkx as nx
7 | import random
8 |
9 | from scaffoldgraph.analysis import calc_scaffold_enrichment, compound_set_enrichment
10 | from ..test_network import long_test_network
11 |
12 |
def test_enrichment(network):
    """Run scaffold enrichment with continuous and with binary activities."""
    # Continuous activities: default mode of calc_scaffold_enrichment,
    # then mode='ks' for compound_set_enrichment.
    ks_data = {
        molecule: {'activity': random.random()}
        for molecule in network.get_molecule_nodes()
    }
    nx.set_node_attributes(network, ks_data)
    enrichment = calc_scaffold_enrichment(network, 'activity')
    scaffold, stats = list(enrichment.items())[0]
    assert scaffold in network
    for key in ('pval', 'dmax', '_total'):
        assert key in stats
    assert type(stats['dmax']) is float
    assert type(stats['_total']) is int
    compound_set_enrichment(network, 'activity', mode='ks')

    # Binary activities: mode='b'.
    binom_data = {
        molecule: {'activity': random.choice([0, 1])}
        for molecule in network.get_molecule_nodes()
    }
    nx.set_node_attributes(network, binom_data)
    enrichment = calc_scaffold_enrichment(network, 'activity', mode='b')
    scaffold, stats = list(enrichment.items())[0]
    assert scaffold in network
    for key in ('pval', '_active', '_total'):
        assert key in stats
    assert type(stats['_active']) is int
    assert type(stats['_total']) is int
    compound_set_enrichment(network, 'activity', mode='b')

    # An unknown mode must be rejected.
    with pytest.raises(ValueError):
        compound_set_enrichment(network, 'activity', mode='not a mode')
45 |
--------------------------------------------------------------------------------
/tests/analysis/test_general.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.analysis.test_general
3 | """
4 |
5 | from scaffoldgraph.analysis import get_singleton_scaffolds, get_virtual_scaffolds
6 | from ..test_network import long_test_network
7 |
8 |
def test_get_virtual_scaffolds(network):
    """The test network is expected to yield 19 virtual scaffolds."""
    assert len(get_virtual_scaffolds(network)) == 19
12 |
13 |
def test_get_singleton_scaffolds(network):
    """The test network is expected to yield 3 singleton scaffolds."""
    assert len(get_singleton_scaffolds(network)) == 3
17 |
--------------------------------------------------------------------------------
/tests/analysis/test_representation.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.analysis.test_representation
3 | """
4 |
5 | from scaffoldgraph.analysis import calc_average_pairwise_similarity, get_over_represented_scaffold_classes
6 | from ..test_network import long_test_network
7 |
8 |
def test_representation(network):
    """Check average pairwise similarity output and the over-representation filter."""
    aps = calc_average_pairwise_similarity(network)
    scaffold, stats = list(aps.items())[0]
    assert scaffold in network
    assert 'members' in stats
    assert 'aps' in stats
    assert type(stats['aps']) is float
    assert type(stats['members']) is int

    # Every returned class must meet the requested threshold.
    over = get_over_represented_scaffold_classes(network, 0.80)
    assert all(item[1]['aps'] >= 0.80 for item in over)
    over = get_over_represented_scaffold_classes(network, 0.75, skip_aps=True)
    assert all(item[1]['aps'] >= 0.75 for item in over)
23 |
--------------------------------------------------------------------------------
/tests/core/test_fragment.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.core.test_fragment
3 | """
4 |
5 | import pytest
6 | from rdkit import Chem
7 |
8 | from scaffoldgraph.core.fragment import *
9 |
10 |
@pytest.fixture(name='mol')
def test_molecule():
    """Fixture 'mol': the RDKit molecule shared by the fragment tests."""
    return Chem.MolFromSmiles('CCN1CCc2c(C1)sc(NC(=O)Nc3ccc(Cl)cc3)c2C#N')
15 |
16 |
def canon(smiles):
    """Round-trip a SMILES string through RDKit so that expected values follow
    whatever canonical form the installed RDKit produces."""
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol)
20 |
21 |
def test_murcko(mol):
    """get_murcko_scaffold under increasingly aggressive option sets."""
    cases = [
        ({'generic': False}, 'O=C(Nc1ccccc1)Nc1cc2c(s1)CNCC2'),
        ({'generic': True}, 'CC(CC1CCCCC1)CC1CC2CCCCC2C1'),
        ({'generic': True, 'remove_exocyclic': True}, 'C1CCC(CCCC2CC3CCCCC3C2)CC1'),
        (
            {'generic': True, 'remove_exocyclic': True, 'collapse_linkers': True},
            'C1CCC(C2CC3CCCCC3C2)CC1',
        ),
    ]
    for options, expected in cases:
        murcko = get_murcko_scaffold(mol, **options)
        assert Chem.MolToSmiles(murcko) == canon(expected)
31 |
32 |
def test_annotation(mol):
    """The annotated Murcko scaffold of ``mol`` carries three '*' atoms."""
    annotation = Chem.MolToSmiles(get_annotated_murcko_scaffold(mol))
    # Strip the numeric labels so only the bare '*' symbols are counted.
    for labelled in ('1*', '2*', '3*'):
        annotation = annotation.replace(labelled, '*')
    assert annotation.count('*') == 3
39 |
40 |
def test_murcko_all(mol):
    """Fragment counts of get_all_murcko_fragments with and without breaking fused rings."""
    for break_fused, expected_count in ((True, 6), (False, 3)):
        frags = get_all_murcko_fragments(mol, break_fused_rings=break_fused)
        assert len(frags) == expected_count
46 |
47 |
def test_murcko_next(mol):
    """Next-level fragments of the Murcko scaffold for both break_fused_rings settings."""
    scf = get_murcko_scaffold(mol)

    broken = get_next_murcko_fragments(scf, break_fused_rings=True)
    broken = {Chem.MolToSmiles(frag) for frag in broken}
    assert len(broken) == 2

    intact = get_next_murcko_fragments(scf, break_fused_rings=False)
    intact = {Chem.MolToSmiles(frag) for frag in intact}
    assert len(intact) == 2

    # Exactly one fragment is common to both settings.
    assert len(broken & intact) == 1
57 |
58 |
def test_collect_linker_atoms():
    """collect_linker_atoms from atom 0 of nonylbenzene, with the flag on and off."""
    mol = Chem.MolFromSmiles('CCCCCCCCCc1ccccc1')
    remove_atoms = set()
    for flag, expected_removed in ((True, 9), (False, 8)):
        remove_atoms.clear()  # start every run from an empty collection
        a = collect_linker_atoms(mol.GetAtomWithIdx(0), remove_atoms, flag)
        assert len(a) == 1
        assert len(remove_atoms) == expected_removed
69 |
70 |
def test_remove_exocylic_attachments(mol):
    """remove_exocyclic_attachments yields the expected structure for ``mol``."""
    expected = canon('CCN1CCc2c(sc(NCNc3ccc(Cl)cc3)c2C#N)C1')
    assert Chem.MolToSmiles(remove_exocyclic_attachments(mol)) == expected
74 |
75 |
def test_genericise_scaffold(mol):
    """genericise_scaffold yields the expected structure for ``mol``."""
    expected = canon('CCC1CCC2C(C1)CC(CC(C)CC1CCC(C)CC1)C2CC')
    assert Chem.MolToSmiles(genericise_scaffold(mol)) == expected
79 |
80 |
def test_linker_collapse(mol):
    """_collapse_linker_bonds with and without retaining hetero atoms."""
    # Private helper, so it is imported explicitly rather than via '*'.
    from scaffoldgraph.core.fragment import _collapse_linker_bonds

    cases = (
        (False, 'CN1CCc2c(sc(C(=O)c3ccc(Cl)cc3)c2N)C1'),
        (True, 'CN1CCc2c(sc(NC(=O)Nc3ccc(Cl)cc3)c2N)C1'),
    )
    for retain_het, expected in cases:
        collapsed = _collapse_linker_bonds(mol, retain_het=retain_het)
        assert Chem.MolToSmiles(collapsed) == canon(expected)
87 |
88 |
def test_ring_toplogy():
    """Ring topology scaffold of the figure 1 example."""
    # Replicate figure 1 from paper: Scaffold Topologies II: Analysis of Chemical Databases
    mol = Chem.MolFromSmiles('CC(C)c1ccc(C)cc1OC(=O)C2(CCC3C2)C(=C)C3(C)C')
    observed = Chem.MolToSmiles(get_ring_toplogy_scaffold(mol))
    assert observed == canon('C1CC1C12CC1C2')
95 |
96 |
def _test_topology_helper(smiles, expected):
    """Assert that the ring topology scaffold of ``smiles`` equals canonical ``expected``."""
    scaffold = get_ring_toplogy_scaffold(Chem.MolFromSmiles(smiles))
    assert Chem.MolToSmiles(scaffold) == canon(expected)
101 |
102 |
def test_ring_topology_extended():
    """Ring topology scaffolds for the first ten figure 2 examples."""
    # Replicate figure 2 from paper: Scaffold Topologies II: Analysis of Chemical Databases
    # Figure 2a: topologies, Figure 2b: Examples of molecules with each topology
    # First 10 examples
    examples = [
        ('NCC1(CC(=O)O)CCCCC1', 'C1CC1'),  # 1
        ('CNS(=O)(=O)Cc1ccc2[nH]cc(CCN(C)C)c2c1', 'C1C2CC12'),  # 2
        ('COc1ccc(C(CN(C)C)C2(O)CCCCC2)cc1', 'C1CC1C1CC1'),  # 3
        ('[NH3+][Pt]1([NH3+])OC(=O)C2(CC2)C(=O)O1', 'C1CC12CC2'),  # 4
        ('CC1CCc2cc(F)cc3c(=O)c(C(=O)O)cn1c23', 'C12C3C1C23'),  # 5
        ('NC(=O)N1C2C=CC=CC2CC(=O)C2C=CC=CC21', 'C1C2C1C1CC21'),  # 6
        ('Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(C)(=O)=O)cc2)cc1', 'C1CC1C1CC1C1CC1'),  # 7
        ('COc1ccc2[nH]c(S(=O)Cc3ncc(C)c(OC)c3C)nc2c1', 'C1CC1C1C2CC21'),  # 8
        ('O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1', 'C1CC1C(C1CC1)C1CC1'),  # 9
        ('O=C1O[Pt]2(NC3CCCCC3N2)OC1=O', 'C1C2C1C21CC1'),  # 10
    ]
    for smiles, expected in examples:
        _test_topology_helper(smiles, expected)
117 |
118 |
def _test_connectivity_helper(smiles, expected, single=False):
    """Assert that the ring connectivity scaffold of ``smiles`` equals canonical ``expected``.

    ``single`` is forwarded as the second argument of
    get_ring_connectivity_scaffold.
    """
    scaffold = get_ring_connectivity_scaffold(Chem.MolFromSmiles(smiles), single)
    assert Chem.MolToSmiles(scaffold) == canon(expected)
123 |
124 |
def test_ring_connectivity():
    """Ring connectivity scaffolds for four molecules, with single=False and single=True."""
    # Test cases from Figure 1 from the paper: Scaffold analysis of pubchem database
    # as a background for hierarchial scaffold-based visualisation.
    # NOTE(review): the last atom of db00741 is written '[c@]' (lowercase,
    # i.e. aromatic) — this looks like a typo for '[C@]'; confirm.
    db00741 = '[H][C@@]12CC[C@](O)(C(=O)CO)[C@@]1(C)C[C@H](O)[C@@]1([H])[C@@]2([H])CCC2=CC(=O)CC[c@]12C'
    cases = [
        ('CC(C)CC1=CC=C(C=C1)C(C)C(O)=O', '*', False),
        ('CC(C)CC1=CC=C(C=C1)C(C)C(O)=O', '*', True),
        ('CC1=CC(NS(=O)(=O)C2=CC=C(N)C=C2)=NO1', '**', False),
        ('CC1=CC(NS(=O)(=O)C2=CC=C(N)C=C2)=NO1', '**', True),
        ('CN1C2=C(C=C(Cl)C=C2)C(=NCC1=O)C1=CC=CC=C1', '**=*', False),
        ('CN1C2=C(C=C(Cl)C=C2)C(=NCC1=O)C1=CC=CC=C1', '***', True),
        (db00741, '*=*=*=*', False),
        (db00741, '****', True),
    ]
    for smiles, expected, single in cases:
        _test_connectivity_helper(smiles, expected, single)
137 |
--------------------------------------------------------------------------------
/tests/core/test_graph.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.core.test_graph
3 | """
4 |
5 | from rdkit import Chem
6 |
7 | from scaffoldgraph.core.graph import *
8 |
9 |
def test_init_molecule_name():
    """init_molecule_name gives an unnamed molecule a non-empty '_Name' property."""
    mol = Chem.MolFromSmiles('CCC')
    assert bool(mol.HasProp('_Name')) is False
    init_molecule_name(mol)
    assert mol.HasProp('_Name')
    name = mol.GetProp('_Name')
    assert name is not None
    assert name != ''
17 |
18 |
def test_graph_subclass():
    """ScaffoldGraph derives from both nx.DiGraph and ABC."""
    for base in (nx.DiGraph, ABC):
        assert issubclass(ScaffoldGraph, base)
22 |
--------------------------------------------------------------------------------
/tests/core/test_scaffold.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.core.test_scaffold
3 | """
4 |
5 | import pytest
6 | import pickle
7 |
8 | from rdkit import Chem
9 |
10 | from scaffoldgraph.core.scaffold import *
11 |
12 |
@pytest.fixture(name='scaffold')
def basic_scaffold():
    """Fixture 'scaffold': a Scaffold built from a Murcko scaffold SMILES."""
    # murcko scaffold smiles
    return Scaffold(Chem.MolFromSmiles('O=C(Nc1ccccc1)Nc1cc2c(s1)CNCC2'))
19 |
20 |
def test_new():
    """Constructing a Scaffold from None gives None."""
    assert Scaffold(None) is None
24 |
25 |
def test_pickle(scaffold):
    """A Scaffold survives a pickle round trip with its main attributes populated."""
    restored = pickle.loads(pickle.dumps(scaffold))
    for attr in ('atoms', 'bonds', 'rings', 'ring_systems', 'smiles'):
        assert getattr(restored, attr)
34 |
35 |
def test_smiles(scaffold):
    """The SMILES string drives identifier, equality, str() and hash() of a Scaffold."""
    smi = scaffold.smiles
    assert smi == 'O=C(Nc1ccccc1)Nc1cc2c(s1)CNCC2'
    assert scaffold.get_canonical_identifier() == smi
    assert scaffold == Scaffold(Chem.MolFromSmiles(smi))
    assert scaffold == smi
    assert str(scaffold) == smi
    assert hash(scaffold) == hash(smi)
43 |
44 |
def test_name(scaffold):
    """Name defaults to None and is settable; also checks repr() and truthiness."""
    assert scaffold.name is None
    scaffold.name = 'TEST'
    assert scaffold.name == 'TEST'
    # NOTE(review): the template is an empty string, so format() discards its
    # argument and this compares repr() with ''. The literal looks like it
    # lost an angle-bracketed template — confirm against Scaffold.__repr__.
    expected_repr = ''.format(hex(id(scaffold)))
    assert repr(scaffold) == expected_repr
    assert bool(scaffold) is True
51 |
52 |
def test_atoms(scaffold):
    """scaffold.atoms holds one Chem.Atom per atom of the underlying molecule."""
    atoms = scaffold.atoms
    assert len(atoms) == scaffold.mol.GetNumAtoms()
    assert all(isinstance(atom, Chem.Atom) for atom in atoms)
57 |
58 |
def test_bonds(scaffold):
    """scaffold.bonds holds one Chem.Bond per bond of the underlying molecule."""
    bonds = scaffold.bonds
    assert len(bonds) == scaffold.mol.GetNumBonds()
    assert all(isinstance(bond, Chem.Bond) for bond in bonds)
63 |
64 |
def test_rings(scaffold):
    """Check the RingStack returned by scaffold.rings and one of its Ring members."""
    rings = scaffold.rings

    # Container level.
    assert isinstance(rings, RingStack)
    for attr in ('owner', 'info', 'atom_rings', 'bond_rings'):
        assert hasattr(rings, attr)
    assert rings.count == 3 and len(rings) == 3
    # NOTE(review): the template is an empty string, so format() discards its
    # argument; the literal looks like it lost an angle-bracketed template —
    # confirm against the class __repr__ (same for the Ring check below).
    assert repr(rings) == ''.format(hex(id(rings)))
    assert isinstance(rings[0], Ring)
    assert len(list(rings)) == 3
    assert isinstance(rings.info, Chem.RingInfo)
    assert len(rings.atom_rings) == 3 and len(rings.bond_rings) == 3

    # Single ring.
    ring = rings[1]
    for attr in ('owner', 'aix', 'bix'):
        assert hasattr(ring, attr)
    assert all(isinstance(bond, Chem.Bond) for bond in ring.bonds)
    assert all(isinstance(atom, Chem.Atom) for atom in ring.atoms)
    assert isinstance(ring.size, int)
    assert len(ring) == len(ring.atoms)
    assert repr(ring) == ''.format(hex(id(ring)))
    assert len(ring.get_attachment_points()) == 1
    assert ring.is_exocyclic_attachment(ring.atoms[0]) is False
    assert ring.get_ring_system().size == 9
    assert len(rings.to_list()) == 3

    # Slicing.
    first_two = rings[0:2]
    assert len(first_two) == 2
    assert first_two[0] != first_two[1]
94 |
95 |
def test_ring_systems(scaffold):
    """Check the RingSystemStack returned by scaffold.ring_systems and one RingSystem."""
    systems = scaffold.ring_systems

    # Container level.
    assert isinstance(systems, RingSystemStack)
    for attr in ('owner', 'ring_indexes', 'atom_rings', 'bond_rings'):
        assert hasattr(systems, attr)
    assert systems.count == 2 and len(systems) == 2
    # NOTE(review): the template is an empty string, so format() discards its
    # argument; the literal looks like it lost an angle-bracketed template —
    # confirm against the class __repr__ (same for the RingSystem check below).
    assert repr(systems) == ''.format(hex(id(systems)))
    assert isinstance(systems[0], RingSystem)
    assert len(list(systems)) == 2
    assert len(systems.atom_rings) == 2 and len(systems.bond_rings) == 2

    # Single ring system.
    system = systems[1]
    for attr in ('owner', 'aix', 'bix', 'rix'):
        assert hasattr(system, attr)
    assert all(isinstance(bond, Chem.Bond) for bond in system.bonds)
    assert all(isinstance(atom, Chem.Atom) for atom in system.atoms)
    assert isinstance(system.size, int)
    assert len(system) == len(system.atoms)
    assert repr(system) == ''.format(hex(id(system)))
    assert isinstance(system[0], Ring)
    assert len(list(system.get_rings())) == 2
    assert len(system.get_attachment_points()) == 1
    assert system.is_exocyclic_attachment(system.atoms[0]) is False

    # Slicing, of the stack and of a ring system.
    tail = systems[1:]
    assert len(tail) == 1
    assert isinstance(tail[0][0], Ring)
    assert len(tail[0][0:2]) == 2
126 |
127 |
--------------------------------------------------------------------------------
/tests/data/test_smiles.smi:
--------------------------------------------------------------------------------
1 | CN(C)Cc1n-2c(nn1)CN=C(c1ccccc1)c1cc(Cl)ccc12 Adinazolam
2 | Cc1n-2c(nn1)CN=C(c1ccccc1)c1cc(Cl)ccc12 Alprazolam
3 | Brc1cc2c(cc1)NC(=O)CN=C2c1ncccc1 Bromazepam
4 | CNC1=Nc2c(cc(Cl)cc2)C(c2ccccc2)=[N+]([O-])C1 Chlordiazepoxide
5 | CN1c2ccc(Cl)cc2N(c2ccccc2)C(=O)CC1=O Clobazam
6 | [O-][N+](=O)c1cc2c(cc1)NC(=O)CN=C2c1c(Cl)cccc1 Clonazepam
7 | OC(=O)C1N=C(c2ccccc2)c2cc(Cl)ccc2NC1=O Clorazepate
8 | Clc1cc2c(cc1)NC(=O)CN=C2c1c(Cl)cccc1 Delorazepam
9 | [O-][N+]1=C(c2ccccc2)c2cc(Cl)ccc2NC(=O)C1 Demoxepam
10 | Clc1cc2c(cc1)NC(=O)CC(=O)N2c1ccccc1 Desmethylclobazam
11 |
--------------------------------------------------------------------------------
/tests/prioritization/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.prioritization
3 | """
4 |
--------------------------------------------------------------------------------
/tests/prioritization/test_generic_rules.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.prioritization.test_generic_rules
3 | """
4 |
5 | from rdkit import Chem
6 |
7 | from scaffoldgraph.prioritization.generic_rules import *
8 | from scaffoldgraph import get_next_murcko_fragments
9 | from scaffoldgraph.utils import canonize_smiles
10 | from scaffoldgraph.core import Scaffold
11 |
12 |
def fragment_and_filter(mol, rule):
    """Return the SMILES of the parent scaffolds of ``mol`` that pass ``rule.filter``.

    The parents are the next Murcko fragments of ``mol``, each wrapped
    in a Scaffold.
    """
    scf = Scaffold(mol)
    parents = [Scaffold(fragment) for fragment in get_next_murcko_fragments(mol)]
    return [parent.smiles for parent in rule.filter(scf, parents)]
19 |
20 |
def _test_rule(mol, rule, expected):
    """Assert that ``rule`` keeps exactly one parent of ``mol``, equal to ``expected``."""
    selected = fragment_and_filter(mol, rule)
    assert len(selected) == 1
    assert selected[0] == expected
25 |
26 |
def _test_rule_min_max(mol, rule, expected_min, expected_max):
    """Run _test_rule for ``rule('min')`` and then ``rule('max')``."""
    for mode, expected in (('min', expected_min), ('max', expected_max)):
        _test_rule(mol, rule(mode), expected)
30 |
31 |
32 | """
33 | SCP RULES scaffold property (parent)
34 | ------------------------------------
35 | SCPNumLinkerBonds
36 | SCPDelta
37 | SCPAbsDelta
38 | SCPNumAromaticRings
39 | SCPNumHetAtoms
40 | SCPNumNAtoms
41 | SCPNumOAtoms
42 | SCPNumSAtoms
43 | """
44 |
45 |
def test_scp_num_linker_bonds():
    """SCPNumLinkerBonds selects the expected parent in 'min' and 'max' mode."""
    mol = Chem.MolFromSmiles('O=C(NCCCCN1CCN(c2ccccc2)CC1)c1ccc2c(c1)Cc1ccccc1-2')
    _test_rule_min_max(
        mol,
        SCPNumLinkerBonds,
        canonize_smiles('O=C(NCCCCN1CCNCC1)c1ccc2c(c1)Cc1ccccc1-2'),
        canonize_smiles('O=C(NCCCCN1CCN(c2ccccc2)CC1)c1ccc2c(c1)CC=C2'),
    )
52 |
53 |
def test_scp_delta():
    """SCPDelta selects the expected parent in 'min' and 'max' mode."""
    mol = Chem.MolFromSmiles('OC5CC31CC5CC1C4CCc2occc2C4CC3')
    _test_rule_min_max(
        mol,
        SCPDelta,
        canonize_smiles('OC1CCC2(CCC3c4ccoc4CCC3C2)C1'),  # retain spiro
        canonize_smiles('OC1CC23CCC4C=CCCC4C2CC1C3'),  # retain bridged
    )
60 |
61 |
def test_scp_abs_delta():
    """SCPAbsDelta selects the expected parent in 'max' mode."""
    mol = Chem.MolFromSmiles('C1CC2CN3C(CC=CC3=O)C4C2N(C1)CCC4')
    expected = canonize_smiles('C1CC2CNCC3CCCN(C1)C23')
    _test_rule(mol, SCPAbsDelta('max'), expected)
67 |
68 |
def test_scp_num_het_atoms():
    """SCPNumHetAtoms selects the expected parent in 'min' and 'max' mode."""
    mol = Chem.MolFromSmiles('C2Oc3ccccc3C(=O)C2=O')
    _test_rule_min_max(
        mol,
        SCPNumHetAtoms,
        canonize_smiles('c1ccccc1'),
        canonize_smiles('O=C1C=COCC1=O'),
    )
75 |
76 |
def test_scp_num_aromatic_rings():
    """SCPNumAromaticRings selects the expected parent in 'min' and 'max' mode."""
    mol = Chem.MolFromSmiles('N1CCCC(OC(=O)C(O)(c2ccccc2)c2ccccc2)C1')
    _test_rule_min_max(
        mol,
        SCPNumAromaticRings,
        canonize_smiles('O=C(OC1CCCNC1)C(O)c1ccccc1'),
        canonize_smiles('OC(c1ccccc1)c1ccccc1'),
    )
83 |
84 |
def test_scp_num_oxygen_atoms():
    """SCPNumOAtoms selects the expected parent in 'min' and 'max' mode."""
    mol = Chem.MolFromSmiles('C2Oc3ccccc3C(=O)C2=O')
    _test_rule_min_max(
        mol,
        SCPNumOAtoms,
        canonize_smiles('c1ccccc1'),
        canonize_smiles('O=C1C=COCC1=O'),
    )
91 |
92 |
def test_scp_num_nitrogen_atoms():
    """SCPNumNAtoms selects the expected parent in 'min' and 'max' mode."""
    mol = Chem.MolFromSmiles('N1CCCC(OC(=O)C(O)(c2ccccc2)c2ccccc2)C1')
    _test_rule_min_max(
        mol,
        SCPNumNAtoms,
        canonize_smiles('OC(c1ccccc1)c1ccccc1'),
        canonize_smiles('O=C(OC1CCCNC1)C(O)c1ccccc1'),
    )
99 |
100 |
def test_scp_num_sulphur_atoms():
    """SCPNumSAtoms selects the expected parent in 'min' and 'max' mode."""
    mol = Chem.MolFromSmiles('c1csc2c(NCN3CCN(CCc4ccccc4)CC3)ncnc12')
    _test_rule_min_max(
        mol,
        SCPNumSAtoms,
        canonize_smiles('c1ccc(CCN2CCN(CNc3ccncn3)CC2)cc1'),
        canonize_smiles('c1nc(NCN2CCNCC2)c2sccc2n1'),
    )
107 |
108 |
109 | """
110 | RRP RULES removed ring property
111 | -------------------------------
112 | RRPRingSize
113 | RRPLinkerLength
114 | RRPHetAtomLinked
115 | RRPNumHetAtoms
116 | RRPNumNAtoms
117 | RRPNumOAtoms
118 | RRPNumSAtoms
119 | """
120 |
121 |
def test_rrp_ring_size():
    """RRPRingSize selects the expected parent in 'min' and 'max' mode."""
    mol = Chem.MolFromSmiles('n1nc(-c2ccccc2)nc1=S')
    _test_rule_min_max(
        mol,
        RRPRingSize,
        canonize_smiles('c1ccccc1'),
        canonize_smiles('S=C1N=CN=N1'),
    )
128 |
129 |
def test_rrp_linker_length():
    """Run the shared min/max check for the RRPLinkerLength rule."""
    mol = Chem.MolFromSmiles('O=C1c2ccccc2-c2c(NCCc3ccccc3)c(=O)[nH]c3cccc1c23')
    expected_min = canonize_smiles('O=C1C=Cc2c(NCCc3ccccc3)c(=O)[nH]c3cccc1c23')
    expected_max = canonize_smiles('O=C1c2ccccc2-c2cc(=O)[nH]c3cccc1c23')
    _test_rule_min_max(mol, RRPLinkerLength, expected_min, expected_max)
136 |
137 |
def test_rrp_het_atom_linked():
    """Run the shared min/max check for the RRPHetAtomLinked rule."""
    mol = Chem.MolFromSmiles(
        'O=C(NCc1ccccc1)N1CCN2C(=O)OC(c3ccccc3)(c3ccccc3)[C@@H]2C1'
    )
    expected_min = canonize_smiles('O=C(NCc1ccccc1)N1CCN2C(=O)OC(c3ccccc3)[C@@H]2C1')
    expected_max = canonize_smiles('O=C1OC(c2ccccc2)(c2ccccc2)[C@@H]2CNCCN12')
    _test_rule_min_max(mol, RRPHetAtomLinked, expected_min, expected_max)
144 |
145 |
def test_rrp_num_het_atoms():
    """Run the shared min/max check for the RRPNumHetAtoms rule."""
    mol = Chem.MolFromSmiles('c1cccc2c(=O)[nH][nH]c(=O)c12')
    expected_min = canonize_smiles('O=c1ccc(=O)[nH][nH]1')
    expected_max = canonize_smiles('c1ccccc1')
    _test_rule_min_max(mol, RRPNumHetAtoms, expected_min, expected_max)
152 |
153 |
def test_rrp_num_nitrogen_atoms():
    """Run the shared min/max check for the RRPNumNAtoms rule."""
    mol = Chem.MolFromSmiles('c1cccc2c(=O)[nH][nH]c(=O)c12')
    expected_min = canonize_smiles('O=c1ccc(=O)[nH][nH]1')
    expected_max = canonize_smiles('c1ccccc1')
    _test_rule_min_max(mol, RRPNumNAtoms, expected_min, expected_max)
160 |
161 |
def test_rrp_num_oxygen_atoms():
    """Run the shared min/max check for the RRPNumOAtoms rule."""
    mol = Chem.MolFromSmiles('C1OC(=O)C2=C1CCC=C2')
    expected_min = canonize_smiles('O=C1C=CCO1')
    expected_max = canonize_smiles('C1=CCCC=C1')
    _test_rule_min_max(mol, RRPNumOAtoms, expected_min, expected_max)
168 |
169 |
def test_rrp_num_sulphur_atoms():
    """Run the shared min/max check for the RRPNumSAtoms rule."""
    mol = Chem.MolFromSmiles('C1CSC(=NNC(=O)C(=O)CC2CCOCC2)N1')
    expected_min = canonize_smiles('N=C1NCCS1')
    expected_max = canonize_smiles('C1CCOCC1')
    _test_rule_min_max(mol, RRPNumSAtoms, expected_min, expected_max)
176 |
177 |
178 | """
179 | RSP Rules property of the ring system of a removed ring before removal
180 | ----------------------------------------------------------------------
181 | RSPAbsDelta
182 | RSPDelta
183 | RSPNumAromaticRings
184 | RSPNumHetAtoms
185 | RSPNumNAtoms
186 | RSPNumOAtoms
187 | RSPNumRings
188 | RSPNumSAtoms
189 | """
190 |
191 |
def test_rsp_delta():
    """Run the shared min/max check for the RSPDelta rule."""
    mol = Chem.MolFromSmiles('O=C1N(CCCC3CCNCC3)CCC12CCN1CCCC12')
    expected_min = canonize_smiles('O=C1N(CCCC2CCNCC2)CCC12CCNC2')
    expected_max = canonize_smiles('O=C1NCCC12CCN1CCCC12')
    _test_rule_min_max(mol, RSPDelta, expected_min, expected_max)
198 |
199 |
def test_rsp_abs_delta():
    """Run the shared min/max check for the RSPAbsDelta rule."""
    mol = Chem.MolFromSmiles('O=C1N(CCCC3CCNCC3)CCC12CCN1CCCC12')
    expected_min = canonize_smiles('O=C1NCCC12CCN1CCCC12')
    expected_max = canonize_smiles('O=C1N(CCCC2CCNCC2)CCC12CCNC2')
    _test_rule_min_max(mol, RSPAbsDelta, expected_min, expected_max)
206 |
207 |
def test_rsp_num_aromatic_rings():
    """Run the shared min/max check for the RSPNumAromaticRings rule."""
    mol = Chem.MolFromSmiles(
        'O=C(c1c2ccccc2cc2ccccc12)N1CCC(N2CCC[C@@H](C(=O)N3CCOCC3)C2)CC1'
    )
    expected_min = canonize_smiles('O=C(c1c2ccccc2cc2ccccc12)N1CCC(N2CCCCC2)CC1')
    expected_max = canonize_smiles(
        'O=C(c1cccc2ccccc12)N1CCC(N2CCC[C@@H](C(=O)N3CCOCC3)C2)CC1'
    )
    _test_rule_min_max(mol, RSPNumAromaticRings, expected_min, expected_max)
214 |
215 |
def test_rsp_num_het_atoms():
    """Run the shared min/max check for the RSPNumHetAtoms rule."""
    mol = Chem.MolFromSmiles('c1nc2ccc3nc(NC(=O)C(c4ccccc4)c4ccccc4)sc3c2s1')
    expected_min = canonize_smiles('O=C(Cc1ccccc1)Nc1nc2ccc3ncsc3c2s1')
    expected_max = canonize_smiles('O=C(Nc1nc2ccccc2s1)C(c1ccccc1)c1ccccc1')
    _test_rule_min_max(mol, RSPNumHetAtoms, expected_min, expected_max)
222 |
223 |
def test_rsp_num_nitogen_atoms():
    """Run the shared min/max check for the RSPNumNAtoms rule."""
    mol = Chem.MolFromSmiles('c1nc2ccc3nc(NC(=O)C(c4ccccc4)c4ccccc4)sc3c2s1')
    expected_min = canonize_smiles('O=C(Cc1ccccc1)Nc1nc2ccc3ncsc3c2s1')
    expected_max = canonize_smiles('O=C(Nc1nc2ccccc2s1)C(c1ccccc1)c1ccccc1')
    _test_rule_min_max(mol, RSPNumNAtoms, expected_min, expected_max)
230 |
231 |
def test_rsp_num_oxygen_atoms():
    """Run the shared min/max check for the RSPNumOAtoms rule."""
    mol = Chem.MolFromSmiles('c1c2c(c3occ(-c4ccccc4)c(=O)c3c1)C=CCO2')
    expected_min = canonize_smiles('O=c1ccoc2c3c(ccc12)OCC=C3')
    expected_max = canonize_smiles('O=c1c(-c2ccccc2)coc2ccccc12')
    _test_rule_min_max(mol, RSPNumOAtoms, expected_min, expected_max)
238 |
239 |
def test_rsp_num_sulphur_atoms():
    """Run the shared min/max check for the RSPNumSAtoms rule."""
    mol = Chem.MolFromSmiles('c1nc2ccc3nc(NC(=O)C(c4ccccc4)c4ccccc4)sc3c2s1')
    expected_min = canonize_smiles('O=C(Cc1ccccc1)Nc1nc2ccc3ncsc3c2s1')
    expected_max = canonize_smiles('O=C(Nc1nc2ccccc2s1)C(c1ccccc1)c1ccccc1')
    _test_rule_min_max(mol, RSPNumSAtoms, expected_min, expected_max)
246 |
247 |
def test_rsp_num_rings():
    """Run the shared min/max check for the RSPNumRings rule."""
    mol = Chem.MolFromSmiles(
        'O=C(c1c2ccccc2cc2ccccc12)N1CCC(N2CCC[C@@H](C(=O)N3CCOCC3)C2)CC1'
    )
    expected_min = canonize_smiles('O=C(c1c2ccccc2cc2ccccc12)N1CCC(N2CCCCC2)CC1')
    expected_max = canonize_smiles(
        'O=C(c1cccc2ccccc12)N1CCC(N2CCC[C@@H](C(=O)N3CCOCC3)C2)CC1'
    )
    _test_rule_min_max(mol, RSPNumRings, expected_min, expected_max)
254 |
--------------------------------------------------------------------------------
/tests/prioritization/test_prioritization_rules.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.prioritization.test_prioritization_rules
3 | """
4 |
5 | import pytest
6 |
7 | from scaffoldgraph.prioritization.prioritization_rules import *
8 |
9 |
class MockScaffoldFilterRule(BaseScaffoldFilterRule):
    """Minimal concrete rule used to exercise the base rule class."""

    @property
    def name(self):
        """Fixed name reported by this mock rule."""
        return 'mock'

    def filter(self, child, parents):
        """Return every parent except the first; ``child`` is ignored."""
        remaining = parents[1:]
        return remaining
17 |
18 |
def test_prioritization_rules():
    """Abstract rule types must raise TypeError when instantiated directly."""
    abstract_types = (
        BaseScaffoldFilterRule,
        ScaffoldFilterRule,
        ScaffoldMinFilterRule,
        ScaffoldMaxFilterRule,
    )
    for rule_type in abstract_types:
        with pytest.raises(TypeError):
            rule_type()
29 |
30 |
def test_base_rule_subclass():
    """Test that the base rule class can be subclassed and used.

    Checks the name, string form, ``filter`` result, call delegation and
    ``repr`` of a ``MockScaffoldFilterRule`` instance.
    """
    mock = MockScaffoldFilterRule()
    parents = [0, 1, 2, 3, 4]
    assert mock.name == 'mock'
    assert str(mock) == 'mock'
    assert mock.filter(None, parents) == [1, 2, 3, 4]
    # Calling the rule must give the same result as calling ``filter``.
    assert mock(None, parents) == mock.filter(None, parents)
    # NOTE(review): the expected literal had been reduced to '' (so the
    # assertion could only pass for an empty repr); the '<Class at 0x...>'
    # form is reconstructed from the remaining format argument -- confirm
    # against BaseScaffoldFilterRule.__repr__.
    expected_repr = '<MockScaffoldFilterRule at {}>'.format(hex(id(mock)))
    assert repr(mock) == expected_repr
40 |
41 |
def test_subclassing():
    """Each specialised rule type must derive from BaseScaffoldFilterRule."""
    derived_types = (
        ScaffoldFilterRule,
        ScaffoldMaxFilterRule,
        ScaffoldMinFilterRule,
    )
    for rule_type in derived_types:
        assert issubclass(rule_type, BaseScaffoldFilterRule)
46 |
--------------------------------------------------------------------------------
/tests/prioritization/test_prioritization_ruleset.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.prioritization.test_prioritization_ruleset
3 | """
4 |
5 | import tempfile
6 | import pytest
7 | import os
8 |
9 | from scaffoldgraph.prioritization import ScaffoldRuleSet, BaseScaffoldFilterRule
10 | from scaffoldgraph.prioritization.original_rules import original_ruleset, OriginalRule10
11 |
12 |
@pytest.fixture(name='null_set')
def empty_ruleset():
    """Fixture ``null_set``: a ScaffoldRuleSet constructed with no arguments."""
    ruleset = ScaffoldRuleSet()
    return ruleset
16 |
17 |
def test_empty_filter(null_set):
    """Both filter_scaffolds calls below are expected to raise ValueError.

    The first uses the ``null_set`` fixture, the second the built-in
    ``original_ruleset``; each is given an empty parent list.
    """
    cases = (
        (null_set, null_set),
        (original_ruleset, ''),
    )
    for ruleset, child in cases:
        with pytest.raises(ValueError):
            assert ruleset.filter_scaffolds(child, [])
24 |
25 |
def test_name(null_set):
    """Check the initial ruleset name and that the name can be reassigned."""
    # Compare by value: ``is`` against a string literal only tests object
    # identity, which depends on interning and triggers a SyntaxWarning
    # on Python 3.8+.
    assert null_set.name == 'ScaffoldRuleSet'
    null_set.name = 'some_name'
    assert null_set.name == 'some_name'
30 |
31 |
def test_rules():
    """Every entry of ``original_ruleset.rules`` derives from the base rule."""
    for rule in original_ruleset.rules:
        assert issubclass(rule.__class__, BaseScaffoldFilterRule)
35 |
36 |
def test_builtins():
    """Exercise the container protocol and rule editing on original_ruleset.

    Two extra rules are added and then removed again so the ruleset is
    left at its initial length of 15 when all assertions pass.
    """
    assert len(original_ruleset) == 15
    assert isinstance(original_ruleset[0], BaseScaffoldFilterRule)
    assert original_ruleset.check_valid_rule(OriginalRule10())

    original_ruleset.add_rule(OriginalRule10())
    assert len(original_ruleset) == 16

    original_ruleset.insert_rule(OriginalRule10(), 1)
    assert original_ruleset[1].__class__ is OriginalRule10

    # Remove the appended rule first (now at index 16), then the inserted one.
    original_ruleset.delete_rule(16)
    original_ruleset.delete_rule(1)
    assert len(original_ruleset) == 15

    # NOTE(review): the expected literal had been reduced to ''; the
    # '<Class at 0x...>' form is reconstructed from the remaining format
    # argument -- confirm against ScaffoldRuleSet.__repr__.
    expected_repr = '<ScaffoldRuleSet at {}>'.format(hex(id(original_ruleset)))
    assert repr(original_ruleset) == expected_repr
49 |
50 |
def test_from_rule_file():
    """Build a ruleset from a temporary rule file and check each parsed rule."""
    file_prefix = os.path.basename(__file__)
    with tempfile.NamedTemporaryFile('w', suffix='.txt', prefix=file_prefix) as tf:
        tf.write('OriginalRule01\nOriginalRule02\nSCPNumHetAtoms_min\nRRPRingSizeX_max_6')
        # Rewind before the file is reopened by name below.
        tf.seek(0)
        ruleset = ScaffoldRuleSet.from_rule_file(tf.name)
        assert len(ruleset) == 4
        expected_names = (
            'original rule 01',
            'original rule 02',
            'SCPNumHetAtoms',
            'RRPRingSizeX',
        )
        for position, rule_name in enumerate(expected_names):
            assert ruleset[position].name == rule_name
        min_rule, max_rule = ruleset[2], ruleset[3]
        assert min_rule.func == min
        assert max_rule.func == max
        assert max_rule.size == 6
64 |
65 |
def test_errors(null_set):
    """Adding, inserting or assigning a non-rule object must raise TypeError.

    Each operation gets its own ``pytest.raises`` block: a single shared
    block exits at the first exception, so the later statements would
    never be executed.
    """
    with pytest.raises(TypeError):
        null_set.add_rule('')
    with pytest.raises(TypeError):
        null_set.insert_rule('', 0)
    with pytest.raises(TypeError):
        null_set[0] = ''
71 |
--------------------------------------------------------------------------------
/tests/scripts/test_scripts.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.scripts.test_scripts
3 | """
4 |
5 | import tempfile
6 | import pathlib
7 | import pytest
8 | import os
9 |
10 | from subprocess import Popen, PIPE
11 |
12 |
13 | TEST_DATA_DIR = pathlib.Path(__file__).resolve().parent / '..' / 'data'
14 |
15 |
def check_generate_structure(fn):
    """Assert that the file ``fn`` looks like graph-generation output.

    The first line is split on tabs and must contain every expected
    column heading; the file must also have more than one line.
    """
    with open(fn, 'r') as handle:
        rows = handle.readlines()
    columns = rows[0].strip().split('\t')
    required = ('HIERARCHY', 'SMILES', 'SUBSCAFFOLDS', 'MOLECULES', 'ANNOTATIONS')
    for heading in required:
        assert heading in columns
    assert len(rows) > 1
26 |
27 |
def check_aggregate_structure(fn):
    """Assert that ``fn`` looks like aggregation output and sample its last row.

    The first line is split on tabs and must contain every expected
    column heading; the file must also have more than one line.

    Returns the third tab-separated field of the last line.
    """
    with open(fn, 'r') as handle:
        rows = handle.readlines()
    columns = rows[0].strip().split('\t')
    for heading in ('HIERARCHY', 'SMILES', 'SUBSCAFFOLDS', 'ID'):
        assert heading in columns
    assert len(rows) > 1
    last_fields = rows[-1].strip().split('\t')
    return last_fields[2]
39 |
40 |
def check_select_structure(fn):
    """Assert that the file ``fn`` looks like scaffold-selection output.

    The first line is split on tabs and must contain every expected
    column heading; the file must also have more than one line.
    """
    with open(fn, 'r') as handle:
        rows = handle.readlines()
    columns = rows[0].strip().split('\t')
    for heading in ('HIERARCHY', 'SMILES', 'SUBSCAFFOLDS', 'ID'):
        assert heading in columns
    assert len(rows) > 1
50 |
51 |
# test all utilities in one
# skip: pytest -m "not slow"
@pytest.mark.slow
def test_cli():
    """Run the generate -> aggregate -> select CLI pipeline per graph type.

    For each of the 'tree', 'network' and 'hiers' sub-commands the
    ``scaffoldgraph`` executable is invoked as a subprocess and the
    output file of every stage is checked for existence and structure.
    """
    funcs = ['tree', 'network', 'hiers']
    fn = str(TEST_DATA_DIR / 'test_smiles.smi')
    with tempfile.TemporaryDirectory() as tmp:
        for func in funcs:

            # Test graph generation
            out = os.path.join(tmp, 'output.tmp')
            args = ['scaffoldgraph', func, fn, out]
            p2 = Popen(args, stdout=PIPE, stderr=PIPE)
            stdout, _ = p2.communicate()
            assert stdout is not None
            assert os.path.exists(out)
            check_generate_structure(out)

            # Test graph aggregation
            out2 = os.path.join(tmp, 'output.txt')
            args = [
                'scaffoldgraph', 'aggregate', out, out2
            ]
            p2 = Popen(args, stdout=PIPE, stderr=PIPE)
            stdout, _ = p2.communicate()
            assert stdout is not None
            # Check the aggregation output itself; the generation output
            # ``out`` was already verified above.
            assert os.path.exists(out2)
            smiles = check_aggregate_structure(out2)

            # Test graph selection, querying with a scaffold taken from
            # the aggregated file.
            test_smi = os.path.join(tmp, 'test.smi')
            out3 = os.path.join(tmp, 'select.txt')
            with open(test_smi, 'w') as smi:
                smi.write(f'{smiles} fake_scaffold_id')
            args = [
                'scaffoldgraph', 'select', out2, test_smi, out3
            ]
            p2 = Popen(args, stdout=PIPE, stderr=PIPE)
            stdout, _ = p2.communicate()
            assert stdout is not None
            assert os.path.exists(out3)
            check_select_structure(out3)
94 |
--------------------------------------------------------------------------------
/tests/test_network.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.test_network
3 | """
4 |
5 | import pytest
6 | import os
7 |
8 | from pathlib import Path
9 |
10 | import scaffoldgraph as sg
11 |
12 | from . import mock_sdf, mock_smiles_file
13 |
14 |
15 | TEST_DATA_DIR = Path(__file__).resolve().parent / 'data'
16 |
17 |
@pytest.fixture(name='test_net')
def test_network(sdf_file):
    """Fixture ``test_net``: a ScaffoldNetwork built from the ``sdf_file`` fixture."""
    return sg.ScaffoldNetwork.from_sdf(sdf_file)
22 |
23 |
@pytest.fixture(name='network')
def long_test_network():
    """Fixture ``network``: a ScaffoldNetwork built from data/test_smiles.smi."""
    smiles_path = str(TEST_DATA_DIR / 'test_smiles.smi')
    return sg.ScaffoldNetwork.from_smiles_file(smiles_path)
28 |
29 |
def test_network_from_sdf(sdf_file):
    """A network built from the SDF fixture has 8 scaffold and 2 molecule nodes."""
    net = sg.ScaffoldNetwork.from_sdf(sdf_file)
    assert net.num_scaffold_nodes == 8
    assert net.num_molecule_nodes == 2
34 |
35 |
def test_network_from_smiles(smiles_file):
    """A network built from the SMILES fixture has 8 scaffold and 2 molecule nodes."""
    net = sg.ScaffoldNetwork.from_smiles_file(smiles_file)
    assert net.num_scaffold_nodes == 8
    assert net.num_molecule_nodes == 2
40 |
41 |
def test_hiers(sdf_file):
    """A HierS graph built from the SDF fixture has 5 scaffold and 2 molecule nodes."""
    hiers = sg.HierS.from_sdf(sdf_file)
    assert hiers.num_scaffold_nodes == 5
    assert hiers.num_molecule_nodes == 2
46 |
47 |
def test_hierarchy_functions(network):
    """Check hierarchy sizes, hierarchy bounds and the scaffolds in level 2."""
    sizes = network.get_hierarchy_sizes()
    for level, expected_count in ((1, 7), (2, 10), (3, 7), (4, 1)):
        assert sizes[level] == expected_count
    assert network.max_hierarchy() == 4
    assert network.min_hierarchy() == 1
    expected_level_two = {
        'C1=Cn2cnnc2CN=C1',
        'O=C1CN=C(c2ccccn2)C=CN1',
        'O=C1CN=Cc2ccccc2N1',
        'C1=CC(c2ccccc2)=[NH+]CC=N1',
        'C1=Nc2ccccc2C=[NH+]C1',
        'O=C1CC(=O)N(c2ccccc2)C=CN1',
        'O=C1CC(=O)Nc2ccccc2N1',
        'O=C1CN=C(c2ccccc2)C=CN1',
        'O=C1C[NH+]=Cc2ccccc2N1',
        'O=C1C[NH+]=C(c2ccccc2)C=CN1',
    }
    found_level_two = set(network.get_scaffolds_in_hierarchy(2))
    assert expected_level_two == found_level_two
64 |
65 |
def test_simple_functions(network):
    """Check scaffold and molecule membership queries on the network."""
    scaffold_cases = (
        ('C1=Cn2cnnc2CN=C1', True),
        # Non-canonical SMILES of the scaffold above
        ('C1=C-n2:c:n:n:c:2-C-N=C-1', True),
        ('c1ccccc1CCNc2ccccc2', False),
    )
    for smiles, expected in scaffold_cases:
        assert network.scaffold_in_graph(smiles) is expected
    molecule_cases = (('Adinazolam', True), ('Citalopram', False))
    for molecule_id, expected in molecule_cases:
        assert network.molecule_in_graph(molecule_id) is expected
73 |
74 |
def test_traversal(network):
    """Check molecule -> scaffolds and scaffold -> molecules lookups."""
    expected_scaffolds = {
        'c1ccc(C2=NCc3nncn3-c3ccccc32)cc1',
        'C1=NCc2nncn2-c2ccccc21',
        'C1=Cn2cnnc2CN=C1c1ccccc1',
        'C1=Cn2cnnc2CN=C1',
        'c1nnc[nH]1',
    }
    found_scaffolds = set(network.get_scaffolds_for_molecule('Adinazolam'))
    assert found_scaffolds == expected_scaffolds
    expected_molecules = {'Adinazolam', 'Alprazolam'}
    found_molecules = set(network.get_molecules_for_scaffold('c1nnc[nH]1'))
    assert found_molecules == expected_molecules
83 |
84 |
def test_separate_disconnected(network):
    """The network splits into two components of the same type as the input."""
    sorted_components = network.separate_disconnected_components(sort=True)
    assert len(sorted_components) == 2
    first_component = network.separate_disconnected_components()[0]
    assert type(first_component) == type(network)
88 |
89 |
def test_repr(test_net):
    """Check the repr of a ScaffoldNetwork instance."""
    # NOTE(review): the expected literal had been reduced to '' (so the
    # assertion could only pass for an empty repr); the '<Class at 0x...>'
    # form is reconstructed from the remaining format argument -- confirm
    # against the graph class __repr__.
    expected_repr = '<ScaffoldNetwork at {}>'.format(hex(id(test_net)))
    assert repr(test_net) == expected_repr
92 |
--------------------------------------------------------------------------------
/tests/test_tree.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.test_tree
3 | """
4 |
5 | import networkx as nx
6 | import pytest
7 |
8 | import scaffoldgraph as sg
9 |
10 | from . import mock_sdf, mock_smiles_file
11 |
12 |
@pytest.fixture(name='test_tree')
def test_tree_graph(sdf_file):
    """Fixture ``test_tree``: a ScaffoldTree built from the ``sdf_file`` fixture."""
    return sg.ScaffoldTree.from_sdf(sdf_file)
17 |
18 |
def test_tree_from_sdf(sdf_file):
    """A tree built from the SDF fixture has 5 scaffolds, 2 molecules and is a tree."""
    scaffold_tree = sg.ScaffoldTree.from_sdf(sdf_file)
    assert scaffold_tree.num_scaffold_nodes == 5
    assert scaffold_tree.num_molecule_nodes == 2
    assert nx.is_tree(scaffold_tree)
24 |
25 |
def test_tree_from_smiles(smiles_file):
    """A tree built from the SMILES fixture has 5 scaffolds, 2 molecules and is a tree."""
    scaffold_tree = sg.ScaffoldTree.from_smiles_file(smiles_file)
    assert scaffold_tree.num_scaffold_nodes == 5
    assert scaffold_tree.num_molecule_nodes == 2
    assert nx.is_tree(scaffold_tree)
31 |
32 |
def test_repr(test_tree):
    """Check the repr of a ScaffoldTree instance."""
    # NOTE(review): the expected literal had been reduced to '' (so the
    # assertion could only pass for an empty repr); the '<Class at 0x...>'
    # form is reconstructed from the remaining format argument -- confirm
    # against the graph class __repr__.
    expected_repr = '<ScaffoldTree at {}>'.format(hex(id(test_tree)))
    assert repr(test_tree) == expected_repr
35 |
--------------------------------------------------------------------------------
/tests/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.utils
3 | """
4 |
5 | from .. import mock_sdf, mock_sdf_2
6 |
--------------------------------------------------------------------------------
/tests/utils/test_aggregate.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.utils.test_aggregate
3 | """
4 |
5 | import scaffoldgraph as sg
6 |
7 | from scaffoldgraph.utils import aggregate
8 | from . import mock_sdf, mock_sdf_2
9 |
10 |
def test_aggregate(sdf_file, sdf_file_2):
    """Aggregating two networks yields 14 scaffold nodes and 4 molecule nodes."""
    first = sg.ScaffoldNetwork.from_sdf(sdf_file)
    second = sg.ScaffoldNetwork.from_sdf(sdf_file_2)
    combined = aggregate([first, second])
    assert combined.num_scaffold_nodes == 14
    assert combined.num_molecule_nodes == 4
17 |
--------------------------------------------------------------------------------
/tests/utils/test_bipartite.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.utils.test_bipartite
3 | """
4 |
5 | import scaffoldgraph as sg
6 | import networkx as nx
7 |
8 | from scaffoldgraph.utils.bipartite import make_bipartite_graph
9 | from . import mock_sdf
10 |
11 |
def test_bipartite(sdf_file):
    """make_bipartite_graph must return a graph networkx reports as bipartite."""
    source_network = sg.ScaffoldNetwork.from_sdf(sdf_file)
    bipartite_graph = make_bipartite_graph(source_network)
    assert nx.is_bipartite(bipartite_graph)
16 |
--------------------------------------------------------------------------------
/tests/utils/test_misc.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.utils.test_misc
3 | """
4 |
5 | import scaffoldgraph as sg
6 |
7 | from scaffoldgraph.utils import summary
8 | from . import mock_sdf
9 |
10 |
11 | SUMMARY_GRAPH = """Type: ScaffoldNetwork
12 | Number of molecule nodes: 2
13 | Number of scaffold nodes: 8
14 | Number of edges: 12
15 | Max hierarchy: 3
16 | Min hierarchy: 1
17 | """
18 |
19 |
20 | SUMMARY_NODE = """Node c1ccccc1 has the following properties:
21 | Type: scaffold
22 | Hierarchy: 1
23 | Degree: 2
24 | Parent scaffolds:
25 | Child scaffolds: O=C1CN=C(c2ccccc2)C=CN1 O=C1CN=Cc2ccccc2N1 O=C1CN=C(c2ccccc2)c2ccccc2N1 O=C1CN=C(c2ccccc2)c2ccsc2N1
26 | Child molecules:
27 | """
28 |
29 |
def test_bipartite(sdf_file):
    """Compare ``summary`` output for the graph and for one node.

    NOTE(review): despite its name this test exercises ``summary``, not
    the bipartite utilities.
    """
    net = sg.ScaffoldNetwork.from_sdf(sdf_file)
    graph_summary = summary(net).strip()
    assert graph_summary == SUMMARY_GRAPH.strip()
    node_summary = summary(net, 'c1ccccc1').strip()
    assert node_summary == SUMMARY_NODE.strip()
34 |
--------------------------------------------------------------------------------
/tests/utils/test_subset.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.utils.test_subset
3 |
4 | """
5 | import random
6 | import pytest
7 |
8 | from scaffoldgraph.utils.subset import split_graph_by_molecule_attribute
9 | from collections import defaultdict
10 |
11 | from ..test_network import long_test_network
12 |
13 |
def test_split_by_attribute(network):
    """Split a network by a molecule attribute and check every subgraph.

    Each molecule node is tagged with a randomly chosen attribute value;
    the split must then yield one subgraph per value actually assigned,
    each holding exactly the molecules carrying that value.
    """
    key, attrs = 'ATTR', ['ATTR_1', 'ATTR_2', 'ATTR_3']

    # Boolean keys are expected to be rejected. Each call gets its own
    # ``pytest.raises`` block, because a shared block exits at the first
    # exception and would never execute the second call.
    for invalid_key in (True, False):
        with pytest.raises(ValueError):
            split_graph_by_molecule_attribute(network, invalid_key, None)

    # Tag every molecule and count how often each value was used.
    assigned = defaultdict(int)
    for _, mol_data in network.get_molecule_nodes(True):
        attr = random.choice(attrs)
        mol_data[key] = attr
        assigned[attr] += 1

    subgraphs = split_graph_by_molecule_attribute(network, key, None)
    assert len(subgraphs) == len(assigned)
    for u_attr, expected_count in assigned.items():
        subgraph = subgraphs[u_attr]
        assert subgraph.num_molecule_nodes == expected_count
        assert subgraph.num_scaffold_nodes > 0
        assert all(d == u_attr for _, d in subgraph.get_molecule_nodes(key))
31 |
--------------------------------------------------------------------------------
/tests/vis/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.vis
3 | """
4 |
5 | from ..test_network import long_test_network
6 |
--------------------------------------------------------------------------------
/tests/vis/test_notebook.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.vis.test_notebook
3 | """
4 |
5 | from pytest import mark
6 |
7 |
@mark.filterwarnings('ignore::UserWarning')
def test_resources():
    """The default cytoscape style file exists and can be read."""
    # Import inside the function to suppress the user warning.
    from scaffoldgraph.vis.notebook import cytoscape
    style_path = cytoscape.DEFAULT_STYLE
    resource_dir = style_path.parent
    assert resource_dir.exists()  # resource directory
    assert style_path.exists()  # cytoscape.json
    loaded_style = cytoscape.read_style_file(str(style_path))
    assert loaded_style is not None
17 |
--------------------------------------------------------------------------------
/tests/vis/test_vis_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | scaffoldgraph tests.vis.test_vis_utils
3 | """
4 |
5 | import scaffoldgraph.vis.utils as vis_utils
6 | import matplotlib.pyplot as plt
7 | import random
8 | import pytest
9 | import re
10 |
11 | from rdkit.Chem.Draw import rdMolDraw2D
12 | from rdkit import Chem
13 |
14 | from scaffoldgraph.utils import suppress_rdlogger
15 | from . import long_test_network
16 |
17 |
18 | SVG_PATTERN = r'(?:<\?xml\b[^>]*>[^<]*)?(?:[^<]*)*(?: