├── DGCAN
│   ├── __init__.py
│   ├── dict
│   │   ├── -atom_dict.pickle
│   │   ├── -bond_dict.pickle
│   │   ├── -edge_dict.pickle
│   │   └── -fingerprint_dict.pickle
│   ├── run.py
│   ├── train.py
│   ├── preprocess.py
│   ├── DGCAN.py
│   ├── results
│   │   └── AUC.txt
│   └── predict.py
├── solgan.png
├── screening
│   ├── process.png
│   ├── COVIDVS-3.ipynb
│   ├── Dataset
│   │   ├── testset.csv
│   │   └── finetunev1.csv
│   ├── README.md
│   └── DTI.ipynb
├── LICENSE
├── .gitignore
├── README.md
├── Discussion
│   ├── preprocess.py
│   ├── GPC.py
│   ├── svc.py
│   ├── RF.py
│   ├── CNN.py
│   └── GNN.py
├── Test
│   └── test.ipynb
├── dataset
│   ├── bRo5.txt
│   └── withdrawn.txt
└── Tutorial.ipynb

--------------------------------------------------------------------------------
/DGCAN/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/solgan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinYSun/D-GCAN/HEAD/solgan.png
--------------------------------------------------------------------------------
/screening/process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinYSun/D-GCAN/HEAD/screening/process.png
--------------------------------------------------------------------------------
/DGCAN/dict/-atom_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinYSun/D-GCAN/HEAD/DGCAN/dict/-atom_dict.pickle
--------------------------------------------------------------------------------
/DGCAN/dict/-bond_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinYSun/D-GCAN/HEAD/DGCAN/dict/-bond_dict.pickle
--------------------------------------------------------------------------------
/DGCAN/dict/-edge_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinYSun/D-GCAN/HEAD/DGCAN/dict/-edge_dict.pickle
--------------------------------------------------------------------------------
/DGCAN/dict/-fingerprint_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinYSun/D-GCAN/HEAD/DGCAN/dict/-fingerprint_dict.pickle
--------------------------------------------------------------------------------
/DGCAN/run.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Apr 28 09:19:10 2022
4 | 
5 | @author: BM109X32G-10GPU-02
6 | """
7 | import torch
8 | import pandas as pd
9 | import train
10 | import predict
11 | import numpy as np
12 | import rdkit
13 | from rdkit import Chem
14 | from rdkit.Chem import QED
15 | from rdkit import rdBase, Chem
16 | from rdkit.Chem import PandasTools, QED, Descriptors, rdMolDescriptors
17 | from rdkit.Chem import Lipinski
18 | 
19 | tes = train.train('../dataset/data_test.txt',
20 |                   radius = 1,
21 |                   dim = 52,
22 |                   layer_hidden = 4,
23 |                   layer_output = 10,
24 |                   dropout = 0.45,
25 |                   batch_train = 8,
26 |                   batch_test = 8,
27 |                   lr =3e-4,
28 |                   lr_decay = 0.85,
29 |                   decay_interval = 25,
30 |                   iteration = 140,
31 |                   N = 5000,
32 |                   dataset_train='../dataset/data_train.txt')
33 | 
34 | 
35 | test = predict.predict('../dataset/nonUS.txt',
36 |                        radius = 1,
37 |                        property = True,
38 |                        dim = 52 ,
39 |                        layer_hidden = 4,
40 |                        layer_output = 10,
41 |                        dropout = 0.45,
42 |                        batch_train = 8,
43 |                        batch_test = 8,
44 |                        lr = 3e-4,
45 |                        lr_decay = 
0.85, 46 | decay_interval = 25 , 47 | iteration = 140, 48 | N = 5000) 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, JinyuSun 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.pyc 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

logo

2 |

3 |

Prediction of Drug-likeness using Graph Convolutional Attention Network.

4 |

5 | -----------------------------------------------------------------------------------------------------------------
6 | 
7 | 
8 | 
9 | A deep learning method was developed to predict **d**rug-likeness based on the **g**raph **c**onvolutional **a**ttention **n**etwork (D-GCAN) directly from molecular structures. The model combines the advantages of graph convolution and the attention mechanism. D-GCAN is a promising tool for predicting drug-likeness, selecting potential candidates, and accelerating drug discovery by excluding unpromising molecules and avoiding unnecessary biological and clinical testing.
10 | 
11 | 
12 | 
13 | ![图片](https://user-images.githubusercontent.com/62410732/143736741-05e00f97-b01c-4130-8faa-562b51c0a4b4.png)
14 | 
15 | 
16 | 
17 | 
18 | ## Motivation
19 | 
20 | Drug-likeness has been widely used as a criterion to distinguish drug-like molecules from non-drugs. Developing reliable computational methods to predict the drug-likeness of candidate compounds is crucial for triaging unpromising molecules and accelerating the drug discovery process.
21 | 
22 | 
23 | 
24 | 
25 | ## Depends
26 | 
27 | [Anaconda for python 3.8](https://www.python.org/)
28 | 
29 | [conda install pytorch](https://pytorch.org/)
30 | 
31 | [conda install -c conda-forge rdkit](https://rdkit.org/)
32 | 
33 | 
34 | 
35 | 
36 | ## Discussion
37 | 
38 | The [Discussion](https://github.com/JinYSun/D-GCAN/tree/main/Discussion) folder contains the scripts for evaluating the classification performance. We compared several common methods widely used in drug-likeness prediction, such as [GNN](https://github.com/JinYSun/D-GCAN/tree/main/Discussion/GNN.py), [RF](https://github.com/JinYSun/D-GCAN/tree/main/Discussion/RF.py), [CNN](https://github.com/JinYSun/D-GCAN/tree/main/Discussion/CNN.py), [SVC](https://github.com/JinYSun/D-GCAN/tree/main/Discussion/svc.py), and [GPC](https://github.com/JinYSun/D-GCAN/tree/main/Discussion/GPC.py).
39 | 
40 | 
41 | 
42 | 
43 | ## Usage
44 | 
45 | If you want to retrain the model, put the molecules' SMILES files into the data directory and run [D-GCAN](https://github.com/JinYSun/D-GCAN/tree/main/DGCAN/DGCAN.py). The test set can be replaced by changing the path. It is recommended to retrain the model before predicting; the process takes less than 15 minutes. It is as simple as:
46 | 
47 | ```
48 | import train
49 | test = train.train('../dataset/bRo5.txt',
50 |                    radius = 1,
51 |                    dim = 52,
52 |                    layer_hidden = 4,
53 |                    layer_output = 10,
54 |                    dropout = 0.45,
55 |                    batch_train = 8,
56 |                    batch_test = 8,
57 |                    lr = 3e-4,
58 |                    lr_decay = 0.85,
59 |                    decay_interval = 25,
60 |                    iteration = 140,
61 |                    N = 5000,
62 |                    dataset_train='../dataset/data_train.txt')
63 | ```
64 | 
65 | To predict the drug-likeness of unknown molecules, run:
66 | 
67 | ```
68 | import predict
69 | test = predict.predict('../dataset/bRo5.txt',
70 |                        radius = 1,
71 |                        property = True, #True if drug-likeness is known
72 |                        dim = 52,
73 |                        layer_hidden = 4,
74 |                        layer_output = 10,
75 |                        dropout = 0.45,
76 |                        batch_train = 8,
77 |                        batch_test = 8,
78 |                        lr = 3e-4,
79 |                        lr_decay = 0.85,
80 |                        decay_interval = 25,
81 |                        iteration = 140,
82 |                        N = 5000)
83 | 
84 | ```
85 | 
86 | Alternatively, you can run [run.py](https://github.com/JinYSun/D-GCAN/blob/main/DGCAN/run.py) and modify the hyperparameters of the neural network to optimize the model.
87 | 
88 | The D-GCAN-screened GDB-13 database [(S-GDB13)](https://doi.org/10.5281/zenodo.7054367) is a more drug-like database and can be used to find new drug candidates.
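The dataset files are plain text: a header line, then one molecule per line as a whitespace-separated SMILES/label pair (1 = drug-like, 0 = non-drug-like). `preprocess.py` reads exactly this layout and skips any entry whose SMILES contains `.` (multi-fragment molecules). A minimal sketch of a `data_train.txt` (the two molecules below are illustrative placeholders, not rows from the actual training set):

```
smiles property
CC(=O)Oc1ccccc1C(=O)O 1
ClCCl 0
```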
89 | 
90 | #### -Notice-
91 | 
92 | As described in the paper, drug-likeness prediction is strongly influenced by the dataset, especially the negative set. If necessary, retrain the model on your own dataset.
93 | 
94 | 
95 | 
96 | # Contact
97 | 
98 | Jinyu Sun E-mail: jinyusun@csu.edu.cn
99 | 
100 | 
101 | 
102 | # Cite
103 | 
104 | 
105 | @article{10.1093/bioinformatics/btac676,
106 |     author = {Sun, Jinyu and Wen, Ming and Wang, Huabei and Ruan, Yuezhe and Yang, Qiong and Kang, Xiao and Zhang, Hailiang and Zhang, Zhimin and Lu, Hongmei},
107 |     title = "{Prediction of Drug-likeness using Graph Convolutional Attention Network}",
108 |     journal = {Bioinformatics},
109 |     year = {2022},
110 |     month = {10}
111 | }
112 | 
--------------------------------------------------------------------------------
/screening/COVIDVS-3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "id": "95a57d71",
6 |    "metadata": {},
7 |    "source": [
8 |     "# Virtual Screening for anti-SARS-CoV-2 drugs by COVIDVS"
9 |    ]
10 |   },
11 |   {
12 |    "cell_type": "markdown",
13 |    "id": "4c2a3472",
14 |    "metadata": {},
15 |    "source": [
16 |     "COVIDVS models are Chemprop models trained with anti-beta-coronavirus actives/inactives collected from published papers and fine-tuned with anti-SARS-CoV-2 actives/inactives."
17 |    ]
18 |   },
19 |   {
20 |    "cell_type": "markdown",
21 |    "id": "ebc1985d",
22 |    "metadata": {},
23 |    "source": [
24 |     "## Installation"
25 |    ]
26 |   },
27 |   {
28 |    "cell_type": "markdown",
29 |    "id": "4ee9c28c",
30 |    "metadata": {},
31 |    "source": [
32 |     "Requirements:\n",
33 |     "conda install -c conda-forge rdkit\n",
34 |     "pip install git+https://github.com/bp-kelley/descriptastorus\n",
35 |     "pip install chemprop\n",
36 |     "model: download https://github.com/pkuwangsw/COVIDVS"
37 |    ]
38 |   },
39 |   {
40 |    "cell_type": "markdown",
41 |    "id": "3e00b470",
42 |    "metadata": {},
43 |    "source": [
44 |     "## Dataset"
45 |    ]
46 |   },
47 |   {
48 |    "cell_type": "markdown",
49 |    "id": "f4231a29",
50 |    "metadata": {},
51 |    "source": [
52 |     "traindata.csv: A set of inhibitors against HCoV-OC43, SARS-CoV and MERS-CoV collected from the literature. All the inhibitors were identified by screening libraries including FDA-approved drugs and pharmacologically active compounds. This primary training dataset (Training Set 1) contains 90 positive and 1862 negative samples."
53 |    ]
54 |   },
55 |   {
56 |    "cell_type": "markdown",
57 |    "id": "8b1e47f5",
58 |    "metadata": {},
59 |    "source": [
60 |     "testset.csv: This dataset (Test Set 1) was derived from Fine-tuning Set 1 by removing molecules repeated in Training Set 1."
61 |    ]
62 |   },
63 |   {
64 |    "cell_type": "markdown",
65 |    "id": "def004af",
66 |    "metadata": {},
67 |    "source": [
68 |     "finetunev1.csv: This dataset (Fine-tuning Set 1) contains 154 samples collected from the literature, including 70 positives and 84 negatives. The activities of these molecules against SARS-CoV-2 have been experimentally tested."
69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "b81c3941", 74 | "metadata": {}, 75 | "source": [ 76 | "## Pretraining" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "fd6fa29f", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "python train.py --gpu 0 --data_path ./dataset/traindata.csv --features_path ./dataset/traindata-feat.npy --no_features_scaling --save_dir covidvs1/ --dataset_type classification --split_sizes 0.9 0.1 0.0 --num_folds 20 --config_path hyperopt_it20.json " 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "da60e7de", 92 | "metadata": {}, 93 | "source": [ 94 | "## Fine-tuning" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "7fb02ab2", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "python finetune.py --gpu 0 --data_path ../data/finetunev1.csv --features_path ./dataset/finetunev1-feat.npy --save_dir covidvs2/ --checkpoint_path covidvs1/fold_0/model_0/model.pt --split_sizes 0.9 0.1 0.0 --config_path hyperopt_it20.json --dataset_type classification --init_lr 1e-4 --batch_size 20 --epochs 30" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "6a3f6b8f", 110 | "metadata": {}, 111 | "source": [ 112 | "## Prediction" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "50824d76", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "python predict.py --gpu 0 --test_path ./dataset/dataset.csv --features_path ./dataset/launched-feat.npy --preds_path preds_covidvs1_launched.csv --checkpoint_dir covidvs1/ --use_compound_names" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "0ceaff0a", 128 | "metadata": {}, 129 | "source": [ 130 | "## Acknowledgement" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "cbceee2b", 136 | "metadata": {}, 137 | "source": [ 138 | "This project incorporates code from the following repo:\n", 139 | " https://github.com/pkuwangsw/COVIDVS" 140 | ] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3 (ipykernel)", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.8.8" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 5 164 | } 165 | -------------------------------------------------------------------------------- /screening/Dataset/testset.csv: -------------------------------------------------------------------------------- 1 | name,smiles,isactive 2 | Salinomycin sodium,CCC(C(=O)O)C1CCC(C)C(C(C)C(O)C(C)C(=O)C(CC)C2OC3(C=CC(O)C4(CCC(C)(C5CCC(O)(CC)C(C)O5)O4)O3)C(C)CC2C)O1,1 3 | LDK378,Cc1cc(Nc2ncc(Cl)c(Nc3ccccc3S(=O)(=O)C(C)C)n2)c(OC(C)C)cc1C1CCNCC1,1 4 | Isoosajin,CC1(C)C=Cc2c(c3c(c4c(=O)c(-c5ccc(O)cc5)coc24)OC(C)(C)CC3)O1,1 5 | Osimertinib mesylate,C=CC(=O)Nc1cc(Nc2nccc(-c3cn(C)c4ccccc34)n2)c(OC)cc1N(C)CCN(C)C,1 6 | Bazedoxifene,Cc1c(-c2ccc(O)cc2)n(Cc2ccc(OCCN3CCCCCC3)cc2)c2ccc(O)cc12,1 7 | Lusutrombopag,CCCCCCOC(C)c1cccc(-c2csc(NC(=O)c3cc(Cl)c(C=C(C)C(=O)O)c(Cl)c3)n2)c1OC,1 8 | Osajin,CC(C)=CCc1c2c(c3occ(-c4ccc(O)cc4)c(=O)c3c1O)C=CC(C)(C)O2,1 9 | Dronedarone HCl,CCCCc1oc2ccc(NS(C)(=O)=O)cc2c1C(=O)c1ccc(OCCCN(CCCC)CCCC)cc1,1 10 | 
Ciclesonide,CC(C)C(=O)OCC(=O)C12OC(C3CCCCC3)OC1CC1C3CCC4=CC(=O)C=CC4(C)C3C(O)CC12C,1 11 | Isopomiferin,CC1(C)C=Cc2c(c3c(c4c(=O)c(-c5ccc(O)c(O)c5)coc24)OC(C)(C)CC3)O1,1 12 | Anidulafungin,CCCCCOc1ccc(-c2ccc(-c3ccc(C(=O)NC4CC(O)C(O)NC(=O)C5C(O)C(C)CN5C(=O)C(C(C)O)NC(=O)C(C(O)C(O)c5ccc(O)cc5)NC(=O)C5CC(O)CN5C(=O)C(C(C)O)NC4=O)cc3)cc2)cc1,1 13 | Ivacaftor,CC(C)(C)c1cc(C(C)(C)C)c(NC(=O)c2c[nH]c3ccccc3c2=O)cc1O,1 14 | Droloxifene,CCC(=C(c1ccc(OCCN(C)C)cc1)c1cccc(O)c1)c1ccccc1,1 15 | Abemaciclib,CCN1CCN(Cc2ccc(Nc3ncc(F)c(-c4cc(F)c5nc(C)n(C(C)C)c5c4)n3)nc2)CC1,1 16 | Gilteritinib,CCc1nc(C(N)=O)c(Nc2ccc(N3CCC(N4CCN(C)CC4)CC3)c(OC)c2)nc1NC1CCOCC1,1 17 | Ebastine,CC(C)(C)c1ccc(C(=O)CCCN2CCC(OC(c3ccccc3)c3ccccc3)CC2)cc1,1 18 | Mequitazine,c1ccc2c(c1)Sc1ccccc1N2CC1CN2CCC1CC2,1 19 | Eltrombopag,Cc1ccc(-n2[nH]c(C)c(N=Nc3cccc(-c4cccc(C(=O)O)c4)c3O)c2=O)cc1C,1 20 | Atazanavir,COC(=O)NC(C(=O)NC(Cc1ccccc1)C(O)CN(Cc1ccc(-c2ccccn2)cc1)NC(=O)C(NC(=O)OC)C(C)(C)C)C(C)(C)C,1 21 | Benztropine Mesylate,CN1C2CCC1CC(OC(c1ccccc1)c1ccccc1)C2,1 22 | Terconazole,CC(C)N1CCN(c2ccc(OCC3COC(Cn4cncn4)(c4ccc(Cl)cc4Cl)O3)cc2)CC1,1 23 | Oxprenolol hydrochloride,C=CCOc1ccccc1OCC(O)CNC(C)C,1 24 | Alprostadil,CCCCCC(O)C=CC1C(O)CC(=O)C1CCCCCCC(=O)O,1 25 | Dolutegravir,CC1CCOC2Cn3cc(C(=O)NCc4ccc(F)cc4F)c(=O)c(O)c3C(=O)N12,1 26 | Opipramol dihydrochloride,OCCN1CCN(CCCN2c3ccccc3C=Cc3ccccc32)CC1,1 27 | Arbidol,CCOC(=O)c1c(CSc2ccccc2)n(C)c2cc(Br)c(O)c(CN(C)C)c12,1 28 | Ritonavir,CC(C)c1nc(CN(C)C(=O)NC(C(=O)NC(Cc2ccccc2)CC(O)C(Cc2ccccc2)NC(=O)OCc2cncs2)C(C)C)cs1,1 29 | Nelfinavir,Cc1c(O)cccc1C(=O)NC(CSc1ccccc1)C(O)CN1CC2CCCCC2CC1C(=O)NC(C)(C)C,1 30 | Saquinavir,CC(C)(C)NC(=O)C1CC2CCCCC2CN1CC(O)C(Cc1ccccc1)NC(=O)C(CC(N)=O)NC(=O)c1ccc2ccccc2n1,1 31 | Tipranavir,CCCC1(CCc2ccccc2)CC(O)=C(C(CC)c2cccc(NS(=O)(=O)c3ccc(C(F)(F)F)cn3)c2)C(=O)O1,1 32 | Amprenavir,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(=O)c1ccc(N)cc1,1 33 | Darunavir,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S(=O)(=O)c1ccc(N)cc1,1 34 | Indinavir,CC(C)(C)NC(=O)C1CN(Cc2cccnc2)CCN1CC(O)CC(Cc1ccccc1)C(=O)NC1c2ccccc2CC1O,1 35 | Camostat,CN(C)C(=O)COC(=O)Cc1ccc(OC(=O)c2ccc(N=C(N)N)cc2)cc1,0 36 | Favipiravir,NC(=O)c1nc(F)c[nH]c1=O,0 37 | Acetyl spiramycin,COC1C(OC(C)=O)CC(=O)OC(C)CC=CC=CC(OC2CCC(N(C)C)C(C)O2)C(C)CC(CC=O)C1OC1OC(C)C(OC2CC(C)(O)C(O)C(C)O2)C(N(C)C)C1O,0 38 | Alfadolone,CC12CC(=O)C3C(CCC4CC(O)CCC43C)C1CCC2C(=O)CO,0 39 | Ambrisentan,COC(c1ccccc1)(c1ccccc1)C(Oc1nc(C)cc(C)n1)C(=O)O,0 40 | Artenimol,CC1CCC2C(C)C(O)OC3OC4(C)CCC1C32OO4,0 41 | Benoxinate hydrochloride,CCCCOc1cc(C(=O)OCCN(CC)CC)ccc1N,0 42 | Benzathine,c1ccc(CNCCNCc2ccccc2)cc1,0 43 | Bisbentiamine,CC(=C(CCOC(=O)c1ccccc1)SSC(CCOC(=O)c1ccccc1)=C(C)N(C=O)Cc1cnc(C)nc1N)N(C=O)Cc1cnc(C)nc1N,0 44 | Budralazine,CC(C)=CC(C)=NNc1nncc2ccccc12,0 45 | Candesartan,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1,0 46 | Cisatracurium besylate,COc1ccc(CC2c3cc(OC)c(OC)cc3CC[N+]2(C)CCC(=O)OCCCCCOC(=O)CC[N+]2(C)CCc3cc(OC)c(OC)cc3C2Cc2ccc(OC)c(OC)c2)cc1OC,0 47 | Cyclofenil,CC(=O)Oc1ccc(C(=C2CCCCC2)c2ccc(OC(C)=O)cc2)cc1,0 48 | Dipivefrin hydrochloride,CNCC(O)c1ccc(OC(=O)C(C)(C)C)c(OC(=O)C(C)(C)C)c1,0 49 | Eperisone HCl,CCc1ccc(C(=O)C(C)CN2CCCCC2)cc1,0 50 | Fenoldopam,Oc1ccc(C2CNCCc3c2cc(O)c(O)c3Cl)cc1,0 51 | Formoterol,COc1ccc(CC(C)NCC(O)c2ccc(O)c(NC=O)c2)cc1,0 52 | Fosinopril,CCC(=O)OC(OP(=O)(CCCCc1ccccc1)CC(=O)N1CC(C2CCCCC2)CC1C(=O)O)C(C)C,0 53 | Fursultiamine hydrochloride,CC(=C(CCO)SSCC1CCCO1)N(C=O)Cc1cnc(C)nc1N,0 54 | Ibudilast,CC(C)C(=O)c1c(C(C)C)nn2ccccc12,0 55 | Meprylcaine hydrochloride,CCCNC(C)(C)COC(=O)c1ccccc1,0 
56 | Meptazinol,CCC1(c2cccc(O)c2)CCCCN(C)C1,0 57 | Mirtazapine,CN1CCN2c3ncccc3Cc3ccccc3C2C1,0 58 | Misoprostol,CCCCC(C)(O)CC=CC1C(O)CC(=O)C1CCCCCCC(=O)OC,0 59 | Nabumetone,COc1ccc2cc(CCC(C)=O)ccc2c1,0 60 | Nalmefene hydrochloride,C=C1CCC2(O)C3Cc4ccc(O)c5c4C2(CCN3CC2CC2)C1O5,0 61 | Olanzapine,Cc1cc2c(s1)Nc1ccccc1N=C2N1CCN(C)CC1,0 62 | Olmesartan,CCCc1nc(C(C)(C)O)c(C(=O)O)n1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1,0 63 | Ondansetron hydrochloride,Cc1nccn1CC1CCc2c(c3ccccc3n2C)C1=O,0 64 | Oxymetholone,CC12CC(=CO)C(=O)CC1CCC1C2CCC2(C)C1CCC2(C)O,0 65 | Oxyphenisatin,O=C1Nc2ccccc2C1(c1ccc(O)cc1)c1ccc(O)cc1,0 66 | Prednicarbate,CCOC(=O)OC1(C(=O)COC(=O)CC)CCC2C3CCC4=CC(=O)C=CC4(C)C3C(O)CC21C,0 67 | Sparfloxacin,CC1CN(c2c(F)c(N)c3c(=O)c(C(=O)O)cn(C4CC4)c3c2F)CC(C)N1,0 68 | Tolterodine tartrate,Cc1ccc(O)c(C(CCN(C(C)C)C(C)C)c2ccccc2)c1,0 69 | Vonoprazan,CNCc1cc(-c2ccccc2F)n(S(=O)(=O)c2cccnc2)c1,0 70 | Zoledronic acid hydrate,O=P(O)(O)C(O)(Cn1ccnc1)P(=O)(O)O,0 71 | Merbromin,O=C1OC2(c3cc(Br)c([O-])cc3Oc3c2cc(Br)c([O-])c3[Hg])c2ccccc21,0 72 | Methandrostenolone,CC12C=CC(=O)C=C1CCC1C2CCC2(C)C1CCC2(C)O,0 73 | -------------------------------------------------------------------------------- /screening/README.md: -------------------------------------------------------------------------------- 1 | 2 |

3 |

A deep learning-based pipeline for screening SARS-CoV-2 3CL protease inhibitors.

4 | 5 | 
6 | ## Process
7 | 
8 | ![图片](https://user-images.githubusercontent.com/62410732/176149139-b96f2edd-b66b-4007-a0f4-73259b319cb6.png)
9 | 
10 | Coronavirus disease 2019 (COVID-19) is a highly infectious disease caused by severe acute respiratory syndrome coronavirus-2 (SARS-CoV-2). It is urgent to find potential antiviral drugs against SARS-CoV-2 in a short time. Deep learning-based virtual screening is one of the approaches that can rapidly search large molecular libraries. Here, the SARS-CoV-2 3C-like protease (SARS-CoV-2 3CLpro) was chosen as the target. As shown in the figure below, the utility of D-GCAN is evaluated by comparing the screening results on the GDB-13 and S-GDB13 databases. The process was carried out with the help of a transfer learning method (Wang et al., 2021), DeepPurpose (Huang et al., 2020), and ADMETLab2.0 (Xiong et al., 2021).
11 | 
12 | The databases were first screened using the transfer learning method (COVIDVS) proposed by Wang et al. (2021), which was reported for screening inhibitors against SARS-CoV-2. The model was trained on a dataset of inhibitors against HCoV-OC43, SARS-CoV and MERS-CoV. All of these viruses, as well as SARS-CoV-2, belong to the β-coronaviruses and show high consistency in their essential functional proteins (Wu et al., 2020; Shen et al.; Pillaiyar et al., 2020). The trained model was then fine-tuned, via the transfer learning approach, on a dataset of drugs against SARS-CoV-2. In this way, 107 million drug-like molecules passed the first screening step. Next, drug-target interaction (DTI) prediction was carried out with DeepPurpose (Huang et al., 2020), which provides a pretrained model for predicting interactions between drugs and the SARS-CoV-2 3CLpro target. The binding score was evaluated by the dissociation equilibrium constant (Kd). After this step, 17 thousand molecules with high affinity were obtained. Finally, ADMET properties, which are widely used for screening SARS-CoV-2 inhibitors (Gajjar et al., 2021; Roy et al., 2021; Dhameliya et al., 2022), were calculated with ADMETLab2.0 (Xiong et al., 2021), and 65 candidates with good properties were selected.
13 | 
14 | ## COVIDVS
15 | 
16 | COVIDVS models are Chemprop models trained with anti-beta-coronavirus actives/inactives collected from published papers and fine-tuned with anti-SARS-CoV-2 actives/inactives.
17 | 
18 | 
19 | 
20 | ## DeepPurpose
21 | 
22 | DeepPurpose provides a pretrained model for predicting the interaction between a target (the SARS-CoV-2 3CL protease) and a list of repurposing drugs from a curated library of 81 antiviral drugs. The binding score is the predicted Kd value, aggregated from five models pretrained on the BindingDB dataset.
23 | 
24 | 
25 | 
26 | ## ADMETLab2.0
27 | 
28 | Undesirable pharmacokinetics and toxicity of candidate compounds are the main reasons for failure in drug development, and it is widely recognized that absorption, distribution, metabolism, excretion and toxicity (ADMET) of chemicals should be evaluated as early as possible. ADMETlab 2.0 is an enhanced version of the widely used [ADMETlab](http://admet.scbdd.com/) for systematic evaluation of ADMET properties, as well as some physicochemical properties and medicinal chemistry friendliness. With significant updates to functional modules, predictive models, explanations, and the user interface, ADMETlab 2.0 has greater capacity to assist medicinal chemists in accelerating the drug research and development process.
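Conceptually, the three stages form a funnel: COVIDVS activity scores, then a Kd-based DTI filter, then ADMET filtering. A minimal sketch of how the first two stages chain together (the file names, column names, and both cutoffs are hypothetical illustrations, not values from the study):

```python
import pandas as pd

# Hypothetical inputs: the COVIDVS prediction table (see COVIDVS-3.ipynb) and
# the DeepPurpose Kd predictions (see DTI.ipynb); column names are assumptions.
covidvs = pd.read_csv("preds_covidvs1_launched.csv")  # columns: smiles, score
dti = pd.read_csv("dti_predictions.csv")              # columns: smiles, kd

# Stage 1: keep molecules that COVIDVS predicts as active (cutoff assumed).
active = set(covidvs.loc[covidvs["score"] > 0.5, "smiles"])

# Stage 2: a lower Kd means stronger binding, so keep the low-Kd molecules
# (cutoff assumed).
strong_binders = set(dti.loc[dti["kd"] < 100.0, "smiles"])

# Survivors would go on to ADMETLab2.0 for property-based filtering (stage 3).
candidates = active & strong_binders
print(len(candidates), "candidates for ADMET evaluation")
```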
29 | 30 | 31 | 32 | ## Acknowledgement 33 | 34 | Dhameliya,T.M. *et al.* (2022) Systematic virtual screening in search of SARS CoV-2 inhibitors against spike glycoprotein: pharmacophore screening, molecular docking, ADMET analysis and MD simulations. *Mol Divers*. 35 | 36 | Gajjar,N.D. *et al.* (2021) In search of RdRp and Mpro inhibitors against SARS CoV-2: Molecular docking, molecular dynamic simulations and ADMET analysis. *Journal of Molecular Structure*, **1239**, 130488. 37 | 38 | Huang,K. *et al.* (2020) DeepPurpose: a deep learning library for drug–target interaction prediction. *Bioinformatics*, **36**, 5545–5547. 39 | 40 | Pillaiyar,T. *et al.* (2020) Recent discovery and development of inhibitors targeting coronaviruses. *Drug Discovery Today*, **25**, 668–688. 41 | 42 | Roy,R. *et al.* (2021) Finding potent inhibitors against SARS-CoV-2 main protease through virtual screening, ADMET, and molecular dynamics simulation studies. *Journal of Biomolecular Structure and Dynamics*, **0**, 1–13. 43 | 44 | Shen,L. *et al.* High-Throughput Screening and Identification of Potent Broad-Spectrum Inhibitors of Coronaviruses. *Journal of Virology*, **93**, e00023-19. 45 | 46 | Wang,S. *et al.* (2021) A transferable deep learning approach to fast screen potential antiviral drugs against SARS-CoV-2. *Briefings in Bioinformatics*. 47 | 48 | Wu,F. *et al.* (2020) A new coronavirus associated with human respiratory disease in China. *Nature*, **579**, 265–269. 49 | 50 | Xiong,G. *et al.* (2021) ADMETlab 2.0: an integrated online platform for accurate and comprehensive predictions of ADMET properties. *Nucleic Acids Research*, **49**, W5–W14. 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /Discussion/preprocess.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import numpy as np 4 | 5 | from rdkit import Chem 6 | 7 | import torch 8 | atom_dict = defaultdict(lambda: len(atom_dict)) 9 | bond_dict = defaultdict(lambda: len(bond_dict)) 10 | fingerprint_dict = defaultdict(lambda: len(fingerprint_dict)) 11 | edge_dict = defaultdict(lambda: len(edge_dict)) 12 | radius=1 13 | if torch.cuda.is_available(): 14 | device = torch.device('cuda') 15 | print('The code uses a GPU!') 16 | else: 17 | device = torch.device('cpu') 18 | print('The code uses a CPU...') 19 | def create_atoms(mol, atom_dict): 20 | """Transform the atom types in a molecule (e.g., H, C, and O) 21 | into the indices (e.g., H=0, C=1, and O=2). 22 | Note that each atom index considers the aromaticity. 23 | """ 24 | atoms = [a.GetSymbol() for a in mol.GetAtoms()] 25 | for a in mol.GetAromaticAtoms(): 26 | i = a.GetIdx() 27 | atoms[i] = (atoms[i], 'aromatic') 28 | atoms = [atom_dict[a] for a in atoms] 29 | return np.array(atoms) 30 | 31 | 32 | def create_ijbonddict(mol, bond_dict): 33 | """Create a dictionary, in which each key is a node ID 34 | and each value is the tuples of its neighboring node 35 | and chemical bond (e.g., single and double) IDs. 
36 | 
37 |     """
38 |     i_jbond_dict = defaultdict(lambda: [])
39 |     for b in mol.GetBonds():
40 |         i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
41 |         bond = bond_dict[str(b.GetBondType())]
42 |         i_jbond_dict[i].append((j, bond))
43 |         i_jbond_dict[j].append((i, bond))
44 |     return i_jbond_dict
45 | 
46 | 
47 | def extract_fingerprints(radius, atoms, i_jbond_dict,
48 |                          fingerprint_dict, edge_dict):
49 |     """Extract the fingerprints from a molecular graph
50 |     based on the Weisfeiler-Lehman algorithm.
51 | 
52 |     """
53 | 
54 |     if (len(atoms) == 1) or (radius == 0):
55 |         nodes = [fingerprint_dict[a] for a in atoms]
56 | 
57 |     else:
58 |         nodes = atoms
59 |         i_jedge_dict = i_jbond_dict
60 | 
61 |         for _ in range(radius):
62 | 
63 |             """Update each node ID considering its neighboring nodes and edges.
64 |             The updated node IDs are the fingerprint IDs.
65 |             """
66 |             nodes_ = []
67 |             for i, j_edge in i_jedge_dict.items():
68 |                 neighbors = [(nodes[j], edge) for j, edge in j_edge]
69 |                 fingerprint = (nodes[i], tuple(sorted(neighbors)))
70 |                 nodes_.append(fingerprint_dict[fingerprint])
71 | 
72 |             """Also update each edge ID considering
73 |             its two nodes on both sides.
74 |             """
75 |             i_jedge_dict_ = defaultdict(lambda: [])
76 |             for i, j_edge in i_jedge_dict.items():
77 |                 for j, edge in j_edge:
78 |                     both_side = tuple(sorted((nodes[i], nodes[j])))
79 |                     edge = edge_dict[(both_side, edge)]
80 |                     i_jedge_dict_[i].append((j, edge))
81 | 
82 |             nodes = nodes_
83 |             i_jedge_dict = i_jedge_dict_
84 | 
85 |     return np.array(nodes)
86 | 
87 | 
88 | def split_dataset(dataset, ratio):
89 |     """Shuffle and split a dataset."""
90 |     np.random.seed(1234)  # fix the seed for shuffling.
91 |     np.random.shuffle(dataset)
92 |     n = int(ratio * len(dataset))
93 |     return dataset[:n], dataset[n:]
94 | 
95 | 
96 | def create_dataset(filename, path, dataname):
97 |     dir_dataset = path + dataname
98 |     print(filename)
99 |     """Load a dataset."""
100 |     with open(dir_dataset + filename, 'r') as f:
101 |         smiles_property = f.readline().strip().split()
102 |         data_original = f.read().strip().split('\n')
103 | 
104 |     """Exclude data whose SMILES contains '.' (multi-fragment molecules)."""
105 |     data_original = [data for data in data_original
106 |                      if '.' not in data.split()[0]]
107 |     dataset = []
108 |     for data in data_original:
109 | 
110 |         smiles, property = data.strip().split()
111 | 
112 |         """Create each data with the above defined functions."""
113 |         mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
114 |         atoms = create_atoms(mol, atom_dict)
115 |         molecular_size = len(atoms)
116 |         i_jbond_dict = create_ijbonddict(mol, bond_dict)
117 |         fingerprints = extract_fingerprints(radius, atoms, i_jbond_dict,
118 |                                             fingerprint_dict, edge_dict)
119 |         adjacency = Chem.GetAdjacencyMatrix(mol)
120 | 
121 |         """Transform each of the above numpy data
122 |         into a pytorch tensor on a device (i.e., CPU or GPU).
123 |         """
124 |         fingerprints = torch.LongTensor(fingerprints).to(device)
125 |         adjacency = torch.FloatTensor(adjacency).to(device)
126 |         property = torch.FloatTensor([int(property)]).to(device)
127 | 
128 |         dataset.append((smiles, fingerprints, adjacency, molecular_size, property))
129 | 
130 |     return dataset
131 | 
132 | def create_datasets(path='', dataname=''):
133 |     """Build the train/dev/test datasets and return them with the fingerprint count."""
134 |     dataset_train = create_dataset('data_train.txt', path, dataname)
135 |     dataset_train, dataset_dev = split_dataset(dataset_train, 0.9)
136 |     dataset_test = create_dataset('data_test.txt', path, dataname)
137 |     N_fingerprints = len(fingerprint_dict)
138 |     return dataset_train, dataset_dev, dataset_test, N_fingerprints
139 | 
--------------------------------------------------------------------------------
/Test/test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "id": "65b363bc",
7 |    "metadata": {},
8 |    "outputs": [],
9 |    "source": [
10 |     "import predict"
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "code",
15 |    "execution_count": 3,
16 |    "id": "4cc4f418",
17 |    "metadata": {},
18 |    "outputs": [
19 |     {
20 |      "name": "stdout",
21 |      "output_type": "stream",
22 |      "text": [
23 |       "The code uses a GPU!\n",
24 |       "../dataset/data_test.txt\n",
25 |       "bacc_dev: 0.5119539230602043\n",
26 |       "pre_dev: 0.5080213903743316\n",
27 |       "rec_dev: 0.8837209302325582\n",
28 |       "f1_dev: 0.6451612903225807\n",
29 |       "mcc_dev: 0.03575604067764825\n",
30 |       "sp_dev: 0.14018691588785046\n",
31 |       "q__dev: 0.5454545454545454\n",
32 |       "acc_dev: 0.5128205128205128\n"
33 |      ]
34 |     }
35 |    ],
36 |    "source": [
37 |     "test1 = predict.predict('../dataset/data_test.txt',property=True)#Drugs from FDA"
38 |    ]
39 |   },
40 |   {
41 |    "cell_type": "code",
42 |    "execution_count": 2,
43 |    "id": "8f0eba5e",
44 |    "metadata": {},
45 |    "outputs": [
46 |     {
47 |      "name": "stdout",
48 |      "output_type": "stream",
49 |      "text": [
50 |       "The code uses a GPU!\n",
51 |       "../dataset/world_wide.txt\n",
52 |       "bacc_dev: 0.46604215456674475\n",
53 |       "pre_dev: 0.47987043035631655\n",
54 |       "rec_dev: 0.8095238095238095\n",
55 |       "f1_dev: 0.6025566531086578\n",
56 |       "mcc_dev: -0.09345868862125822\n",
57 |       "sp_dev: 0.12256049960967993\n",
58 |       "q__dev: 0.3915211970074813\n",
59 |       "acc_dev: 0.46604215456674475\n"
60 |      ]
61 |     }
62 |    ],
63 |    "source": [
64 |     "test2 = predict.predict('../dataset/world_wide.txt',property=True)#Drugs from non-US"
65 |    ]
66 |   },
67 |   {
68 |    "cell_type": "code",
69 |    "execution_count": 4,
70 |    "id": "75c3a192",
71 |    "metadata": {},
72 |    "outputs": [
73 |     {
74 |      "name": "stdout",
75 |      "output_type": "stream",
76 |      "text": [
77 |       "The code uses a GPU!\n",
78 |       "../dataset/beyondRo5.txt\n",
79 |       "1\n",
80 |       "1\n",
81 |       "1\n",
82 |       "1\n",
83 |       "1\n",
84 |       "1\n",
85 |       "1\n",
86 |       "1\n",
87 |       "1\n",
88 |       "1\n",
89 |       "1\n",
90 |       "1\n",
91 |       "1\n",
92 |       "1\n",
93 |       "1\n",
94 |       "1\n",
95 |       "1\n",
96 |       "1\n",
97 |       "1\n",
98 |       "1\n",
99 |       "1\n",
100 |       "1\n",
101 |       "1\n",
102 |       "1\n",
103 |       "1\n",
104 |       "1\n",
105 |       "1\n",
106 |       "1\n",
107 |       "1\n",
108 |       "1\n",
109 |       "1\n",
110 |       "1\n",
111 |       "1\n",
112 |       "1\n",
113 |       "1\n",
114 |       "1\n",
115 |       "1\n",
116 |       "1\n",
117 |       "1\n",
118 |       "1\n",
119 |       "1\n",
120 |       "1\n",
121 |       "1\n",
122 |       "1\n",
123 |       "1\n",
124 |       "1\n",
125 |       "1\n",
126 |       "1\n",
127 |       "1\n",
128 |       "1\n",
129 |       "1\n",
130 |       "1\n",
131 |       "1\n",
132 |       "1\n",
133 |       "1\n",
134 |       "1\n",
135 |       "1\n",
136 |       "1\n",
137 |       "1\n",
138 |       "1\n",
139 |       "1\n",
140 |       "1\n",
141 |       "1\n",
142 |       "1\n",
143 |       "1\n",
144 |       "1\n",
145 |       "1\n",
146 |       "1\n",
147 |       "1\n",
148 |       "1\n",
149 |       "1\n",
150 |       "1\n",
151 |       "1\n",
152 |       "1\n",
153 |       "1\n",
154 |       "1\n",
155 |       "1\n",
156 |       "1\n",
157 |       "1\n",
158 |       "1\n",
159 |       "1\n",
160 |       "1\n",
161 |       "1\n",
162 |       "1\n",
163 |       "1\n",
164 |       "1\n",
165 |       "1\n",
166 |       "1\n",
167 |       "1\n",
168 |       "1\n",
169 |       "1\n",
170 |       "1\n",
171 |       "1\n",
172 |       "1\n",
173 |       "1\n",
174 |       "1\n",
175 |       "1\n",
176 |       "1\n",
177 |       "1\n",
178 |       "1\n",
179 |       "1\n",
180 |       "1\n",
181 |       "1\n",
182 |       "1\n",
183 |       "1\n",
184 |       "1\n",
185 |       "1\n",
186 |       "1\n",
187 |       "1\n",
188 |       "1\n",
189 |       "1\n",
190 |       "1\n",
191 |       "1\n",
192 |       "1\n",
193 |       "1\n",
194 |       "1\n",
195 |       "1\n",
196 |       "1\n",
197 |       "1\n",
198 |       "1\n",
199 |       "1\n",
200 |       "1\n",
201 |       "1\n",
202 |       "1\n",
203 |       "1\n",
204 |       "1\n",
205 |       "1\n",
206 |       "1\n",
207 |       "1\n",
208 |       "1\n",
209 |       "1\n",
210 |       "1\n",
211 |       "1\n",
212 |       "1\n",
213 |       "1\n",
214 |       "1\n"
215 |      ]
216 |     }
217 |    ],
218 |    "source": [
219 |     "test3 = predict.predict('../dataset/beyondRo5.txt',property=False)#Drugs beyond Ro5"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": null,
225 |    "id": "147a96e5",
226 |    "metadata": {},
227 |    "outputs": [],
228 |    "source": []
229 |   }
230 |  ],
231 |  "metadata": {
232 |   "kernelspec": {
233 |    "display_name": "Python 3",
234 |    "language": "python",
235 |    "name": "python3"
236 |   },
237 |   "language_info": {
238 |    "codemirror_mode": {
239 |     "name": "ipython",
240 |     "version": 3
241 |    },
242 |    "file_extension": ".py",
243 |    "mimetype": "text/x-python",
244 |    "name": "python",
245 |    "nbconvert_exporter": "python",
246 |    "pygments_lexer": "ipython3",
247 |    "version": "3.8.8"
248 |   }
249 |  },
250 |  "nbformat": 4,
251 |  "nbformat_minor": 5
252 | }
253 | 
--------------------------------------------------------------------------------
/DGCAN/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Apr 27 20:09:31 2022
4 | 
5 | @author: Jinyu-Sun
6 | """
7 | 
8 | import timeit
9 | import sys
10 | import numpy as np
11 | import math
12 | import torch
13 | import torch.nn as nn
14 | import torch.nn.functional as F
15 | import torch.optim as optim
16 | import pickle
17 | from sklearn.metrics import roc_auc_score, roc_curve, auc
18 | from sklearn.metrics import confusion_matrix
19 | import preprocess as pp
20 | import pandas as pd
21 | import matplotlib.pyplot as plt
22 | from DGCAN import MolecularGraphNeuralNetwork, Trainer, Tester
23 | 
24 | def metrics(cnf_matrix):
25 |     '''Evaluation metrics computed from the confusion matrix.'''
26 |     tn = cnf_matrix[0, 0]
27 |     tp = cnf_matrix[1, 1]
28 |     fn = cnf_matrix[1, 0]
29 |     fp = cnf_matrix[0, 1]
30 | 
31 |     bacc = ((tp / (tp + fn)) + (tn / (tn + fp))) / 2  # balanced accuracy
32 |     pre = tp / (tp + fp)  # precision (q+)
33 |     rec = tp / (tp + fn)  # recall (sensitivity)
34 |     sp = tn / (tn + fp)  # specificity
35 |     q_ = tn / (tn + fn)  # negative predictive value
36 |     f1 = 2 * pre * rec / (pre + rec)  # F1 score
37 |     mcc = ((tp * tn) - (fp * fn)) / math.sqrt(
38 |         (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))  # Matthews correlation coefficient
39 |     acc = (tp + tn) / (tp + fp + fn + tn)  # accuracy
40 | 
41 |     print('bacc:', bacc)
42 |     print('pre:', pre)
43 |     print('rec:', rec)
44 |     print('f1:', f1)
45 |     print('mcc:', mcc)
46 |     print('sp:', sp)
47 |     print('q_:', q_)
48 |     print('acc:', acc)
49 | 
50 | 
51 | def train(test_name, radius, dim, layer_hidden, layer_output, dropout, batch_train,
52 |           batch_test, lr, lr_decay, decay_interval, iteration, N, dataset_train):
53 |     '''
54 | 
55 |     Parameters
56 |     ----------
57 | 
data_test='../dataset/data_test.txt', #test set 58 | radius = 1, #hops of radius subgraph: 1, 2 59 | dim = 64, #dimension of graph convolution layers 60 | layer_hidden = 4, #Number of graph convolution layers 61 | layer_output = 10, #Number of dense layers 62 | dropout = 0.45, #drop out rate :0-1 63 | batch_train = 8, # batch of training set 64 | batch_test = 8, #batch of test set 65 | lr =3e-4, #learning rate: 1e-5,1e-4,3e-4, 5e-4, 1e-3, 3e-3,5e-3 66 | lr_decay = 0.85, #Learning rate decay:0.5, 0.75, 0.85, 0.9 67 | decay_interval = 25,#Number of iterations for learning rate decay:10,25,30,50 68 | iteration = 140, #Number of iterations 69 | N = 5000, #length of embedding: 2000,3000,5000,7000 70 | dataset_train='../dataset/data_train.txt') #training set 71 | 72 | Returns 73 | ------- 74 | res_test : results 75 | Predicting results. 76 | 77 | ''' 78 | dataset_test = test_name 79 | (radius, dim, layer_hidden, layer_output, 80 | batch_train, batch_test, decay_interval, 81 | iteration, dropout) = map(int, [radius, dim, layer_hidden, layer_output, 82 | batch_train, batch_test, 83 | decay_interval, iteration, dropout]) 84 | lr, lr_decay = map(float, [lr, lr_decay]) 85 | if torch.cuda.is_available(): 86 | device = torch.device('cuda') 87 | print('The code uses a GPU!') 88 | else: 89 | device = torch.device('cpu') 90 | print('The code uses a CPU...') 91 | 92 | lr, lr_decay = map(float, [lr, lr_decay]) 93 | 94 | print('-' * 100) 95 | print('Just a moment......') 96 | print('-' * 100) 97 | path = '' 98 | dataname = '' 99 | 100 | dataset_train= pp.create_dataset(dataset_train,path,dataname) 101 | #dataset_train,dataset_test = pp.split_dataset(dataset_train,0.9) 102 | #dataset_test= pp.create_dataset(dataset_dev,path,dataname) 103 | dataset_test= pp.create_dataset(dataset_test,path,dataname) 104 | np.random.seed(0) 105 | np.random.shuffle(dataset_train) 106 | print('The preprocess has finished!') 107 | print('-' * 100) 108 | 109 | print('Creating a model.') 110 | torch.manual_seed(0) 111 | model = MolecularGraphNeuralNetwork( 112 | N, dim, layer_hidden, layer_output, dropout).to(device) 113 | trainer = Trainer(model,lr,batch_train) 114 | tester = Tester(model,batch_test) 115 | print('# of model parameters:', 116 | sum([np.prod(p.size()) for p in model.parameters()])) 117 | print('-' * 100) 118 | file_result = path + '../DGCAN/results/AUC' + '.txt' 119 | # file_result = '../output/result--' + setting + '.txt' 120 | result = 'Epoch\tTime(sec)\tLoss_train\tLoss_test\tAUC_train\tAUC_test' 121 | file_test_result = path + 'test_prediction' + '.txt' 122 | file_predictions = path + 'train_prediction' + '.txt' 123 | file_model = '../DGCAN/model/model' + '.pth' 124 | with open(file_result, 'w') as f: 125 | f.write(result + '\n') 126 | 127 | print('Start training.') 128 | print('The result is saved in the output directory every epoch!') 129 | 130 | np.random.seed(0) 131 | 132 | start = timeit.default_timer() 133 | 134 | for epoch in range(iteration): 135 | epoch += 1 136 | if epoch % decay_interval == 0: 137 | trainer.optimizer.param_groups[0]['lr'] *= lr_decay 138 | # [‘amsgrad’, ‘params’, ‘lr’, ‘betas’, ‘weight_decay’, ‘eps’] 139 | prediction_train, loss_train, train_res = trainer.train(dataset_train) 140 | prediction_test, loss_test, test_res = tester.test_classifier(dataset_test) 141 | 142 | time = timeit.default_timer() - start 143 | 144 | if epoch == 1: 145 | minutes = time * iteration / 60 146 | hours = int(minutes / 60) 147 | minutes = int(minutes - 60 * hours) 148 | print('The training will finish in 
about',
149 |               hours, 'hours', minutes, 'minutes.')
150 |         print('-' * 100)
151 |         print(result)
152 | 
153 |     result = '\t'.join(map(str, [epoch, time, loss_train, loss_test, prediction_train, prediction_test]))
154 |     tester.save_result(result, file_result)
155 |     tester.save_model(model, file_model)
156 |     print(result)
157 |     model.eval()
158 |     prediction_test, loss_test, test_res = tester.test_classifier(dataset_test)
159 |     res_test = test_res.T
160 | 
161 |     cnf_matrix = confusion_matrix(res_test[:, 0], res_test[:, 1])
162 |     fpr, tpr, thresholds = roc_curve(res_test[:, 0], res_test[:, 1])
163 |     AUC = auc(fpr, tpr)
164 |     print('auc:', AUC)
165 |     metrics(cnf_matrix)
166 |     return res_test
--------------------------------------------------------------------------------
/screening/DTI.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "id": "c40219cd",
6 |    "metadata": {},
7 |    "source": [
8 |     "# Drug Target Interaction Prediction by using DeepPurpose"
9 |    ]
10 |   },
11 |   {
12 |    "cell_type": "markdown",
13 |    "id": "23b43a6f",
14 |    "metadata": {},
15 |    "source": [
16 |     "DeepPurpose provides a convenient way to run DTI prediction, especially for the SARS-CoV-2 3CL protease."
17 |    ]
18 |   },
19 |   {
20 |    "cell_type": "markdown",
21 |    "id": "38ab44af",
22 |    "metadata": {},
23 |    "source": [
24 |     "## Installation"
25 |    ]
26 |   },
27 |   {
28 |    "cell_type": "code",
29 |    "execution_count": null,
30 |    "id": "1ffbb4b3",
31 |    "metadata": {},
32 |    "outputs": [],
33 |    "source": [
34 |     "pip\n",
35 |     "conda create -n DeepPurpose python=3.6\n",
36 |     "conda activate DeepPurpose\n",
37 |     "conda install -c conda-forge rdkit\n",
38 |     "conda install -c conda-forge notebook\n",
39 |     "pip install git+https://github.com/bp-kelley/descriptastorus \n",
40 |     "pip install DeepPurpose\n",
41 |     "\n",
42 |     "or Build from Source\n",
43 |     "\n",
44 |     "git clone https://github.com/kexinhuang12345/DeepPurpose.git ## Download code repository\n",
45 |     "cd DeepPurpose ## Change directory to DeepPurpose\n",
46 |     "conda env create -f environment.yml ## Build virtual environment with all packages installed using conda\n",
47 |     "conda activate DeepPurpose ## Activate conda environment (use \"source activate DeepPurpose\" for anaconda 4.4 or earlier) \n",
48 |     "jupyter notebook ## open the jupyter notebook with the conda env\n",
49 |     "\n",
50 |     "## run our code, e.g. click a file in the DEMO folder\n",
51 |     "... ...\n",
52 |     "\n",
53 |     "conda deactivate ## when done, exit conda environment "
54 |    ]
55 |   },
56 |   {
57 |    "cell_type": "markdown",
58 |    "id": "26e590fe",
59 |    "metadata": {},
60 |    "source": [
61 |     "## Run"
62 |    ]
63 |   },
64 |   {
65 |    "cell_type": "code",
66 |    "execution_count": null,
67 |    "id": "55207f2c",
68 |    "metadata": {},
69 |    "outputs": [],
70 |    "source": [
71 |     "import os\n",
72 |     "os.chdir('../')\n",
73 |     "from DeepPurpose.utils import *\n",
74 |     "from DeepPurpose.dataset import *\n",
75 |     "from DeepPurpose import DTI as models\n",
76 |     "X_drug, X_target, y = process_BindingDB(download_BindingDB(SAVE_PATH),\n",
77 |     "                                        y = 'Kd', \n",
78 |     "                                        binary = False, \n",
79 |     "                                        convert_to_log = True)\n",
80 |     "\n",
81 |     "# Type in the encoding names for drug/protein.\n",
82 |     "drug_encoding, target_encoding = 'MPNN', 'CNN'\n",
83 |     "\n",
84 |     "# Data processing, here we select cold protein split setup.\n",
85 |     "train, val, test = data_process(X_drug, X_target, y, \n",
86 |     "                                drug_encoding, target_encoding, \n",
87 |     "                                split_method='cold_protein', \n",
88 |     "                                frac=[0.7,0.1,0.2])\n",
89 |     "\n",
90 |     "# Generate new model using default parameters; also allow model tuning via input parameters.\n",
91 |     "config = generate_config(drug_encoding, target_encoding, transformer_n_layer_target = 8)\n",
92 |     "net = models.model_initialize(**config)\n",
93 |     "\n",
94 |     "# Train the new model.\n",
95 |     "# Detailed output including a tidy table storing validation loss, metrics, AUC curves figures and etc. are stored in the ./result folder.\n",
96 |     "net.train(train, val, test)\n",
97 |     "\n",
98 |     "# or simply load pretrained model from a model directory path or reproduced model name such as DeepDTA\n",
99 |     "net = models.model_pretrained(MODEL_PATH_DIR or MODEL_NAME)\n",
100 |     "\n",
101 |     "X_repurpose, drug_name, drug_cid = load_broad_repurposing_hub(SAVE_PATH)\n",
102 |     "target, target_name = load_SARS_CoV2_Protease_3CL()\n",
103 |     "\n",
104 |     "_ = models.virtual_screening(X_repurpose, target, net, drug_name, target_name)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "id": "bab0c86d",
110 |    "metadata": {},
111 |    "source": [
112 |     "## Results"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": null,
118 |    "id": "1136255f",
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "+-------+-----------+------------------------+---------------+\n",
123 |     "| Rank | Drug Name | Target Name | Binding Score |\n",
124 |     "+-------+-----------+------------------------+---------------+\n",
125 |     "| 1 | Drug 4565 | SARS-CoV2 3CL Protease | 8.96 |\n",
126 |     "| 2 | Drug 4570 | SARS-CoV2 3CL Protease | 12.42 |\n",
127 |     "| 3 | Drug 3690 | SARS-CoV2 3CL Protease | 12.86 |\n",
128 |     "| 4 | Drug 3068 | SARS-CoV2 3CL Protease | 13.36 |\n",
129 |     "| 5 | Drug 8387 | SARS-CoV2 3CL Protease | 13.47 |\n",
130 |     "| 6 | Drug 5176 | SARS-CoV2 3CL Protease | 14.47 |\n",
131 |     "| 7 | Drug 438 | SARS-CoV2 3CL Protease | 14.67 |\n",
132 |     "| 8 | Drug 4507 | SARS-CoV2 3CL Protease | 16.11 |\n",
133 |     "```\n",
134 |     "```\n",
135 |     "| 9978 | Drug 1377 | SARS-CoV2 3CL Protease | 460788.11 |\n",
136 |     "| 9979 | Drug 3768 | SARS-CoV2 3CL Protease | 479737.13 |\n",
137 |     "| 9980 | Drug 5106 | SARS-CoV2 3CL Protease | 485684.14 |\n",
138 |     "| 9981 | Drug 3765 | SARS-CoV2 3CL Protease | 505994.35 |\n",
139 |     "| 9982 | Drug 2207 | SARS-CoV2 3CL Protease | 510293.39 |\n",
140 |     "| 9983 | Drug 1161 | SARS-CoV2 3CL Protease | 525921.93 |\n",
141 |     "| 9984 | Drug 2477 | SARS-CoV2 3CL Protease | 533613.12 |\n",
142 |     "| 9985 | Drug 3320 | SARS-CoV2 
3CL Protease | 538902.46 |\n", 142 | "| 9986 | Drug 3783 | SARS-CoV2 3CL Protease | 542639.17 |\n", 143 | "| 9987 | Drug 4834 | SARS-CoV2 3CL Protease | 603510.00 |\n", 144 | "| 9988 | Drug 9653 | SARS-CoV2 3CL Protease | 611796.89 |\n", 145 | "| 9989 | Drug 6606 | SARS-CoV2 3CL Protease | 671138.31 |\n", 146 | "| 9990 | Drug 160 | SARS-CoV2 3CL Protease | 697775.04 |\n", 147 | "| 9991 | Drug 3851 | SARS-CoV2 3CL Protease | 792134.96 |\n", 148 | "| 9992 | Drug 5208 | SARS-CoV2 3CL Protease | 832708.75 |\n", 149 | "| 9993 | Drug 2786 | SARS-CoV2 3CL Protease | 905739.10 |\n", 150 | "| 9994 | Drug 6612 | SARS-CoV2 3CL Protease | 968825.66 |\n", 151 | "| 9995 | Drug 6609 | SARS-CoV2 3CL Protease | 1088788.87 |\n", 152 | "| 9996 | Drug 801 | SARS-CoV2 3CL Protease | 1186364.21 |\n", 153 | "| 9997 | Drug 3844 | SARS-CoV2 3CL Protease | 1199274.11 |\n", 154 | "| 9998 | Drug 3842 | SARS-CoV2 3CL Protease | 1559694.06 |\n", 155 | "| 9999 | Drug 4486 | SARS-CoV2 3CL Protease | 1619297.87 |\n", 156 | "| 10000 | Drug 800 | SARS-CoV2 3CL Protease | 1623061.65 |\n", 157 | "+-------+-----------+------------------------+---------------+" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "id": "83ff4364", 163 | "metadata": {}, 164 | "source": [ 165 | "## Acknowledgement" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "ab0dd49f", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "This project incorporates code from the following repo:\n", 176 | " \n", 177 | " https://github.com/kexinhuang12345/DeepPurpose\n", 178 | " " 179 | ] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3 (ipykernel)", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.8.8" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 5 203 | } 204 | -------------------------------------------------------------------------------- /screening/Dataset/finetunev1.csv: -------------------------------------------------------------------------------- 1 | smiles,new_label 2 | CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12,1 3 | Cc1cccc(C)c1OCC(=O)NC(Cc1ccccc1)C(O)CC(Cc1ccccc1)NC(=O)C(C(C)C)N1CCCNC1=O,1 4 | O=C(Nc1ccc([N+](=O)[O-])cc1Cl)c1cc(Cl)ccc1O,1 5 | CN(C)C(=O)C(CCN1CCC(O)(c2ccc(Cl)cc2)CC1)(c1ccccc1)c1ccccc1,1 6 | CC1OC(OC2CC(O)C3(CO)C4C(O)CC5(C)C(C6=CC(=O)OC6)CCC5(O)C4CCC3(O)C2)C(O)C(O)C1O,1 7 | CCN(CC)CCOc1ccc(C(O)(Cc2ccc(Cl)cc2)c2ccc(C)cc2)cc1,1 8 | COc1ccc2cc1Oc1ccc(cc1)CC1c3cc(c(OC)cc3CCN1C)Oc1c(OC)c(OC)cc3c1C(C2)N(C)CC3,1 9 | CCN(CC)Cc1cc(Nc2ccnc3cc(Cl)ccc23)ccc1O,1 10 | O=C1NCN(c2ccccc2)C12CCN(CCCC(c1ccc(F)cc1)c1ccc(F)cc1)CC2,1 11 | Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc(-c2cccnc2)n1,1 12 | CCN(CC)CCOc1ccc2c(c1)C(=O)c1cc(OCCN(CC)CC)ccc1-2,1 13 | COc1cc2c3cc1Oc1c(OC)c(OC)cc4c1C(Cc1ccc(O)c(c1)Oc1ccc(cc1)CC3N(C)CC2)N(C)CC4,1 14 | OC(c1cc(C(F)(F)F)nc2c(C(F)(F)F)cccc12)C1CCCCN1,1 15 | OCCN1CCN(CCCN2c3ccccc3Sc3ccc(C(F)(F)F)cc32)CC1,1 16 | O=C(Nc1cc(Cl)cc(Cl)c1O)c1c(O)c(Cl)cc(Cl)c1Cl,1 17 | CCN(CCO)CCCC(C)Nc1ccnc2cc(Cl)ccc12,1 18 | CC(CN1c2ccccc2Sc2ccccc21)N(C)C,1 19 | CCSc1ccc2c(c1)N(CCCN1CCN(C)CC1)c1ccccc1S2,1 20 | CC1OC(OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C(CCC6C5CC(O)C5(C)C(C7=CC(=O)OC7)CCC65O)C4)OC3C)OC2C)CC(O)C1O,1 21 | 
CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1 22 | CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21,1 23 | CCN(CC)CCOc1ccc(C(=C(Cl)c2ccccc2)c2ccccc2)cc1,1 24 | CCC1OC(=O)C(C)C(OC2CC(C)(OC)C(O)C(C)O2)C(C)C(OC2OC(C)CC(N(C)C)C2O)C(C)(O)CC(C)CN(C)C(C)C(O)C1(C)O,1 25 | COc1ncnc(NS(=O)(=O)c2ccc(N)cc2)c1OC,1 26 | COc1ccc2nc(S(=O)Cc3ncc(C)c(OC)c3C)[nH]c2c1,1 27 | CN(C)CCOc1ccc(C(=C(CCCl)c2ccccc2)c2ccccc2)cc1,1 28 | CSc1ccc2c(c1)N(CCC1CCCCN1C)c1ccccc1S2,1 29 | CC=CCC(C)C(O)C1C(=O)NC(CC)C(=O)N(C)CC(=O)N(C)C(CC(C)C)C(=O)NC(C(C)C)C(=O)N(C)C(CC(C)C)C(=O)NC(C)C(=O)NC(C)C(=O)N(C)C(CC(C)C)C(=O)N(C)C(CC(C)C)C(=O)N(C)C(C(C)C)C(=O)N1C,1 30 | CC(C)=CCCC1(C)C=Cc2c(O)c3c(c(CC=C(C)C)c2O1)OC12C(=CC4CC1C(C)(C)OC2(CC=C(C)C(=O)O)C4O)C3=O,1 31 | C=CC1CN2CCC1CC2C(O)c1ccnc2ccc(OC)cc12,1 32 | CCCCCC(=O)OC1(C(C)=O)CCC2C3CCC4=CC(=O)CCC4(C)C3CCC21C,1 33 | COc1ccc2cc1Oc1ccc(cc1)CC1c3c(cc4c(c3Oc3cc5c(cc3OC)CCN(C)C5C2)OCO4)CCN1C,1 34 | CCC(=C(c1ccccc1)c1ccc(OCCN(C)C)cc1)c1ccccc1,1 35 | OC1(c2ccc(Cl)c(C(F)(F)F)c2)CCN(CCCC(c2ccc(F)cc2)c2ccc(F)cc2)CC1,1 36 | Clc1ccc(Cn2c(CN3CCCC3)nc3ccccc32)cc1,1 37 | C1CCC(C(CC2CCCCN2)C2CCCCC2)CC1,1 38 | Oc1c(Cl)cc(Cl)c(Cl)c1Cc1c(O)c(Cl)cc(Cl)c1Cl,1 39 | CCC(C(=O)O)C1CCC(C)C(C(C)C(O)C(C)C(=O)C(CC)C2OC3(C=CC(O)C4(CCC(C)(C5CCC(O)(CC)C(C)O5)O4)O3)C(C)CC2C)O1,1 40 | Cc1cc(Nc2ncc(Cl)c(Nc3ccccc3S(=O)(=O)C(C)C)n2)c(OC(C)C)cc1C1CCNCC1,1 41 | CC1(C)C=Cc2c(c3c(c4c(=O)c(-c5ccc(O)cc5)coc24)OC(C)(C)CC3)O1,1 42 | C=CC(=O)Nc1cc(Nc2nccc(-c3cn(C)c4ccccc34)n2)c(OC)cc1N(C)CCN(C)C,1 43 | Cc1c(-c2ccc(O)cc2)n(Cc2ccc(OCCN3CCCCCC3)cc2)c2ccc(O)cc12,1 44 | CCCCCCOC(C)c1cccc(-c2csc(NC(=O)c3cc(Cl)c(C=C(C)C(=O)O)c(Cl)c3)n2)c1OC,1 45 | CC(C)=CCc1c2c(c3occ(-c4ccc(O)cc4)c(=O)c3c1O)C=CC(C)(C)O2,1 46 | CCCCc1oc2ccc(NS(C)(=O)=O)cc2c1C(=O)c1ccc(OCCCN(CCCC)CCCC)cc1,1 47 | CC(C)C(=O)OCC(=O)C12OC(C3CCCCC3)OC1CC1C3CCC4=CC(=O)C=CC4(C)C3C(O)CC12C,1 48 | CC1(C)C=Cc2c(c3c(c4c(=O)c(-c5ccc(O)c(O)c5)coc24)OC(C)(C)CC3)O1,1 49 | CCCCCOc1ccc(-c2ccc(-c3ccc(C(=O)NC4CC(O)C(O)NC(=O)C5C(O)C(C)CN5C(=O)C(C(C)O)NC(=O)C(C(O)C(O)c5ccc(O)cc5)NC(=O)C5CC(O)CN5C(=O)C(C(C)O)NC4=O)cc3)cc2)cc1,1 50 | CC(C)(C)c1cc(C(C)(C)C)c(NC(=O)c2c[nH]c3ccccc3c2=O)cc1O,1 51 | CCC(=C(c1ccc(OCCN(C)C)cc1)c1cccc(O)c1)c1ccccc1,1 52 | CCN1CCN(Cc2ccc(Nc3ncc(F)c(-c4cc(F)c5nc(C)n(C(C)C)c5c4)n3)nc2)CC1,1 53 | CCc1nc(C(N)=O)c(Nc2ccc(N3CCC(N4CCN(C)CC4)CC3)c(OC)c2)nc1NC1CCOCC1,1 54 | CC(C)(C)c1ccc(C(=O)CCCN2CCC(OC(c3ccccc3)c3ccccc3)CC2)cc1,1 55 | c1ccc2c(c1)Sc1ccccc1N2CC1CN2CCC1CC2,1 56 | CC1=NN(c2ccc(C)c(C)c2)C(=O)C1=NNc1cccc(-c2cccc(C(=O)O)c2)c1O,1 57 | COC(=O)NC(C(=O)NC(Cc1ccccc1)C(O)CN(Cc1ccc(-c2ccccn2)cc1)NC(=O)C(NC(=O)OC)C(C)(C)C)C(C)(C)C,1 58 | CN1C2CCC1CC(OC(c1ccccc1)c1ccccc1)C2,1 59 | CC(C)N1CCN(c2ccc(OCC3COC(Cn4cncn4)(c4ccc(Cl)cc4Cl)O3)cc2)CC1,1 60 | C=CCOc1ccccc1OCC(O)CNC(C)C,1 61 | CCCCCC(O)C=CC1C(O)CC(=O)C1CCCCCCC(=O)O,1 62 | CC1CCOC2Cn3cc(C(=O)NCc4ccc(F)cc4F)c(=O)c(O)c3C(=O)N12,1 63 | OCCN1CCN(CCCN2c3ccccc3C=Cc3ccccc32)CC1,1 64 | CCOC(=O)c1c(CSc2ccccc2)n(C)c2cc(Br)c(O)c(CN(C)C)c12,1 65 | CC(C)c1nc(CN(C)C(=O)NC(C(=O)NC(Cc2ccccc2)CC(O)C(Cc2ccccc2)NC(=O)OCc2cncs2)C(C)C)cs1,1 66 | Cc1c(O)cccc1C(=O)NC(CSc1ccccc1)C(O)CN1CC2CCCCC2CC1C(=O)NC(C)(C)C,1 67 | CC(C)(C)NC(=O)C1CC2CCCCC2CN1CC(O)C(Cc1ccccc1)NC(=O)C(CC(N)=O)NC(=O)c1ccc2ccccc2n1,1 68 | CCCC1(CCc2ccccc2)CC(O)=C(C(CC)c2cccc(NS(=O)(=O)c3ccc(C(F)(F)F)cn3)c2)C(=O)O1,1 69 | CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(=O)c1ccc(N)cc1,1 70 | CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S(=O)(=O)c1ccc(N)cc1,1 71 | CC(C)(C)NC(=O)C1CN(Cc2cccnc2)CCN1CC(O)CC(Cc1ccccc1)C(=O)NC1c2ccccc2CC1O,1 72 | Nc1ccn(C2OC(CO)C(O)C2(F)F)c(=O)n1,0 73 | 
CCC1CN2CCc3cc(OC)c(OC)cc3C2CC1CC1NCCc2cc(OC)c(OC)cc21,0 74 | CN(C)CCCN1c2ccccc2Sc2ccccc21,0 75 | CC(=O)OC1CC(OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C(CCC6C5CC(O)C5(C)C(C7=CC(=O)OC7)CCC65O)C4)OC3C)OC2C)OC(C)C1OC1OC(CO)C(O)C(O)C1O,0 76 | COc1ccc(CC2NCC(O)C2OC(C)=O)cc1,0 77 | Nc1ccc(N=Nc2ccccc2)c(N)n1,0 78 | CC(C)NCC(O)COc1cccc2ccccc12,0 79 | CN(C)CCCSC(=N)N,0 80 | CN1CCCC1Cc1c[nH]c2ccc(CCS(=O)(=O)c3ccccc3)cc12,0 81 | NC1CONC1=O,0 82 | Nc1c2c(nc3ccccc13)CCCC2,0 83 | CC(=O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C,0 84 | O=C1CC2(CCCC2)CC(=O)N1CCCCN1CCN(c2ncccn2)CC1,0 85 | CCN(CC)CCCC(C)Nc1c2ccc(Cl)cc2nc2ccc(OC)cc12,0 86 | CCCCOc1ccc(C(=O)CCN2CCCCC2)cc1,0 87 | O=C(CCCN1CCC2(CC1)C(=O)NCN2c1ccccc1)c1ccc(F)cc1,0 88 | CC(C(O)c1ccc(O)cc1)N1CCC(Cc2ccccc2)CC1,0 89 | Cn1nnnc1SCC1=C(C(=O)[O-])N2C(=O)C(NC(=O)C(O)c3ccccc3)C2SC1,0 90 | CCOc1ccc2nc(S(N)(=O)=O)sc2c1,0 91 | CC=Cc1ccc(OC)cc1,0 92 | CC(CCc1ccccc1)NCC(O)c1ccc(O)c(C(N)=O)c1,0 93 | CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,0 94 | CC(C)(N)Cc1ccccc1,0 95 | CC12CCC3C(=CCc4cc(O)ccc43)C1CCC2=O,0 96 | COc1cc2c(cc1OC)C(=O)C(CC1CCN(Cc3ccccc3)CC1)C2,0 97 | CC(CCc1ccccc1)NC(C)C(O)c1ccc(O)cc1,0 98 | C=C1CC2C(CCC3(C)C(=O)CCC23)C2(C)C=CC(=O)C=C12,0 99 | NC(CCC(=O)NC(CSSCC(NC(=O)CCC(N)C(=O)O)C(=O)NCC(=O)O)C(=O)NCC(=O)O)C(=O)O,0 100 | CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC(=O)O,0 101 | CC12CCC3C(CCC4CC(O)CCC43C)C1CCC2=O,0 102 | Cc1ccc(C(=O)C(C)CN2CCCCC2)cc1,0 103 | CN(C)CCN(Cc1ccccc1)c1ccccn1,0 104 | COc1ccccc1OCC(O)CN1CCN(CC(=O)Nc2c(C)cccc2C)CC1,0 105 | COc1cc(N)c(Cl)cc1C(=O)NC1CCN(Cc2ccccc2)CC1,0 106 | CC1(C)OC2CC3C4CCC5=CC(=O)C=CC5(C)C4C(O)CC3(C)C2(C(=O)CO)O1,0 107 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)nc21,0 108 | Nc1c(Br)cc(Br)cc1CNC1CCC(O)CC1,0 109 | CC12CCC3C(CCC4=C(O)C(=O)CCC43C)C1CCC2=O,0 110 | NCCc1ccc(O)c(O)c1,0 111 | CC(C)(C)NCC(O)c1ccc(O)c(CO)c1,0 112 | CCCN1CC(CSC)CC2c3cccc4[nH]cc(c34)CC21,0 113 | CN(C)CCOC(=O)C(c1ccccc1)C1(O)CCCC1,0 114 | c1ccc2c(c1)OCC(C1=NCCN1)O2,0 115 | CCCc1cc(C(N)=S)ccn1,0 116 | CN(C)CCOC(c1ccccc1)c1ccccc1,0 117 | O=C1c2c(O)cccc2Cc2cccc(O)c21,0 118 | CN(C)C(=O)COC(=O)Cc1ccc(OC(=O)c2ccc(N=C(N)N)cc2)cc1,0 119 | NC(=O)c1nc(F)c[nH]c1=O,0 120 | COC1C(OC(C)=O)CC(=O)OC(C)CC=CC=CC(OC2CCC(N(C)C)C(C)O2)C(C)CC(CC=O)C1OC1OC(C)C(OC2CC(C)(O)C(O)C(C)O2)C(N(C)C)C1O,0 121 | CC12CC(=O)C3C(CCC4CC(O)CCC43C)C1CCC2C(=O)CO,0 122 | COC(c1ccccc1)(c1ccccc1)C(Oc1nc(C)cc(C)n1)C(=O)O,0 123 | CC1CCC2C(C)C(O)OC3OC4(C)CCC1C32OO4,0 124 | CCCCOc1cc(C(=O)OCCN(CC)CC)ccc1N,0 125 | c1ccc(CNCCNCc2ccccc2)cc1,0 126 | CC(=C(CCOC(=O)c1ccccc1)SSC(CCOC(=O)c1ccccc1)=C(C)N(C=O)Cc1cnc(C)nc1N)N(C=O)Cc1cnc(C)nc1N,0 127 | CC(C)=CC(C)=NNc1nncc2ccccc12,0 128 | CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1,0 129 | COc1ccc(CC2c3cc(OC)c(OC)cc3CC[N+]2(C)CCC(=O)OCCCCCOC(=O)CC[N+]2(C)CCc3cc(OC)c(OC)cc3C2Cc2ccc(OC)c(OC)c2)cc1OC,0 130 | CC(=O)Oc1ccc(C(=C2CCCCC2)c2ccc(OC(C)=O)cc2)cc1,0 131 | CNCC(O)c1ccc(OC(=O)C(C)(C)C)c(OC(=O)C(C)(C)C)c1,0 132 | CCc1ccc(C(=O)C(C)CN2CCCCC2)cc1,0 133 | Oc1ccc(C2CNCCc3c2cc(O)c(O)c3Cl)cc1,0 134 | COc1ccc(CC(C)NCC(O)c2ccc(O)c(NC=O)c2)cc1,0 135 | CCC(=O)OC(OP(=O)(CCCCc1ccccc1)CC(=O)N1CC(C2CCCCC2)CC1C(=O)O)C(C)C,0 136 | CC(=C(CCO)SSCC1CCCO1)N(C=O)Cc1cnc(C)nc1N,0 137 | CC(C)C(=O)c1c(C(C)C)nn2ccccc12,0 138 | CCCNC(C)(C)COC(=O)c1ccccc1,0 139 | CCC1(c2cccc(O)c2)CCCCN(C)C1,0 140 | CN1CCN2c3ncccc3Cc3ccccc3C2C1,0 141 | CCCCC(C)(O)CC=CC1C(O)CC(=O)C1CCCCCCC(=O)OC,0 142 | COc1ccc2cc(CCC(C)=O)ccc2c1,0 143 | C=C1CCC2(O)C3Cc4ccc(O)c5c4C2(CCN3CC2CC2)C1O5,0 144 | Cc1cc2c(s1)Nc1ccccc1N=C2N1CCN(C)CC1,0 145 | CCCc1nc(C(C)(C)O)c(C(=O)O)n1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1,0 146 | 
Cc1nccn1CC1CCc2c(c3ccccc3n2C)C1=O,0 147 | CC12CC(=CO)C(=O)CC1CCC1C2CCC2(C)C1CCC2(C)O,0 148 | O=C1Nc2ccccc2C1(c1ccc(O)cc1)c1ccc(O)cc1,0 149 | CCOC(=O)OC1(C(=O)COC(=O)CC)CCC2C3CCC4=CC(=O)C=CC4(C)C3C(O)CC21C,0 150 | CC1CN(c2c(F)c(N)c3c(=O)c(C(=O)O)cn(C4CC4)c3c2F)CC(C)N1,0 151 | Cc1ccc(O)c(C(CCN(C(C)C)C(C)C)c2ccccc2)c1,0 152 | CNCc1cc(-c2ccccc2F)n(S(=O)(=O)c2cccnc2)c1,0 153 | O=P(O)(O)C(O)(Cn1ccnc1)P(=O)(O)O,0 154 | O=C1OC2(c3cc(Br)c([O-])cc3Oc3c2cc(Br)c([O-])c3[Hg])c2ccccc21,0 155 | CC12C=CC(=O)C=C1CCC1C2CCC2(C)C1CCC2(C)O,0 156 | -------------------------------------------------------------------------------- /Discussion/GPC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 27 22:04:01 2021 4 | 5 | @author: BM109X32G-10GPU-02 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | """ 10 | Created on Sun Nov 15 13:46:29 2020 11 | 12 | @author: de'' 13 | """ 14 | 15 | # -*- coding: utf-8 -*- 16 | """ 17 | Created on Sun Nov 15 10:40:57 2020 18 | 19 | @author: de'' 20 | """ 21 | 22 | from sklearn.datasets import make_blobs 23 | import json 24 | import numpy as np 25 | import math 26 | from tqdm import tqdm 27 | from scipy import sparse 28 | from sklearn.metrics import roc_auc_score,roc_curve,auc 29 | from sklearn.metrics import confusion_matrix 30 | from sklearn.gaussian_process.kernels import RBF 31 | import pandas as pd 32 | import matplotlib.pyplot as plt 33 | from rdkit import Chem 34 | from sklearn.gaussian_process import GaussianProcessClassifier as GPC 35 | from sklearn.ensemble import RandomForestClassifier 36 | from sklearn.model_selection import train_test_split 37 | from sklearn.preprocessing import MinMaxScaler 38 | from sklearn.neural_network import MLPClassifier 39 | from sklearn.svm import SVC 40 | from tensorflow.keras.models import Model, load_model 41 | from tensorflow.keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, concatenate 42 | from tensorflow.keras import metrics, optimizers 43 | from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau 44 | 45 | def split_smiles(smiles, kekuleSmiles=True): 46 | try: 47 | mol = Chem.MolFromSmiles(smiles) 48 | smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles) 49 | except: 50 | pass 51 | splitted_smiles = [] 52 | for j, k in enumerate(smiles): 53 | if len(smiles) == 1: 54 | return [smiles] 55 | if j == 0: 56 | if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c": 57 | splitted_smiles.append(k + smiles[j + 1]) 58 | else: 59 | splitted_smiles.append(k) 60 | elif j != 0 and j < len(smiles) - 1: 61 | if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c": 62 | splitted_smiles.append(k + smiles[j + 1]) 63 | elif k.islower() and smiles[j - 1].isupper() and k != "c": 64 | pass 65 | else: 66 | splitted_smiles.append(k) 67 | 68 | elif j == len(smiles) - 1: 69 | if k.islower() and smiles[j - 1].isupper() and k != "c": 70 | pass 71 | else: 72 | splitted_smiles.append(k) 73 | return splitted_smiles 74 | 75 | def get_maxlen(all_smiles, kekuleSmiles=True): 76 | maxlen = 0 77 | for smi in tqdm(all_smiles): 78 | spt = split_smiles(smi, kekuleSmiles=kekuleSmiles) 79 | if spt is None: 80 | continue 81 | maxlen = max(maxlen, len(spt)) 82 | return maxlen 83 | def get_dict(all_smiles, save_path, kekuleSmiles=True): 84 | words = [' '] 85 | for smi in tqdm(all_smiles): 86 | spt = split_smiles(smi, kekuleSmiles=kekuleSmiles) 87 | if spt is None: 88 | continue 89 | for w in spt: 90 | if w in words: 91 
| continue 92 | else: 93 | words.append(w) 94 | with open(save_path, 'w') as js: 95 | json.dump(words, js) 96 | return words 97 | 98 | def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000): 99 | coord_j = [] 100 | coord_k = [] 101 | spt = split_smiles(smi, kekuleSmiles=kekuleSmiles) 102 | if spt is None: 103 | return None 104 | for j,w in enumerate(spt): 105 | if j >= max_len: 106 | break 107 | try: 108 | k = words.index(w) 109 | except: 110 | continue 111 | coord_j.append(j) 112 | coord_k.append(k) 113 | data = np.repeat(1, len(coord_j)) 114 | output = sparse.csr_matrix((data, (coord_j, coord_k)), shape=(max_len, len(words))) 115 | return output 116 | 117 | if __name__ == "__main__": 118 | data_train= pd.read_csv('E:/code/drug/drugnn/data_train.csv') 119 | data_test=pd.read_csv('E:/code/drug/drugnn/worddrug.csv') 120 | inchis = list(data_train['SMILES']) 121 | rts = list(data_train['type']) 122 | 123 | smiles, targets = [], [] 124 | for i, inc in enumerate(tqdm(inchis)): 125 | mol = Chem.MolFromSmiles(inc) 126 | if mol is None: 127 | continue 128 | else: 129 | smi = Chem.MolToSmiles(mol) 130 | smiles.append(smi) 131 | targets.append(rts[i]) 132 | 133 | words = get_dict(smiles, save_path='E:\code\FingerID Reference\drug-likeness/dict.json') 134 | 135 | features = [] 136 | for i, smi in enumerate(tqdm(smiles)): 137 | xi = one_hot_coding(smi, words, max_len=600) 138 | if xi is not None: 139 | features.append(xi.todense()) 140 | features = np.asarray(features) 141 | targets = np.asarray(targets) 142 | X_train=features 143 | Y_train=targets 144 | 145 | 146 | # physical_devices = tf.config.experimental.list_physical_devices('CPU') 147 | # assert len(physical_devices) > 0, "Not enough GPU hardware devices available" 148 | # tf.config.experimental.set_memory_growth(physical_devices[0], True) 149 | 150 | 151 | 152 | inchis = list(data_test['SMILES']) 153 | rts = list(data_test['type']) 154 | 155 | smiles, targets = [], [] 156 | for i, inc in enumerate(tqdm(inchis)): 157 | mol = Chem.MolFromSmiles(inc) 158 | if mol is None: 159 | continue 160 | else: 161 | smi = Chem.MolToSmiles(mol) 162 | smiles.append(smi) 163 | targets.append(rts[i]) 164 | 165 | # words = get_dict(smiles, save_path='D:/工作文件/work.Data/CNN/dict.json') 166 | 167 | features = [] 168 | for i, smi in enumerate(tqdm(smiles)): 169 | xi = one_hot_coding(smi, words, max_len=600) 170 | if xi is not None: 171 | features.append(xi.todense()) 172 | features = np.asarray(features) 173 | targets = np.asarray(targets) 174 | X_test=features 175 | Y_test=targets 176 | 177 | # kernel = 1.0 * RBF(0.8) 178 | #model = RandomForestClassifier(n_estimators=10,max_features='auto', max_depth=None,min_samples_split=2, bootstrap=True) 179 | model = GPC( random_state=111) 180 | 181 | # earlyStopping = EarlyStopping(monitor='val_loss', patience=0.05, verbose=0, mode='min') 182 | #mcp_save = ModelCheckpoint('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5', save_best_only=True, monitor='accuracy', mode='auto') 183 | # reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, epsilon=1e-4, mode='min') 184 | from tensorflow.keras import backend as K 185 | X_train = K.cast_to_floatx(X_train).reshape((np.size(X_train,0),np.size(X_train,1)*np.size(X_train,2))) 186 | 187 | Y_train = K.cast_to_floatx(Y_train) 188 | 189 | # X_train,Y_train = make_blobs(n_samples=300, n_features=n_features, centers=6) 190 | model.fit(X_train, Y_train) 191 | 192 | 193 | # model = 
load_model('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5')
194 |     Y_predict = model.predict(K.cast_to_floatx(X_test).reshape((np.size(X_test,0),np.size(X_test,1)*np.size(X_test,2))))
195 |     #Y_predict = model.predict(X_test)  # predict on the test set
196 |     x = list(Y_test)
197 |     y = list(Y_predict)
198 |     from pandas.core.frame import DataFrame
199 |     x=DataFrame(x)
200 |     y=DataFrame(y)
201 |     # X= pd.concat([x,y], axis=1)
202 |     #X.to_csv('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/molecularGNN_smiles-master/0825/single-CNN-seed444.csv')
203 |     Y_predict = [1 if i >0.4 else 0 for i in Y_predict]
204 | 
205 |     cnf_matrix=confusion_matrix(Y_test, Y_predict)
206 |     cnf_matrix
207 | 
208 |     tn = cnf_matrix[0,0]
209 |     tp = cnf_matrix[1,1]
210 |     fn = cnf_matrix[1,0]
211 |     fp = cnf_matrix[0,1]
212 | 
213 |     bacc = ((tp/(tp+fn))+(tn/(tn+fp)))/2  # balanced accuracy
214 |     pre = tp/(tp+fp)  # precision / q+
215 |     rec = tp/(tp+fn)  # recall / sensitivity
216 |     sp=tn/(tn+fp)  # specificity
217 |     q_=tn/(tn+fn)  # negative predictive value
218 |     f1 = 2*pre*rec/(pre+rec)  # F1 score
219 |     mcc = ((tp*tn) - (fp*fn))/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))  # Matthews correlation coefficient
220 |     acc=(tp+tn)/(tp+fp+fn+tn)  # accuracy
221 |     fpr, tpr, thresholds =roc_curve(Y_test, Y_predict)
222 |     AUC = auc(fpr, tpr)
223 |     print('bacc:',bacc)
224 |     print('pre:',pre)
225 |     print('rec:',rec)
226 |     print('f1:',f1)
227 |     print('mcc:',mcc)
228 |     print('sp:',sp)
229 |     print('q_:',q_)
230 |     print('acc:',acc)
231 |     print('auc:',AUC)
232 | 
--------------------------------------------------------------------------------
/Discussion/svc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Nov 15 13:46:29 2020
4 | 
5 | @author: de''
6 | """
7 | 
8 | # -*- coding: utf-8 -*-
9 | """
10 | Created on Sun Nov 15 10:40:57 2020
11 | 
12 | @author: de''
13 | """
14 | 
15 | from sklearn.datasets import make_blobs
16 | import json
17 | import numpy as np
18 | import math
19 | from tqdm import tqdm
20 | from scipy import sparse
21 | from sklearn.metrics import roc_auc_score,roc_curve,auc
22 | from sklearn.metrics import confusion_matrix
23 | 
24 | import pandas as pd
25 | import matplotlib.pyplot as plt
26 | from rdkit import Chem
27 | 
28 | from sklearn.ensemble import RandomForestClassifier
29 | from sklearn.model_selection import train_test_split
30 | from sklearn.preprocessing import MinMaxScaler
31 | from sklearn.neural_network import MLPClassifier
32 | from sklearn.svm import SVC
33 | from tensorflow.keras.models import Model, load_model
34 | from tensorflow.keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, concatenate
35 | from tensorflow.keras import metrics, optimizers
36 | from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
37 | 
38 | def split_smiles(smiles, kekuleSmiles=True):
39 |     try:
40 |         mol = Chem.MolFromSmiles(smiles)
41 |         smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles)
42 |     except:
43 |         pass
44 |     splitted_smiles = []
45 |     for j, k in enumerate(smiles):
46 |         if len(smiles) == 1:
47 |             return [smiles]
48 |         if j == 0:
49 |             if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
50 |                 splitted_smiles.append(k + smiles[j + 1])
51 |             else:
52 |                 splitted_smiles.append(k)
53 |         elif j != 0 and j < len(smiles) - 1:
54 |             if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
55 |                 splitted_smiles.append(k + smiles[j + 1])
56 |             elif k.islower() and smiles[j - 1].isupper() and k != "c":
57 |                 pass
58 |             else:
59
splitted_smiles.append(k) 60 | 61 | elif j == len(smiles) - 1: 62 | if k.islower() and smiles[j - 1].isupper() and k != "c": 63 | pass 64 | else: 65 | splitted_smiles.append(k) 66 | return splitted_smiles 67 | 68 | def get_maxlen(all_smiles, kekuleSmiles=True): 69 | maxlen = 0 70 | for smi in tqdm(all_smiles): 71 | spt = split_smiles(smi, kekuleSmiles=kekuleSmiles) 72 | if spt is None: 73 | continue 74 | maxlen = max(maxlen, len(spt)) 75 | return maxlen 76 | def get_dict(all_smiles, save_path, kekuleSmiles=True): 77 | words = [' '] 78 | for smi in tqdm(all_smiles): 79 | spt = split_smiles(smi, kekuleSmiles=kekuleSmiles) 80 | if spt is None: 81 | continue 82 | for w in spt: 83 | if w in words: 84 | continue 85 | else: 86 | words.append(w) 87 | with open(save_path, 'w') as js: 88 | json.dump(words, js) 89 | return words 90 | 91 | def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000): 92 | coord_j = [] 93 | coord_k = [] 94 | spt = split_smiles(smi, kekuleSmiles=kekuleSmiles) 95 | if spt is None: 96 | return None 97 | for j,w in enumerate(spt): 98 | if j >= max_len: 99 | break 100 | try: 101 | k = words.index(w) 102 | except: 103 | continue 104 | coord_j.append(j) 105 | coord_k.append(k) 106 | data = np.repeat(1, len(coord_j)) 107 | output = sparse.csr_matrix((data, (coord_j, coord_k)), shape=(max_len, len(words))) 108 | return output 109 | 110 | if __name__ == "__main__": 111 | 112 | 113 | data_train= pd.read_csv('E:/code/drug/drugnn/data_train.csv') 114 | data_test=pd.read_csv('E:/code/drug/drugnn/bro5.csv') 115 | inchis = list(data_train['SMILES']) 116 | rts = list(data_train['type']) 117 | 118 | smiles, targets = [], [] 119 | for i, inc in enumerate(tqdm(inchis)): 120 | mol = Chem.MolFromSmiles(inc) 121 | if mol is None: 122 | continue 123 | else: 124 | smi = Chem.MolToSmiles(mol) 125 | smiles.append(smi) 126 | targets.append(rts[i]) 127 | 128 | words = get_dict(smiles, save_path='E:\code\FingerID Reference\drug-likeness/dict.json') 129 | 130 | features = [] 131 | for i, smi in enumerate(tqdm(smiles)): 132 | xi = one_hot_coding(smi, words, max_len=2000) 133 | if xi is not None: 134 | features.append(xi.todense()) 135 | features = np.asarray(features) 136 | targets = np.asarray(targets) 137 | X_train=features 138 | Y_train=targets 139 | 140 | 141 | # physical_devices = tf.config.experimental.list_physical_devices('CPU') 142 | # assert len(physical_devices) > 0, "Not enough GPU hardware devices available" 143 | # tf.config.experimental.set_memory_growth(physical_devices[0], True) 144 | 145 | 146 | 147 | inchis = list(data_test['SMILES']) 148 | rts = list(data_test['type']) 149 | 150 | smiles, targets = [], [] 151 | for i, inc in enumerate(tqdm(inchis)): 152 | mol = Chem.MolFromSmiles(inc) 153 | if mol is None: 154 | continue 155 | else: 156 | smi = Chem.MolToSmiles(mol) 157 | smiles.append(smi) 158 | targets.append(rts[i]) 159 | 160 | # words = get_dict(smiles, save_path='D:/工作文件/work.Data/CNN/dict.json') 161 | 162 | features = [] 163 | for i, smi in enumerate(tqdm(smiles)): 164 | xi = one_hot_coding(smi, words, max_len=2000) 165 | if xi is not None: 166 | features.append(xi.todense()) 167 | features = np.asarray(features) 168 | targets = np.asarray(targets) 169 | X_test=features 170 | Y_test=targets 171 | 172 | 173 | #model = RandomForestClassifier(n_estimators=10,max_features='auto', max_depth=None,min_samples_split=2, bootstrap=True) 174 | #model = MLPClassifier(rangdom_state=1,max_iter=300) 175 | model = SVC(C=500, kernel='rbf', gamma='auto', 176 | coef0=0.0, 
shrinking=True,probability=False, tol=0.0001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)
177 | 
178 |     # earlyStopping = EarlyStopping(monitor='val_loss', patience=0.05, verbose=0, mode='min')
179 |     #mcp_save = ModelCheckpoint('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5', save_best_only=True, monitor='accuracy', mode='auto')
180 |     # reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, epsilon=1e-4, mode='min')
181 |     from tensorflow.keras import backend as K
182 |     X_train = K.cast_to_floatx(X_train).reshape((np.size(X_train,0),np.size(X_train,1)*np.size(X_train,2)))
183 | 
184 |     Y_train = K.cast_to_floatx(Y_train)
185 | 
186 |     # X_train,Y_train = make_blobs(n_samples=300, n_features=n_features, centers=6)
187 |     model.fit(X_train, Y_train)
188 | 
189 | 
190 |     # model = load_model('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5')
191 |     Y_predict = model.predict(K.cast_to_floatx(X_test).reshape((np.size(X_test,0),np.size(X_test,1)*np.size(X_test,2))))
192 |     #Y_predict = model.predict(X_test)  # predict on the test set
193 |     x = list(Y_test)
194 |     y = list(Y_predict)
195 |     from pandas.core.frame import DataFrame
196 |     x=DataFrame(x)
197 |     y=DataFrame(y)
198 |     # X= pd.concat([x,y], axis=1)
199 |     #X.to_csv('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/molecularGNN_smiles-master/0825/single-CNN-seed444.csv')
200 |     Y_predict = [1 if i >0.5 else 0 for i in Y_predict]
201 | 
202 |     cnf_matrix=confusion_matrix(Y_test, Y_predict)
203 |     cnf_matrix
204 | 
205 |     tn = cnf_matrix[0,0]
206 |     tp = cnf_matrix[1,1]
207 |     fn = cnf_matrix[1,0]
208 |     fp = cnf_matrix[0,1]
209 | 
210 |     bacc = ((tp/(tp+fn))+(tn/(tn+fp)))/2  # balanced accuracy
211 |     pre = tp/(tp+fp)  # precision / q+
212 |     rec = tp/(tp+fn)  # recall / sensitivity
213 |     sp=tn/(tn+fp)  # specificity
214 |     q_=tn/(tn+fn)  # negative predictive value
215 |     f1 = 2*pre*rec/(pre+rec)  # F1 score
216 |     mcc = ((tp*tn) - (fp*fn))/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))  # Matthews correlation coefficient
217 |     acc=(tp+tn)/(tp+fp+fn+tn)  # accuracy
218 |     fpr, tpr, thresholds =roc_curve(Y_test, Y_predict)
219 |     AUC = auc(fpr, tpr)
220 |     print('bacc:',bacc)
221 |     print('pre:',pre)
222 |     print('rec:',rec)
223 |     print('f1:',f1)
224 |     print('mcc:',mcc)
225 |     print('sp:',sp)
226 |     print('q_:',q_)
227 |     print('acc:',acc)
228 |     print('auc:',AUC)
--------------------------------------------------------------------------------
/DGCAN/preprocess.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed Apr 27 20:09:31 2022
5 | 
6 | @author:Jinyu-Sun
7 | """
8 | 
9 | from collections import defaultdict
10 | import numpy as np
11 | from rdkit import Chem
12 | import torch
13 | 
14 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # fall back to CPU when no GPU is available
15 | 
16 | atom_dict = defaultdict(lambda: len(atom_dict))
17 | bond_dict = defaultdict(lambda: len(bond_dict))
18 | fingerprint_dict = defaultdict(lambda: len(fingerprint_dict))
19 | edge_dict = defaultdict(lambda: len(edge_dict))
20 | radius = 1
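# --- Reviewer's sketch (added in editing; `_demo_id_assignment` is a
# hypothetical helper, not part of the original D-GCAN source): the
# defaultdicts above implement growing vocabularies, handing a fresh integer
# ID to every unseen key. A minimal, self-contained illustration:
def _demo_id_assignment():
    from collections import defaultdict
    demo_dict = defaultdict(lambda: len(demo_dict))
    ids = [demo_dict[s] for s in ('C', 'N', 'C', ('C', 'aromatic'))]
    # Repeated keys reuse their first-assigned ID: ids == [0, 1, 0, 2].
    return ids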
21 | def create_atoms(mol, atom_dict):
22 |     """Transform the atom types in a molecule (e.g., H, C, and O)
23 |     into indices (e.g., H=0, C=1, and O=2).
24 |     Note that each atom index takes aromaticity into account.
25 |     """
26 |     atoms = [a.GetSymbol() for a in mol.GetAtoms()]
27 |     for a in mol.GetAromaticAtoms():
28 |         i = a.GetIdx()
29 |         atoms[i] = (atoms[i], 'aromatic')
30 |     atoms = [atom_dict[a] for a in atoms]
31 |     return np.array(atoms)
32 | 
33 | 
34 | def create_ijbonddict(mol, bond_dict):
35 |     """Create a dictionary in which each key is a node ID
36 |     and each value is a list of (neighboring node ID,
37 |     chemical bond ID (e.g., single or double)) tuples.
38 | 
39 |     """
40 |     i_jbond_dict = defaultdict(lambda: [])
41 |     for b in mol.GetBonds():
42 |         i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
43 |         bond = bond_dict[str(b.GetBondType())]
44 |         i_jbond_dict[i].append((j, bond))
45 |         i_jbond_dict[j].append((i, bond))
46 |     return i_jbond_dict
47 | 
48 | 
49 | def extract_fingerprints(radius, atoms, i_jbond_dict,
50 |                          fingerprint_dict, edge_dict):
51 |     """Extract the fingerprints from a molecular graph
52 |     based on the Weisfeiler-Lehman algorithm.
53 |     """
54 | 
55 |     if (len(atoms) == 1) or (radius == 0):
56 |         nodes = [fingerprint_dict[a] for a in atoms]
57 | 
58 |     else:
59 |         nodes = atoms
60 |         i_jedge_dict = i_jbond_dict
61 | 
62 |         for _ in range(radius):
63 | 
64 |             """Update each node ID considering its neighboring nodes and edges.
65 |             The updated node IDs are the fingerprint IDs.
66 |             """
67 |             nodes_ = []
68 |             for i, j_edge in i_jedge_dict.items():
69 |                 neighbors = [(nodes[j], edge) for j, edge in j_edge]
70 |                 fingerprint = (nodes[i], tuple(sorted(neighbors)))
71 |                 nodes_.append(fingerprint_dict[fingerprint])
72 | 
73 |             """Also update each edge ID considering
74 |             the two nodes on its both sides.
75 |             """
76 |             i_jedge_dict_ = defaultdict(lambda: [])
77 |             for i, j_edge in i_jedge_dict.items():
78 |                 for j, edge in j_edge:
79 |                     both_side = tuple(sorted((nodes[i], nodes[j])))
80 |                     edge = edge_dict[(both_side, edge)]
81 |                     i_jedge_dict_[i].append((j, edge))
82 | 
83 |             nodes = nodes_
84 |             i_jedge_dict = i_jedge_dict_
85 | 
86 |     return np.array(nodes)
87 | 
88 | 
89 | def split_dataset(dataset, ratio):
90 |     """Split a dataset (the shuffle step is currently disabled)."""
91 |     np.random.seed(1234)  # fix the seed for the shuffle
92 |     # np.random.shuffle(dataset)
93 |     n = int(ratio * len(dataset))
94 |     return dataset[:n], dataset[n:]
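# --- Reviewer's sketch (added in editing; `_demo_fingerprints` is a
# hypothetical helper, not part of the original file; assumes RDKit is
# installed): one end-to-end call of the helpers above, turning a SMILES
# string into radius-1 Weisfeiler-Lehman fingerprint IDs.
def _demo_fingerprints(smiles='CCO'):
    mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
    atoms = create_atoms(mol, atom_dict)              # per-atom type IDs
    i_jbond_dict = create_ijbonddict(mol, bond_dict)  # neighbors and bond IDs
    return extract_fingerprints(radius, atoms, i_jbond_dict,
                                fingerprint_dict, edge_dict)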
95 | def create_testdataset(filename,path,dataname,property):
96 |     dir_dataset = path+dataname
97 |     print(filename)
98 |     """Load a dataset."""
99 |     if property== False:
100 |         with open(dir_dataset + filename, 'r') as f:
101 |             #smiles_property = f.readline().strip().split()
102 |             data_original = f.read().strip().split()
103 |         data_original = [data for data in data_original
104 |                          if '.' not in data.split()[0]]
105 |         dataset = []
106 |         for data in data_original:
107 |             smiles = data
108 |             try:
109 |                 """Create each data with the above defined functions."""
110 |                 mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
111 |                 atoms = create_atoms(mol, atom_dict)
112 |                 molecular_size = len(atoms)
113 |                 i_jbond_dict = create_ijbonddict(mol, bond_dict)
114 |                 fingerprints = extract_fingerprints(radius, atoms, i_jbond_dict,
115 |                                                     fingerprint_dict, edge_dict)
116 |                 adjacency = Chem.GetAdjacencyMatrix(mol)
117 |                 """Transform each of the above numpy data
118 |                 into a pytorch tensor on a device (i.e., CPU or GPU).
119 |                 """
120 |                 fingerprints = torch.LongTensor(fingerprints).to(device)
121 |                 adjacency = torch.FloatTensor(adjacency).to(device)
122 |                 proper = torch.LongTensor([int(0)]).to(device)
123 |                 dataset.append((smiles,fingerprints, adjacency, molecular_size,proper ))
124 |             except:
125 |                 print(smiles)
126 |     elif property== True:
127 |         with open(dir_dataset + filename, 'r') as f:
128 |             # smiles_property = f.readline().strip().split()
129 |             data_original = f.read().strip().split('\n')
130 | 
131 |         data_original = [data for data in data_original
132 |                          if '.' not in data.split()[0]]
133 |         dataset = []
134 |         for data in data_original:
135 |             smiles, proper = data.strip().split()
136 |             try:
137 |                 """Create each data with the above defined functions."""
138 |                 mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
139 |                 atoms = create_atoms(mol, atom_dict)
140 |                 molecular_size = len(atoms)
141 |                 i_jbond_dict = create_ijbonddict(mol, bond_dict)
142 |                 fingerprints = extract_fingerprints(radius, atoms, i_jbond_dict,
143 |                                                     fingerprint_dict, edge_dict)
144 |                 adjacency = Chem.GetAdjacencyMatrix(mol)
145 | 
146 |                 """Transform each of the above numpy data
147 |                 into a pytorch tensor on a device (i.e., CPU or GPU).
148 |                 """
149 |                 fingerprints = torch.LongTensor(fingerprints).to(device)
150 |                 adjacency = torch.FloatTensor(adjacency).to(device)
151 |                 proper = torch.LongTensor([int(proper)]).to(device)
152 |                 dataset.append((smiles,fingerprints, adjacency, molecular_size, proper))
153 |             except:
154 |                 print(smiles + ' raised an error')
155 |     return dataset
156 | 
157 | def create_dataset(filename,path,dataname):
158 |     dir_dataset = path+dataname
159 |     print(filename)
160 |     """Load a dataset."""
161 |     try:
162 |         with open(dir_dataset + filename, 'r') as f:
163 |             smiles_property = f.readline().strip().split()
164 |             data_original = f.read().strip().split('\n')
165 |     except:
166 |         with open(dir_dataset + filename, 'r') as f:
167 |             smiles_property = f.readline().strip().split()
168 |             data_original = f.read().strip().split('\n')
169 | 
170 |     """Exclude data whose SMILES contains '.' (multi-fragment molecules)."""
171 |     data_original = [data for data in data_original
172 |                      if '.' not in data.split()[0]]
173 |     dataset = []
174 |     for data in data_original:
175 |         smiles, property = data.strip().split()
176 |         try:
177 |             """Create each data with the above defined functions."""
178 |             mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
179 |             atoms = create_atoms(mol, atom_dict)
180 |             molecular_size = len(atoms)
181 |             i_jbond_dict = create_ijbonddict(mol, bond_dict)
182 |             fingerprints = extract_fingerprints(radius, atoms, i_jbond_dict,
183 |                                                 fingerprint_dict, edge_dict)
184 |             adjacency = Chem.GetAdjacencyMatrix(mol)
185 |             """
186 |             Transform each of the above numpy data
187 |             into a pytorch tensor on a device (i.e., CPU or GPU).
188 | """ 189 | fingerprints = torch.LongTensor(fingerprints).to(device) 190 | adjacency = torch.FloatTensor(adjacency).to(device) 191 | property = torch.LongTensor([int(property)]).to(device) 192 | dataset.append((smiles,fingerprints, adjacency, molecular_size, property)) 193 | except: 194 | print(smiles) 195 | return dataset 196 | 197 | 198 | -------------------------------------------------------------------------------- /Discussion/RF.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 27 22:08:23 2021 4 | 5 | @author:Jinyusun 6 | """ 7 | 8 | 9 | from sklearn.datasets import make_blobs 10 | import json 11 | import numpy as np 12 | import math 13 | from tqdm import tqdm 14 | from scipy import sparse 15 | from sklearn.metrics import roc_auc_score,roc_curve,auc 16 | from sklearn.metrics import confusion_matrix 17 | 18 | import pandas as pd 19 | import matplotlib.pyplot as plt 20 | from rdkit import Chem 21 | 22 | from sklearn.ensemble import RandomForestClassifier 23 | from sklearn.model_selection import train_test_split 24 | from sklearn.preprocessing import MinMaxScaler 25 | from sklearn.neural_network import MLPClassifier 26 | from sklearn.svm import SVC 27 | from tensorflow.keras.models import Model, load_model 28 | from tensorflow.keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, concatenate 29 | from tensorflow.keras import metrics, optimizers 30 | from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau 31 | 32 | def split_smiles(smiles, kekuleSmiles=True): 33 | try: 34 | mol = Chem.MolFromSmiles(smiles) 35 | smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles) 36 | except: 37 | pass 38 | splitted_smiles = [] 39 | for j, k in enumerate(smiles): 40 | if len(smiles) == 1: 41 | return [smiles] 42 | if j == 0: 43 | if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c": 44 | splitted_smiles.append(k + smiles[j + 1]) 45 | else: 46 | splitted_smiles.append(k) 47 | elif j != 0 and j < len(smiles) - 1: 48 | if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c": 49 | splitted_smiles.append(k + smiles[j + 1]) 50 | elif k.islower() and smiles[j - 1].isupper() and k != "c": 51 | pass 52 | else: 53 | splitted_smiles.append(k) 54 | 55 | elif j == len(smiles) - 1: 56 | if k.islower() and smiles[j - 1].isupper() and k != "c": 57 | pass 58 | else: 59 | splitted_smiles.append(k) 60 | return splitted_smiles 61 | 62 | def get_maxlen(all_smiles, kekuleSmiles=True): 63 | maxlen = 0 64 | for smi in tqdm(all_smiles): 65 | spt = split_smiles(smi, kekuleSmiles=kekuleSmiles) 66 | if spt is None: 67 | continue 68 | maxlen = max(maxlen, len(spt)) 69 | return maxlen 70 | def get_dict(all_smiles, save_path, kekuleSmiles=True): 71 | words = [' '] 72 | for smi in tqdm(all_smiles): 73 | spt = split_smiles(smi, kekuleSmiles=kekuleSmiles) 74 | if spt is None: 75 | continue 76 | for w in spt: 77 | if w in words: 78 | continue 79 | else: 80 | words.append(w) 81 | with open(save_path, 'w') as js: 82 | json.dump(words, js) 83 | return words 84 | 85 | def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000): 86 | coord_j = [] 87 | coord_k = [] 88 | spt = split_smiles(smi, kekuleSmiles=kekuleSmiles) 89 | if spt is None: 90 | return None 91 | for j,w in enumerate(spt): 92 | if j >= max_len: 93 | break 94 | try: 95 | k = words.index(w) 96 | except: 97 | continue 98 | coord_j.append(j) 99 | coord_k.append(k) 100 | data = np.repeat(1, len(coord_j)) 101 
| output = sparse.csr_matrix((data, (coord_j, coord_k)), shape=(max_len, len(words))) 102 | return output 103 | def split_dataset(dataset, ratio): 104 | """Shuffle and split a dataset.""" 105 | # np.random.seed(111) # fix the seed for shuffle. 106 | #np.random.shuffle(dataset) 107 | n = int(ratio * len(dataset)) 108 | return dataset[:n], dataset[n:] 109 | def edit_dataset(drug,non_drug,task): 110 | # np.random.seed(111) # fix the seed for shuffle. 111 | 112 | # np.random.shuffle(non_drug) 113 | non_drug=non_drug[0:len(drug)] 114 | 115 | 116 | # np.random.shuffle(non_drug) 117 | # np.random.shuffle(drug) 118 | dataset_train_drug, dataset_test_drug = split_dataset(drug, 0.9) 119 | # dataset_train_drug,dataset_dev_drug = split_dataset(dataset_train_drug, 0.9) 120 | dataset_train_no, dataset_test_no = split_dataset(non_drug, 0.9) 121 | # dataset_train_no,dataset_dev_no = split_dataset(dataset_train_no, 0.9) 122 | dataset_train = pd.concat([dataset_train_drug,dataset_train_no], axis=0) 123 | dataset_test=pd.concat([ dataset_test_drug,dataset_test_no], axis=0) 124 | # dataset_dev = dataset_dev_drug+dataset_dev_no 125 | return dataset_train, dataset_test 126 | if __name__ == "__main__": 127 | data_train= pd.read_csv('E:/code/drug/drugnn/data_train.csv') 128 | data_test=pd.read_csv('E:/code/drug/drugnn/bro5.csv') 129 | inchis = list(data_train['SMILES']) 130 | rts = list(data_train['type']) 131 | 132 | smiles, targets = [], [] 133 | for i, inc in enumerate(tqdm(inchis)): 134 | mol = Chem.MolFromSmiles(inc) 135 | if mol is None: 136 | continue 137 | else: 138 | smi = Chem.MolToSmiles(mol) 139 | smiles.append(smi) 140 | targets.append(rts[i]) 141 | 142 | words = get_dict(smiles, save_path='E:\code\FingerID Reference\drug-likeness/dict.json') 143 | 144 | features = [] 145 | for i, smi in enumerate(tqdm(smiles)): 146 | xi = one_hot_coding(smi, words, max_len=600) 147 | if xi is not None: 148 | features.append(xi.todense()) 149 | features = np.asarray(features) 150 | targets = np.asarray(targets) 151 | X_train=features 152 | Y_train=targets 153 | 154 | 155 | # physical_devices = tf.config.experimental.list_physical_devices('CPU') 156 | # assert len(physical_devices) > 0, "Not enough GPU hardware devices available" 157 | # tf.config.experimental.set_memory_growth(physical_devices[0], True) 158 | 159 | 160 | 161 | inchis = list(data_test['SMILES']) 162 | rts = list(data_test['type']) 163 | 164 | smiles, targets = [], [] 165 | for i, inc in enumerate(tqdm(inchis)): 166 | mol = Chem.MolFromSmiles(inc) 167 | if mol is None: 168 | continue 169 | else: 170 | smi = Chem.MolToSmiles(mol) 171 | smiles.append(smi) 172 | targets.append(rts[i]) 173 | 174 | # words = get_dict(smiles, save_path='D:/工作文件/work.Data/CNN/dict.json') 175 | 176 | features = [] 177 | for i, smi in enumerate(tqdm(smiles)): 178 | xi = one_hot_coding(smi, words, max_len=600) 179 | if xi is not None: 180 | features.append(xi.todense()) 181 | features = np.asarray(features) 182 | targets = np.asarray(targets) 183 | X_test=features 184 | Y_test=targets 185 | n_features=10 186 | 187 | model = RandomForestClassifier(n_estimators=5,max_features='auto', max_depth=None,min_samples_split=5, bootstrap=True) 188 | #model = MLPClassifier(rangdom_state=1,max_iter=300) 189 | #model = SVC() 190 | 191 | # earlyStopping = EarlyStopping(monitor='val_loss', patience=0.05, verbose=0, mode='min') 192 | #mcp_save = ModelCheckpoint('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5', save_best_only=True, monitor='accuracy', mode='auto') 
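    # --- Reviewer's note (added in editing, not original code): sklearn
    # estimators expect 2-D input, so the reshape below flattens each
    # (max_len, vocabulary) one-hot matrix into a single row; e.g. max_len=600
    # with a 40-token vocabulary gives a 24000-dimensional feature vector.
    # Plain NumPy would do the same without the Keras backend, e.g.:
    #   X_train = np.asarray(X_train, np.float32).reshape(len(X_train), -1)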
193 |     # reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, epsilon=1e-4, mode='min')
194 |     from tensorflow.keras import backend as K
195 |     X_train = K.cast_to_floatx(X_train).reshape((np.size(X_train,0),np.size(X_train,1)*np.size(X_train,2)))
196 | 
197 |     Y_train = K.cast_to_floatx(Y_train)
198 | 
199 |     # X_train,Y_train = make_blobs(n_samples=300, n_features=n_features, centers=6)
200 |     model.fit(X_train, Y_train)
201 | 
202 | 
203 |     # model = load_model('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5')
204 |     Y_predict = model.predict(K.cast_to_floatx(X_test).reshape((np.size(X_test,0),np.size(X_test,1)*np.size(X_test,2))))
205 |     #Y_predict = model.predict(X_test)  # predict on the test set
206 |     x = list(Y_test)
207 |     y = list(Y_predict)
208 |     from pandas.core.frame import DataFrame
209 |     x=DataFrame(x)
210 |     y=DataFrame(y)
211 |     # X= pd.concat([x,y], axis=1)
212 |     #X.to_csv('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/molecularGNN_smiles-master/0825/single-CNN-seed444.csv')
213 |     Y_predict = [1 if i >0.4 else 0 for i in Y_predict]
214 | 
215 |     cnf_matrix=confusion_matrix(Y_test, Y_predict)
216 |     cnf_matrix
217 | 
218 |     tn = cnf_matrix[0,0]
219 |     tp = cnf_matrix[1,1]
220 |     fn = cnf_matrix[1,0]
221 |     fp = cnf_matrix[0,1]
222 | 
223 |     bacc = ((tp/(tp+fn))+(tn/(tn+fp)))/2  # balanced accuracy
224 |     pre = tp/(tp+fp)  # precision / q+
225 |     rec = tp/(tp+fn)  # recall / sensitivity
226 |     sp=tn/(tn+fp)  # specificity
227 |     q_=tn/(tn+fn)  # negative predictive value
228 |     f1 = 2*pre*rec/(pre+rec)  # F1 score
229 |     mcc = ((tp*tn) - (fp*fn))/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))  # Matthews correlation coefficient
230 |     acc=(tp+tn)/(tp+fp+fn+tn)  # accuracy
231 |     fpr, tpr, thresholds =roc_curve(Y_test, Y_predict)
232 |     AUC = auc(fpr, tpr)
233 |     print('bacc:',bacc)
234 |     print('pre:',pre)
235 |     print('rec:',rec)
236 |     print('f1:',f1)
237 |     print('mcc:',mcc)
238 |     print('sp:',sp)
239 |     print('q_:',q_)
240 |     print('acc:',acc)
241 |     print('auc:',AUC)
242 | 
--------------------------------------------------------------------------------
/Discussion/CNN.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Nov 15 10:40:57 2020
4 | 
5 | @author: de''
6 | """
7 | 
8 | import json
9 | import numpy as np
10 | import math
11 | from tqdm import tqdm
12 | from scipy import sparse
13 | from sklearn.metrics import roc_auc_score,roc_curve,auc
14 | from sklearn.metrics import confusion_matrix
15 | 
16 | import pandas as pd
17 | import matplotlib.pyplot as plt
18 | from rdkit import Chem
19 | 
20 | from sklearn.model_selection import train_test_split
21 | from sklearn.preprocessing import MinMaxScaler
22 | from sklearn.metrics import mean_absolute_error, r2_score,median_absolute_error
23 | from tensorflow.keras.models import Model, load_model
24 | from tensorflow.keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, concatenate
25 | from tensorflow.keras import metrics, optimizers
26 | from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
27 | 
28 | def split_smiles(smiles, kekuleSmiles=True):
29 |     try:
30 |         mol = Chem.MolFromSmiles(smiles)
31 |         smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles)
32 |     except:
33 |         pass
34 |     splitted_smiles = []
35 |     for j, k in enumerate(smiles):
36 |         if len(smiles) == 1:
37 |             return [smiles]
38 |         if j == 0:
39 |             if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
40 |                 splitted_smiles.append(k + smiles[j + 1])
41 |             else:
42
splitted_smiles.append(k) 43 | elif j != 0 and j < len(smiles) - 1: 44 | if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c": 45 | splitted_smiles.append(k + smiles[j + 1]) 46 | elif k.islower() and smiles[j - 1].isupper() and k != "c": 47 | pass 48 | else: 49 | splitted_smiles.append(k) 50 | 51 | elif j == len(smiles) - 1: 52 | if k.islower() and smiles[j - 1].isupper() and k != "c": 53 | pass 54 | else: 55 | splitted_smiles.append(k) 56 | return splitted_smiles 57 | 58 | def get_maxlen(all_smiles, kekuleSmiles=True): 59 | maxlen = 0 60 | for smi in tqdm(all_smiles): 61 | spt = split_smiles(smi, kekuleSmiles=kekuleSmiles) 62 | if spt is None: 63 | continue 64 | maxlen = max(maxlen, len(spt)) 65 | return maxlen 66 | def get_dict(all_smiles, save_path, kekuleSmiles=True): 67 | words = [' '] 68 | for smi in tqdm(all_smiles): 69 | spt = split_smiles(smi, kekuleSmiles=kekuleSmiles) 70 | if spt is None: 71 | continue 72 | for w in spt: 73 | if w in words: 74 | continue 75 | else: 76 | words.append(w) 77 | with open(save_path, 'w') as js: 78 | json.dump(words, js) 79 | return words 80 | 81 | def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000): 82 | coord_j = [] 83 | coord_k = [] 84 | spt = split_smiles(smi, kekuleSmiles=kekuleSmiles) 85 | if spt is None: 86 | return None 87 | for j,w in enumerate(spt): 88 | if j >= max_len: 89 | break 90 | try: 91 | k = words.index(w) 92 | except: 93 | continue 94 | coord_j.append(j) 95 | coord_k.append(k) 96 | data = np.repeat(1, len(coord_j)) 97 | output = sparse.csr_matrix((data, (coord_j, coord_k)), shape=(max_len, len(words))) 98 | return output 99 | def split_dataset(dataset, ratio): 100 | """Shuffle and split a dataset.""" 101 | # np.random.seed(111) # fix the seed for shuffle. 102 | #np.random.shuffle(dataset) 103 | n = int(ratio * len(dataset)) 104 | return dataset[:n], dataset[n:] 105 | def edit_dataset(drug,non_drug,task): 106 | # np.random.seed(111) # fix the seed for shuffle. 
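    # --- Reviewer's note (added in editing, not original code): the slice
    # below under-samples the majority class so that drugs and non-drugs
    # contribute equally; with the shuffle calls commented out, it simply
    # keeps the first len(drug) non-drug rows, which makes the 0.9/0.1
    # train/test split deterministic between runs.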
107 | 108 | # np.random.shuffle(non_drug) 109 | non_drug=non_drug[0:len(drug)] 110 | 111 | 112 | # np.random.shuffle(non_drug) 113 | # np.random.shuffle(drug) 114 | dataset_train_drug, dataset_test_drug = split_dataset(drug, 0.9) 115 | # dataset_train_drug,dataset_dev_drug = split_dataset(dataset_train_drug, 0.9) 116 | dataset_train_no, dataset_test_no = split_dataset(non_drug, 0.9) 117 | # dataset_train_no,dataset_dev_no = split_dataset(dataset_train_no, 0.9) 118 | dataset_train = pd.concat([dataset_train_drug,dataset_train_no], axis=0) 119 | dataset_test=pd.concat([ dataset_test_drug,dataset_test_no], axis=0) 120 | # dataset_dev = dataset_dev_drug+dataset_dev_no 121 | return dataset_train, dataset_test 122 | if __name__ == "__main__": 123 | data_train= pd.read_csv('E:/code/drug/drugnn/data_train.csv') 124 | data_test=pd.read_csv('E:/code/drug/drugnn/bro5.csv') 125 | inchis = list(data_train['SMILES']) 126 | rts = list(data_train['type']) 127 | 128 | smiles, targets = [], [] 129 | for i, inc in enumerate(tqdm(inchis)): 130 | mol = Chem.MolFromSmiles(inc) 131 | if mol is None: 132 | continue 133 | else: 134 | smi = Chem.MolToSmiles(mol) 135 | smiles.append(smi) 136 | targets.append(rts[i]) 137 | 138 | words = get_dict(smiles, save_path='E:\code\FingerID Reference\drug-likeness/dict.json') 139 | 140 | features = [] 141 | for i, smi in enumerate(tqdm(smiles)): 142 | xi = one_hot_coding(smi, words, max_len=600) 143 | if xi is not None: 144 | features.append(xi.todense()) 145 | features = np.asarray(features) 146 | targets = np.asarray(targets) 147 | X_train=features 148 | Y_train=targets 149 | 150 | import tensorflow as tf 151 | # physical_devices = tf.config.experimental.list_physical_devices('CPU') 152 | # assert len(physical_devices) > 0, "Not enough GPU hardware devices available" 153 | # tf.config.experimental.set_memory_growth(physical_devices[0], True) 154 | 155 | 156 | 157 | inchis = list(data_test['SMILES']) 158 | rts = list(data_test['type']) 159 | 160 | smiles, targets = [], [] 161 | for i, inc in enumerate(tqdm(inchis)): 162 | mol = Chem.MolFromSmiles(inc) 163 | if mol is None: 164 | continue 165 | else: 166 | smi = Chem.MolToSmiles(mol) 167 | smiles.append(smi) 168 | targets.append(rts[i]) 169 | 170 | # words = get_dict(smiles, save_path='D:/工作文件/work.Data/CNN/dict.json') 171 | 172 | features = [] 173 | for i, smi in enumerate(tqdm(smiles)): 174 | xi = one_hot_coding(smi, words, max_len=600) 175 | if xi is not None: 176 | features.append(xi.todense()) 177 | features = np.asarray(features) 178 | targets = np.asarray(targets) 179 | X_test=features 180 | Y_test=targets 181 | layer_in = Input(shape=(X_train.shape[1:3]), name="smile") 182 | layer_conv = layer_in 183 | for i in range(6): 184 | layer_conv = Conv1D(128, kernel_size=4, activation='relu', kernel_initializer='normal')(layer_conv) 185 | layer_conv = MaxPooling1D(pool_size=2)(layer_conv) 186 | layer_dense = Flatten()(layer_conv) 187 | 188 | for i in range(1): 189 | layer_dense = Dense(32, activation="relu", kernel_initializer='normal')(layer_dense) 190 | layer_output = Dense(1, activation="sigmoid", name="output")(layer_dense) 191 | 192 | # earlyStopping = EarlyStopping(monitor='val_loss', patience=0.05, verbose=0, mode='min') 193 | #mcp_save = ModelCheckpoint('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5', save_best_only=True, monitor='accuracy', mode='auto') 194 | # reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, epsilon=1e-4, mode='min') 195 | 196 
| model = Model(layer_in, outputs = layer_output)
197 |     opt = optimizers.Adam(lr=0.0005)
198 |     model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
199 |     from tensorflow.keras import backend as K  # cast the inputs to tensors
200 |     X_train = K.cast_to_floatx(X_train)
201 |     Y_train = K.cast_to_floatx(Y_train)
202 |     history = model.fit(X_train, Y_train, epochs=12)
203 | 
204 |     # plot loss
205 |     plt.plot(history.history['loss'])
206 |     plt.plot(history.history['accuracy'])
207 |     # plt.plot(history.history['val_loss'])
208 |     # plt.plot(history.history['val_accuracy'])
209 |     plt.ylabel('values')
210 |     plt.xlabel('epoch')
211 |     # plt.legend(['loss', 'mae', 'val_loss', 'val_mae'], loc='upper left')
212 |     plt.show()
213 |     # model = load_model('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5')
214 |     Y_predict = model.predict(K.cast_to_floatx(X_test))
215 |     #Y_predict = model.predict(X_test)  # predict on the test set
216 |     x = list(Y_test)
217 |     y = list(Y_predict)
218 |     from pandas.core.frame import DataFrame
219 |     x=DataFrame(x)
220 |     y=DataFrame(y)
221 |     # X= pd.concat([x,y], axis=1)
222 |     #X.to_csv('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/molecularGNN_smiles-master/0825/single-CNN-seed444.csv')
223 |     Y_predict = [1 if i >0.4 else 0 for i in Y_predict]
224 | 
225 |     cnf_matrix=confusion_matrix(Y_test, Y_predict)
226 |     cnf_matrix
227 | 
228 |     tn = cnf_matrix[0,0]
229 |     tp = cnf_matrix[1,1]
230 |     fn = cnf_matrix[1,0]
231 |     fp = cnf_matrix[0,1]
232 | 
233 |     bacc = ((tp/(tp+fn))+(tn/(tn+fp)))/2  # balanced accuracy
234 |     pre = tp/(tp+fp)  # precision / q+
235 |     rec = tp/(tp+fn)  # recall / sensitivity
236 |     sp=tn/(tn+fp)  # specificity
237 |     q_=tn/(tn+fn)  # negative predictive value
238 |     f1 = 2*pre*rec/(pre+rec)  # F1 score
239 |     mcc = ((tp*tn) - (fp*fn))/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))  # Matthews correlation coefficient
240 |     acc=(tp+tn)/(tp+fp+fn+tn)  # accuracy
241 |     fpr, tpr, thresholds =roc_curve(Y_test, Y_predict)
242 |     AUC = auc(fpr, tpr)
243 |     print('bacc:',bacc)
244 |     print('pre:',pre)
245 |     print('rec:',rec)
246 |     print('f1:',f1)
247 |     print('mcc:',mcc)
248 |     print('sp:',sp)
249 |     print('q_:',q_)
250 |     print('acc:',acc)
251 |     print('auc:',AUC)
--------------------------------------------------------------------------------
/DGCAN/DGCAN.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Apr 27 18:04:58 2022
4 | 
5 | @author:Jinyu-Sun
6 | """
7 | 
8 | #coding=utf-8
9 | import timeit
10 | import sys
11 | import numpy as np
12 | import math
13 | import torch
14 | import torch.nn as nn
15 | import torch.nn.functional as F
16 | import torch.optim as optim
17 | import pickle
18 | from sklearn.metrics import roc_auc_score, roc_curve
19 | from sklearn.metrics import confusion_matrix
20 | import preprocess as pp
21 | import pandas as pd
22 | import matplotlib.pyplot as plt
23 | 
24 | if torch.cuda.is_available():
25 |     device = torch.device('cuda')
26 | 
27 | else:
28 |     device = torch.device('cpu')
29 | 
30 | torch.cuda.empty_cache()
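# --- Reviewer's sketch (added in editing; `_demo_attention_scores` is a
# hypothetical helper, not part of D-GCAN): the layer below scores every node
# pair with e_ij = LeakyReLU(a^T [W h_i || W h_j]) and masks non-edges before
# the softmax. A minimal standalone version of that computation:
def _demo_attention_scores(h, adj, a, alpha=0.25):
    """h: [N, F'] transformed node features; adj: [N, N]; a: [2*F', 1]."""
    N, Fdim = h.size(0), h.size(1)
    pair = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)],
                     dim=1).view(N, N, 2 * Fdim)        # all [h_i || h_j] pairs
    e = F.leaky_relu(torch.matmul(pair, a).squeeze(2), negative_slope=alpha)
    e = e.masked_fill(adj <= 0, -9e10)  # non-edges vanish under the softmax
    return F.softmax(e, dim=1)          # row-normalized attention weights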
31 | class GraphAttentionLayer(nn.Module):
32 |     def __init__(self, in_features, out_features, dropout, alpha, concat=True):
33 |         super(GraphAttentionLayer, self).__init__()
34 |         self.dropout = dropout
35 |         self.concat = concat
36 |         self.in_features = in_features  # dimension of the input features
37 |         self.out_features = out_features  # dimension of the output features
38 |         self.alpha = alpha  # negative slope of the LeakyReLU
39 |         self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
40 | 
41 |         self.a = nn.Parameter(torch.zeros(size=(2 * out_features, 1)))
42 |         torch.nn.init.xavier_uniform_(self.W , gain=2.0)
43 |         #torch.nn.init.kaiming_uniform_(self.a, a=0, mode='fan_in', nonlinearity='leaky_relu')
44 |         torch.nn.init.xavier_uniform_(self.a , gain=1.9)  # initialize the attention vector as well
45 |         self.leakyrelu = nn.LeakyReLU(self.alpha)
46 | 
47 |     def forward(self, input, adj):
48 |         """
49 |         input: node feature matrix [N, in_features], one feature vector per node
50 |         adj: adjacency matrix of the graph [N, N]; non-zero entries mark connected node pairs
51 |         """
52 |         h = torch.mm(input, self.W)  # [N, out_features]
53 |         N = h.size()[0]  # number of nodes in the graph
54 |         a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * self.out_features)  # [N, N, 2*out_features]
55 |         e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))
56 |         zero_vec =-9e10 *torch.ones_like(e)
57 |         attention = torch.where(adj > 0, e, zero_vec)
58 |         # If an adjacency entry is greater than 0, the two nodes are connected and the attention coefficient at that position is kept.
59 |         # Otherwise the position is masked with a very large negative value, which the softmax then maps to (almost) zero.
60 |         attention = F.softmax(attention, dim=1)
61 |         attention = F.dropout(attention, self.dropout, training=self.training)
62 |         h_prime = torch.matmul(attention, h)
63 |         if self.concat:
64 |             return F.elu(h_prime)
65 |         else:
66 |             return h_prime
67 | 
68 | 
69 | class GAT(nn.Module):
70 |     def __init__(self, nfeat, nhid, dropout, alpha, nheads):
71 |         super(GAT, self).__init__()
72 |         """
73 |         nheads is the number of attention heads (GraphAttentionLayer instances); as in multi-head
74 |         self-attention, they extract features from different subspaces.
75 |         """
76 |         self.dropout = dropout
77 |         self.attentions = [GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True) for _ in
78 |                            range(nheads)]
79 |         for i, attention in enumerate(self.attentions):
80 |             self.add_module('attention_{}'.format(i), attention)
81 | 
82 |         self.out_att = GraphAttentionLayer(nhid,56, dropout=dropout, alpha=alpha, concat=False)
83 |         self.nheads=nheads
84 | 
85 |     def forward(self, x, adj):
86 |         x = F.dropout(x, self.dropout, training=self.training)
87 |         #x = torch.cat([att(x, adj) for att in self.attentions], dim=1)
88 |         # average the heads instead of concatenating them
89 |         z = self.attentions[0](x, adj)
90 |         for att in self.attentions[1:]:
91 |             z = torch.add(z, att(x, adj))
92 |         x = z/self.nheads
93 |         x = F.dropout(x, self.dropout, training=self.training)
94 |         x = F.elu(self.out_att(x, adj))
95 |         return F.softmax(x, dim=1)
96 | 
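# --- Reviewer's sketch (added in editing; `_demo_gat_forward` is a
# hypothetical helper, not part of D-GCAN): wiring the GAT defined above on a
# toy four-node graph; the output width 56 comes from GAT.out_att.
def _demo_gat_forward(dim=52):
    gat = GAT(nfeat=dim, nhid=dim, dropout=0.45, alpha=0.25, nheads=2).to(device)
    x = torch.randn(4, dim, device=device)   # four node feature vectors
    adj = torch.eye(4, device=device)        # toy adjacency with self-loops
    return gat(x, adj)                       # -> [4, 56]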
97 | class MolecularGraphNeuralNetwork(nn.Module):
98 |     def __init__(self, N_fingerprints, dim, layer_hidden, layer_output, dropout):
99 |         super(MolecularGraphNeuralNetwork, self).__init__()
100 |         self.layer_hidden=layer_hidden
101 |         self.layer_output=layer_output
102 |         self.embed_fingerprint = nn.Embedding(N_fingerprints, dim)
103 |         self.W_fingerprint = nn.ModuleList([nn.Linear(dim, dim) for _ in range(layer_hidden)])
104 | 
105 |         self.W_output = nn.ModuleList([nn.Linear(56,56) for _ in range(layer_output)])
106 |         self.W_property = nn.Linear(56, 2)
107 | 
108 |         self.dropout = dropout
109 |         self.alpha = 0.25
110 |         self.nheads = 2
111 |         self.attentions = GAT(dim, dim, dropout, alpha=self.alpha, nheads=self.nheads).to(device)
112 | 
113 |     def pad(self, matrices, pad_value):
114 |         """Pad the list of matrices
115 |         with a pad_value (e.g., 0) for batch processing.
116 |         For example, given a list of matrices [A, B, C],
117 |         we obtain a new matrix [A00, 0B0, 00C],
118 |         where 0 is the zero (i.e., pad value) matrix.
119 |         """
120 |         shapes = [m.shape for m in matrices]
121 |         M, N = sum([s[0] for s in shapes]), sum([s[1] for s in shapes])
122 |         zeros = torch.FloatTensor(np.zeros((M, N))).to(device)
123 |         pad_matrices = pad_value + zeros
124 |         i, j = 0, 0
125 |         for k, matrix in enumerate(matrices):
126 |             m, n = shapes[k]
127 |             pad_matrices[i:i + m, j:j + n] = matrix
128 |             i += m
129 |             j += n
130 |         return pad_matrices
131 | 
132 |     def update(self, matrix, vectors, layer):
133 |         hidden_vectors = torch.relu(self.W_fingerprint[layer](vectors))
134 | 
135 |         return hidden_vectors + torch.matmul(matrix, hidden_vectors)
136 | 
137 |     def sum(self, vectors, axis):
138 |         sum_vectors = [torch.sum(v, 0) for v in torch.split(vectors, axis)]
139 |         return torch.stack(sum_vectors)
140 | 
141 |     def gnn(self, inputs):
142 |         """Concatenate or pad each input for batch processing."""
143 |         Smiles, fingerprints, adjacencies, molecular_sizes = inputs
144 |         fingerprints = torch.cat(fingerprints)
145 |         adj = self.pad(adjacencies, 0)
146 |         """GNN layer (update the fingerprint vectors)."""
147 |         fingerprint_vectors = self.embed_fingerprint(fingerprints)
148 | 
149 |         for l in range(self.layer_hidden):
150 |             hs = self.update(adj, fingerprint_vectors, l)
151 |             fingerprint_vectors = F.normalize(hs, 2, 1)
152 |         """Attention layer"""
153 |         molecular_vectors = self.attentions(fingerprint_vectors, adj)
154 |         """Molecular vector by sum or mean of the fingerprint vectors."""
155 |         molecular_vectors = self.sum(molecular_vectors, molecular_sizes)
156 |         return Smiles, molecular_vectors
157 | 
158 |     def mlp(self, vectors):
159 |         """Classifier based on a multilayer perceptron."""
160 |         for l in range(self.layer_output):
161 | 
162 |             vectors = torch.relu(self.W_output[l](vectors))
163 |         outputs = torch.sigmoid(self.W_property(vectors))
164 |         return outputs
165 | 
166 |     def forward_classifier(self, data_batch, train):
167 | 
168 |         inputs = data_batch[:-1]
169 |         correct_labels = torch.cat(data_batch[-1])
170 | 
171 |         if train:
172 |             Smiles, molecular_vectors = self.gnn(inputs)
173 |             predicted_scores = self.mlp(molecular_vectors)
174 |             '''loss function'''
175 |             loss = F.cross_entropy(predicted_scores, correct_labels)
176 |             predicted_scores = predicted_scores.to('cpu').data.numpy()
177 |             predicted_scores = [s[1] for s in predicted_scores]
178 |             correct_labels = correct_labels.to('cpu').data.numpy()
179 |             return Smiles,loss, predicted_scores, correct_labels
180 |         else:
181 |             with torch.no_grad():
182 |                 Smiles, molecular_vectors = self.gnn(inputs)
183 |                 predicted_scores = self.mlp(molecular_vectors)
184 |                 loss = F.cross_entropy(predicted_scores, correct_labels)
185 |                 predicted_scores = predicted_scores.to('cpu').data.numpy()
186 |                 predicted_scores = [s[1] for s in predicted_scores]
187 |                 correct_labels = correct_labels.to('cpu').data.numpy()
188 | 
189 |             return Smiles, loss, predicted_scores, correct_labels
190 | 
191 | 
192 | class Trainer(object):
193 |     def __init__(self, model,lr,batch_train):
194 |         self.model = model
195 |         self.batch_train=batch_train
196 |         self.lr=lr
197 |         self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
198 | 
199 |     def train(self, dataset):
200 |         np.random.shuffle(dataset)
201 |         N = len(dataset)
202 |         loss_total = 0
203 |         SMILES,P, C = '',[], []
204 |         for i in range(0, N, self.batch_train):
205 |             data_batch = list(zip(*dataset[i:i + self.batch_train]))
206 |             Smiles,loss,
predicted_scores, correct_labels = self.model.forward_classifier(data_batch, train=True) 207 | SMILES += ' '.join(Smiles) + ' ' 208 | P.append(predicted_scores) 209 | C.append(correct_labels) 210 | self.optimizer.zero_grad() 211 | loss.backward() 212 | self.optimizer.step() 213 | loss_total += loss.item() 214 | tru = np.concatenate(C) 215 | pre = np.concatenate(P) 216 | AUC = roc_auc_score(tru, pre) 217 | SMILES = SMILES.strip().split() 218 | pred = [1 if i > 0.15 else 0 for i in pre] 219 | predictions = np.stack((tru, pred, pre)) 220 | return AUC, loss_total, predictions 221 | 222 | 223 | class Tester(object): 224 | def __init__(self, model,batch_test): 225 | self.model = model 226 | self.batch_test=batch_test 227 | def test_classifier(self, dataset): 228 | N = len(dataset) 229 | loss_total = 0 230 | SMILES, P, C = '', [], [] 231 | for i in range(0, N, self.batch_test): 232 | data_batch = list(zip(*dataset[i:i + self.batch_test])) 233 | (Smiles, loss, predicted_scores, correct_labels) = self.model.forward_classifier( 234 | data_batch, train=False) 235 | SMILES += ' '.join(Smiles) + ' ' 236 | loss_total += loss.item() 237 | P.append(predicted_scores) 238 | C.append(correct_labels) 239 | SMILES = SMILES.strip().split() 240 | tru = np.concatenate(C) 241 | pre = np.concatenate(P) 242 | pred = [1 if i >0.15 else 0 for i in pre] 243 | #AUC = roc_auc_score(tru, pre) 244 | cnf_matrix=confusion_matrix(tru,pred) 245 | tn = cnf_matrix[0, 0] 246 | tp = cnf_matrix[1, 1] 247 | fn = cnf_matrix[1, 0] 248 | fp = cnf_matrix[0, 1] 249 | acc = (tp + tn) / (tp + fp + fn + tn) 250 | # Tru=map(str,np.concatenate(C)) 251 | # Pre=map(str,np.concatenate(P)) 252 | # predictions = '\n'.join(['\t'.join(x) for x in zip(SMILES, Tru, Pre)]) 253 | predictions = np.stack((tru, pred, pre)) 254 | return acc, loss_total, predictions 255 | 256 | def save_result(self, result, filename): 257 | with open(filename, 'a') as f: 258 | f.write(result + '\n') 259 | 260 | def save_predictions(self, predictions, filename): 261 | with open(filename, 'w') as f: 262 | f.write('Smiles\tCorrect\tPredict\n') 263 | f.write(predictions + '\n') 264 | 265 | def save_model(self, model, filename): 266 | torch.save(model.state_dict(), filename) 267 | 268 | def dump_dictionary(dictionary, filename): 269 | with open('../DGCAN/model'+filename, 'wb') as f: 270 | pickle.dump(dict(dictionary), f) 271 | 272 | 273 | -------------------------------------------------------------------------------- /DGCAN/results/AUC.txt: -------------------------------------------------------------------------------- 1 | Epoch Time(sec) Loss_train Loss_test AUC_train AUC_test 2 | 1 7.2395853999999815 318.02376973629 33.23613902926445 0.6330387783115992 0.5 3 | 2 13.431995 275.59704649448395 28.36697283387184 0.7726390395642837 0.7757009345794392 4 | 3 19.53742679999999 258.0953543186188 26.692712754011154 0.8227923594659818 0.7827102803738317 5 | 4 25.411535499999985 244.29262351989746 25.99587020277977 0.8555440089003034 0.8247663551401869 6 | 5 31.270511900000002 235.61571648716927 26.206634640693665 0.8711811167445258 0.8271028037383178 7 | 6 37.1334444 235.10905063152313 24.74921104311943 0.8782201330617486 0.8504672897196262 8 | 7 43.15194819999999 230.60763642191887 24.482909947633743 0.8858224754798858 0.8644859813084113 9 | 8 49.21481349999999 225.9473716020584 25.168260991573334 0.894445889698231 0.8738317757009346 10 | 9 55.39354119999999 220.88472372293472 23.143073588609695 0.9038094737308211 0.8808411214953271 11 | 10 61.388609 220.29008296132088 
23.10640263557434 0.9080117951159034 0.8925233644859814 12 | 11 67.3816218 220.04156962037086 23.67304638028145 0.905873895764607 0.8714953271028038 13 | 12 73.3649585 214.85031658411026 23.34747040271759 0.9159177330794608 0.8551401869158879 14 | 13 79.32801410000002 212.33444252610207 23.14932319521904 0.9178444716275156 0.8714953271028038 15 | 14 85.31854019999997 211.54040449857712 22.778073489665985 0.9219235005645715 0.8901869158878505 16 | 15 91.33987619999999 208.26400744915009 22.901916056871414 0.9267551530985012 0.8995327102803738 17 | 16 98.36570660000001 209.3945328295231 23.913705557584763 0.9246417461863752 0.8878504672897196 18 | 17 104.39726789999997 206.03158766031265 23.282782286405563 0.930114906901056 0.8901869158878505 19 | 18 110.4006412 207.53857171535492 22.225304275751114 0.9226543992295261 0.9018691588785047 20 | 19 116.81378369999999 204.79183167219162 23.475462794303894 0.926265719441185 0.8785046728971962 21 | 20 122.6911399 205.36031165719032 22.78501933813095 0.9291473863661524 0.8878504672897196 22 | 21 128.57552929999997 202.3321330845356 23.385528802871704 0.9302150906635376 0.8855140186915887 23 | 22 134.4078144 202.55410113930702 23.08760157227516 0.9293607611751943 0.8925233644859814 24 | 23 140.28498439999998 198.95897144079208 22.36356022953987 0.9377836694932141 0.8948598130841121 25 | 24 146.0898661 197.13710144162178 23.3654263317585 0.9351247869019417 0.8785046728971962 26 | 25 151.9243611 256.36723348498344 22.31619429588318 0.7880461675559591 0.8901869158878505 27 | 26 157.7956577 199.1333883702755 22.19395723938942 0.9381437221865521 0.9018691588785047 28 | 27 163.5824619 195.61116680502892 21.885735362768173 0.9357744592290832 0.8995327102803738 29 | 28 169.4125838 196.21020331978798 21.808892458677292 0.937718217946731 0.9065420560747663 30 | 29 175.33045549999997 196.93134278059006 22.267054110765457 0.9385652135408595 0.897196261682243 31 | 30 181.20051949999998 195.89555063843727 22.040870487689972 0.9386104622844113 0.8995327102803738 32 | 31 187.0336099 194.0237057507038 22.781775504350662 0.9417760754533178 0.8714953271028038 33 | 32 192.90557769999998 193.68072113394737 22.449314266443253 0.9423293001527663 0.8948598130841121 34 | 33 198.73298769999997 192.5338954925537 22.377066612243652 0.9452480516748955 0.8785046728971962 35 | 34 204.81813569999997 192.58278796076775 23.285291463136673 0.9402174153696283 0.8808411214953271 36 | 35 210.6708656 196.01435166597366 24.061037868261337 0.9359749651294087 0.8691588785046729 37 | 36 216.5086068 193.9636361002922 22.313345968723297 0.936061864857086 0.8901869158878505 38 | 37 222.3077349 192.51033294200897 22.285043627023697 0.9435566896185268 0.9042056074766355 39 | 38 228.07614589999997 188.01407945156097 22.830698162317276 0.94882975955897 0.8995327102803738 40 | 39 233.88910299999998 193.91294729709625 22.711496233940125 0.9402640478668054 0.8808411214953271 41 | 40 239.76910729999997 192.2110168337822 21.79123494029045 0.9453036785706378 0.9042056074766355 42 | 41 245.54325319999998 189.3926584124565 23.183754086494446 0.9464263178869528 0.8925233644859814 43 | 42 251.29193049999998 197.67854461073875 24.210958123207092 0.9356472922709057 0.8644859813084113 44 | 43 257.07465279999997 195.8016073703766 22.462971657514572 0.9351026468439347 0.897196261682243 45 | 44 262.8480727 191.97943636775017 23.224840223789215 0.940738813735692 0.8878504672897196 46 | 45 268.5777391 190.848837941885 22.82283341884613 0.9461044567936768 0.897196261682243 47 | 46 274.3567221 190.04618108272552 
22.433310955762863 0.9440021199105542 0.897196261682243 48 | 47 280.1616897 190.5216095149517 22.426137387752533 0.9457633615250072 0.9042056074766355 49 | 48 285.9260854 185.92078268527985 22.22221177816391 0.9480603925432284 0.8995327102803738 50 | 49 291.66768609999997 187.782156676054 22.887968957424164 0.94423528239644 0.8901869158878505 51 | 50 297.4925078 187.28414443135262 21.483285009860992 0.9458896982310093 0.9088785046728972 52 | 51 303.2552989 185.18417713046074 21.38184556365013 0.9481489527752565 0.9205607476635514 53 | 52 309.0365314 182.16105404496193 24.673764526844025 0.9509743009276684 0.8714953271028038 54 | 53 314.8158657 188.7527618408203 23.513393253087997 0.9457229559191445 0.8878504672897196 55 | 54 320.5944295 185.709531635046 21.631520986557007 0.9463949066796555 0.9135514018691588 56 | 55 326.40547119999997 185.12931755185127 22.429152816534042 0.944120984346979 0.9018691588785047 57 | 56 332.1655878 182.88407680392265 21.58257967233658 0.9498509697345405 0.9135514018691588 58 | 57 337.97326309999994 182.04424741864204 21.475889027118683 0.9525705991099698 0.9182242990654206 59 | 58 343.78313679999997 182.934487760067 21.883195608854294 0.949658766355968 0.9088785046728972 60 | 59 349.55780849999996 184.17358297109604 21.290808767080307 0.9471534804171187 0.9182242990654206 61 | 60 355.38770680000005 181.42354640364647 21.597694754600525 0.949361674452587 0.9135514018691588 62 | 61 361.28093179999996 187.25566163659096 21.6785786151886 0.9452393340270551 0.9065420560747663 63 | 62 367.0356904 181.59250125288963 21.666670441627502 0.9521657127991676 0.9112149532710281 64 | 63 372.82640919999994 179.8839019536972 22.01644539833069 0.9538880709367459 0.9065420560747663 65 | 64 378.58320630000003 182.93770709633827 22.33838379383087 0.9484886642903003 0.8995327102803738 66 | 65 384.42532470000003 181.58496183156967 22.23741576075554 0.950153319901698 0.8995327102803738 67 | 66 390.227352 182.49673774838448 21.934344708919525 0.9507491642128103 0.9042056074766355 68 | 67 396.04737980000004 180.1727076768875 22.335491836071014 0.9534794484911551 0.9042056074766355 69 | 68 401.8080486 182.3468733727932 21.559545934200287 0.953210031660283 0.9158878504672897 70 | 69 407.56943720000004 177.6970148384571 21.813909739255905 0.9542144984169858 0.9158878504672897 71 | 70 413.3951455 179.4230616092682 21.47458705306053 0.9503143888236987 0.9182242990654206 72 | 71 419.26894319999997 187.4236896932125 21.997405976057053 0.9434714503952 0.9112149532710281 73 | 72 425.0120637 185.64457353949547 21.80859535932541 0.9493346912568912 0.9112149532710281 74 | 73 430.8028591 183.28448390960693 22.7678345143795 0.942662231275046 0.8995327102803738 75 | 74 436.5995785 181.35295176506042 21.99883532524109 0.9524445391546925 0.9088785046728972 76 | 75 442.4056693 180.42559936642647 22.20555028319359 0.9545189242145815 0.9065420560747663 77 | 76 448.17074230000003 177.4391260445118 21.684407979249954 0.9532330019704651 0.9088785046728972 78 | 77 453.95868340000004 187.34195244312286 22.080274641513824 0.9482340536232203 0.9042056074766355 79 | 78 459.8026899 184.663908213377 22.014076620340347 0.9470819403546837 0.9088785046728972 80 | 79 465.5365296 178.98830798268318 21.47413921356201 0.9541025527486882 0.9158878504672897 81 | 80 471.3356651 178.3373854458332 21.658688694238663 0.9525628500896672 0.9088785046728972 82 | 81 477.18991429999994 176.8597036600113 22.024581998586655 0.9535130737042532 0.9065420560747663 83 | 82 483.0233918 177.2030012011528 21.766023725271225 
0.957143212965218 0.9135514018691588 84 | 83 488.7872165 176.38141465187073 21.79708757996559 0.9572966712422787 0.9112149532710281 85 | 84 494.52968880000003 174.46399101614952 21.843281388282776 0.956292066110213 0.9112149532710281 86 | 85 500.3899477 175.65917918086052 21.409487038850784 0.9559244027719352 0.9182242990654206 87 | 86 506.2031879 176.77976202964783 21.49503728747368 0.9538743717758542 0.9228971962616822 88 | 87 512.0002034 179.85141596198082 21.427332252264023 0.9514658102154229 0.9158878504672897 89 | 88 517.7809181 178.52282038331032 21.403560250997543 0.9539066132353267 0.9158878504672897 90 | 89 523.6232049 177.05544209480286 21.411171078681946 0.9535158412115041 0.9135514018691588 91 | 90 529.4495936 176.31908676028252 21.366370409727097 0.955243734363584 0.9228971962616822 92 | 91 535.3322216 176.20382365584373 21.893729746341705 0.9558952055704386 0.9088785046728972 93 | 92 541.1628488 175.1233125925064 22.482848435640335 0.9574447328802002 0.9042056074766355 94 | 93 547.003017 176.93210792541504 21.549375027418137 0.9527253027652932 0.9135514018691588 95 | 94 552.8402735 173.00296890735626 21.5932075381279 0.9579943598202227 0.9205607476635514 96 | 95 558.6800595 179.9282302260399 23.50808882713318 0.9537645017379945 0.8878504672897196 97 | 96 564.5009782 174.56020081043243 22.648311734199524 0.9556187315960767 0.8995327102803738 98 | 97 570.3044254 176.7171704173088 21.891200184822083 0.9524900646489693 0.9088785046728972 99 | 98 576.1318849 178.38612964749336 22.245244562625885 0.9540746009254544 0.9088785046728972 100 | 99 581.9812099 177.73075929284096 24.06598174571991 0.9527554685943277 0.8551401869158879 101 | 100 587.8207973 177.71725061535835 23.09346652030945 0.9563492151349435 0.8925233644859814 102 | 101 593.7006995 173.766254901886 22.383565932512283 0.9588301470099851 0.8995327102803738 103 | 102 599.5459518 173.67894527316093 21.84225881099701 0.9581018774769188 0.9112149532710281 104 | 103 605.2735841 175.92625331878662 22.260210156440735 0.956732791639914 0.9065420560747663 105 | 104 611.1396088 174.88757956027985 21.68474268913269 0.95813674806828 0.9158878504672897 106 | 105 616.9215246 176.1177335381508 22.008816480636597 0.9573001306263422 0.9065420560747663 107 | 106 622.727265 174.49301874637604 21.83292892575264 0.9588928310492174 0.9088785046728972 108 | 107 628.5301017 172.7093889117241 21.64242872595787 0.9596258053446101 0.9135514018691588 109 | 108 634.310622 177.16424638032913 21.55477637052536 0.9525133117098767 0.9158878504672897 110 | 109 640.0827119 178.4102607667446 21.33096119761467 0.9554701164567051 0.9158878504672897 111 | 110 645.8604367 174.0306807756424 21.91256058216095 0.9572584796422166 0.9135514018691588 112 | 111 651.6639703 174.55561447143555 21.62583690881729 0.9593071268846725 0.9158878504672897 113 | 112 657.5478062 172.49658674001694 22.11896824836731 0.9582462029800518 0.9112149532710281 114 | 113 663.3088726999999 173.48215851187706 21.262643307447433 0.9584646976775077 0.9205607476635514 115 | 114 669.1280201 174.29942700266838 21.171885669231415 0.9568379569154472 0.9205607476635514 116 | 115 674.9632922 171.870591878891 21.214154481887817 0.9590825436712644 0.9205607476635514 117 | 116 680.7173928 176.74994710087776 22.71759131550789 0.9544714614652291 0.8901869158878505 118 | 117 686.4742894999999 184.73798117041588 21.845041394233704 0.9432161478513075 0.9112149532710281 119 | 118 692.2357881 177.2356958091259 21.59891825914383 0.9571596796333606 0.9158878504672897 120 | 119 698.004266 176.63044354319572 
21.46969723701477 0.9547083600859034 0.9158878504672897 121 | 120 703.704268 172.92364439368248 21.97585704922676 0.9586079161777404 0.9065420560747663 122 | 121 709.4959497 180.26964315772057 21.951832473278046 0.948673395399296 0.9088785046728972 123 | 122 715.3568411 173.81644931435585 21.459405571222305 0.9592527453671928 0.9205607476635514 124 | 123 721.1472172 176.33700492978096 21.565876573324203 0.9538397779352181 0.9205607476635514 125 | 124 726.9662815 172.69166892766953 21.18627032637596 0.9598193924768083 0.9205607476635514 126 | 125 732.7549476 171.38375091552734 21.528594940900803 0.9606258440897115 0.9158878504672897 127 | 126 738.5203703 171.18208953738213 22.116858184337616 0.9604675426749618 0.9088785046728972 128 | 127 744.3382166 171.14211875200272 22.178175538778305 0.9611412923151859 0.9112149532710281 129 | 128 750.1037539 169.92953234910965 21.887178242206573 0.9611555449775278 0.9112149532710281 130 | 129 755.8869365 174.96560329198837 22.6447791159153 0.9555907797728429 0.8995327102803738 131 | 130 761.6342917 178.70918104052544 22.300085812807083 0.9529475335975381 0.9042056074766355 132 | 131 767.4047745 174.0145247578621 22.113102048635483 0.9579599043549494 0.9088785046728972 133 | 132 773.1641292 172.59970355033875 21.501632899045944 0.9596043571634159 0.9182242990654206 134 | 133 779.0603237 172.19679167866707 22.107417851686478 0.9594648747979719 0.9088785046728972 135 | 134 784.8799782 173.3250037431717 23.72211918234825 0.9588380344056502 0.8761682242990654 136 | 135 790.6637747999999 173.15374860167503 22.544156223535538 0.9600561527221201 0.897196261682243 137 | 136 796.4695435 171.82129180431366 22.224039256572723 0.9606888048796689 0.9065420560747663 138 | 137 802.2469289 171.67953670024872 22.0196373462677 0.9615298503332077 0.9042056074766355 139 | 138 808.016359 172.37993958592415 22.44371086359024 0.9589339285318929 0.8995327102803738 140 | 139 813.7643964 174.99391075968742 21.463502824306488 0.9590761784045874 0.9158878504672897 141 | 140 819.5146855 174.17848363518715 21.192844033241272 0.959691256891093 0.9228971962616822 142 | -------------------------------------------------------------------------------- /DGCAN/predict.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # -*- coding: utf-8 -*- 4 | """ 5 | Created on Wed Apr 27 20:09:31 2022 6 | 7 | @author:Jinyu-Sun 8 | """ 9 | 10 | import timeit 11 | import sys 12 | import numpy as np 13 | import math 14 | import torch 15 | import torch.nn as nn 16 | import torch.nn.functional as F 17 | import torch.optim as optim 18 | import pickle 19 | from sklearn.metrics import roc_auc_score, roc_curve 20 | from sklearn.metrics import confusion_matrix 21 | import preprocess as pp 22 | import pandas as pd 23 | import matplotlib.pyplot as plt 24 | torch.cuda.empty_cache() 25 | if torch.cuda.is_available(): 26 | device = torch.device('cuda') 27 | 28 | else: 29 | device = torch.device('cpu') 30 | 31 | class GraphAttentionLayer(nn.Module): 32 | def __init__(self, in_features, out_features, dropout, alpha, concat=True): 33 | super(GraphAttentionLayer, self).__init__() 34 | self.dropout = dropout 35 | self.concat = concat 36 | self.in_features = in_features #dim of input feature 37 | self.out_features = out_features #dim of output feature 38 | self.alpha = alpha # negative_slope leakyrelu 39 | self.W = nn.Parameter(torch.zeros(size=(in_features, out_features))) 40 | self.a = nn.Parameter(torch.zeros(size=(2 * out_features, 1))) 41 | self.leakyrelu = 
nn.LeakyReLU(self.alpha) 42 | 43 | def forward(self, input, adj): 44 | """ 45 | input: node feature matrix of shape [N, in_features], where in_features is the length of each node's feature vector 46 | adj: adjacency matrix of the graph, shape [N, N]; non-zero entries mark connected nodes 47 | """ 48 | h = torch.mm(input, self.W) # [N, out_features] 49 | N = h.size()[0] # number of nodes in the graph 50 | a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * self.out_features) # [N, N, 2*out_features] 51 | e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2)) 52 | zero_vec = -9e10 * torch.ones_like(e) 53 | attention = torch.where(adj > 0, e, zero_vec) 54 | # If an adjacency-matrix entry is greater than 0, the two nodes are connected and the attention coefficient at that position is kept. 55 | # Otherwise the position is masked with a very large negative value, which softmax subsequently maps to (almost) zero. 56 | attention = F.softmax(attention, dim=1) 57 | attention = F.dropout(attention, self.dropout, training=self.training) 58 | h_prime = torch.matmul(attention, h) 59 | if self.concat: 60 | return F.elu(h_prime) 61 | else: 62 | return h_prime 63 | 64 | 65 | class GAT(nn.Module): 66 | def __init__(self, nfeat, nhid, dropout, alpha, nheads): 67 | super(GAT, self).__init__() 68 | """ 69 | nheads is the number of parallel GraphAttentionLayer heads; their outputs are averaged, 70 | similar to multi-head self-attention, to extract features from different subspaces. 71 | """ 72 | self.dropout = dropout 73 | self.attentions = [GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True) for _ in 74 | range(nheads)] 75 | for i, attention in enumerate(self.attentions): 76 | self.add_module('attention_{}'.format(i), attention) 77 | 78 | self.out_att = GraphAttentionLayer(nhid,56, dropout=dropout, alpha=alpha, concat=False) 79 | self.nheads=nheads 80 | 81 | def forward(self, x, adj): 82 | x = F.dropout(x, self.dropout, training=self.training) 83 | #x = torch.cat([att(x, adj) for att in self.attentions], dim=1) 84 | 85 | z = torch.zeros_like(self.attentions[0](x, adj)) # index 0 is safe for any nheads >= 1 86 | for att in self.attentions: 87 | z=torch.add(z, att(x, adj)) 88 | x = z/self.nheads 89 | x = F.dropout(x, self.dropout, training=self.training) 90 | x = F.elu(self.out_att(x, adj)) 91 | return F.softmax(x, dim=1) 92 | 93 | class MolecularGraphNeuralNetwork(nn.Module): 94 | def __init__(self, N_fingerprints, dim, layer_hidden, layer_output, dropout): 95 | super(MolecularGraphNeuralNetwork, self).__init__() 96 | self.layer_hidden=layer_hidden 97 | self.layer_output=layer_output 98 | self.embed_fingerprint = nn.Embedding(N_fingerprints, dim) 99 | self.W_fingerprint = nn.ModuleList([nn.Linear(dim, dim) for _ in range(layer_hidden)]) 100 | 101 | self.W_output = nn.ModuleList([nn.Linear(56,56) for _ in range(layer_output)]) 102 | self.W_property = nn.Linear(56, 2) 103 | 104 | self.dropout = dropout 105 | self.alpha = 0.25 106 | self.nheads = 2 107 | self.attentions = GAT(dim, dim, dropout, alpha=self.alpha, nheads=self.nheads).to(device) 108 | 109 | def pad(self, matrices, pad_value): 110 | """Pad the list of matrices 111 | with a pad_value (e.g., 0) for batch processing. 112 | For example, given a list of matrices [A, B, C], 113 | we obtain a new matrix [A00, 0B0, 00C], 114 | where 0 is the zero (i.e., pad value) matrix.
115 | """ 116 | shapes = [m.shape for m in matrices] 117 | M, N = sum([s[0] for s in shapes]), sum([s[1] for s in shapes]) 118 | zeros = torch.FloatTensor(np.zeros((M, N))).to(device) 119 | pad_matrices = pad_value + zeros 120 | i, j = 0, 0 121 | for k, matrix in enumerate(matrices): 122 | m, n = shapes[k] 123 | pad_matrices[i:i + m, j:j + n] = matrix 124 | i += m 125 | j += n 126 | return pad_matrices 127 | 128 | def update(self, matrix, vectors, layer): 129 | hidden_vectors = torch.relu(self.W_fingerprint[layer](vectors)) 130 | 131 | return hidden_vectors + torch.matmul(matrix, hidden_vectors) 132 | 133 | def sum(self, vectors, axis): 134 | sum_vectors = [torch.sum(v, 0) for v in torch.split(vectors, axis)] 135 | return torch.stack(sum_vectors) 136 | 137 | def gnn(self, inputs): 138 | """Cat or pad each input data for batch processing.""" 139 | Smiles, fingerprints, adjacencies, molecular_sizes = inputs 140 | fingerprints = torch.cat(fingerprints) 141 | adj = self.pad(adjacencies, 0) 142 | """GNN layer (update the fingerprint vectors).""" 143 | fingerprint_vectors = self.embed_fingerprint(fingerprints) 144 | 145 | for l in range(self.layer_hidden): 146 | hs = self.update(adj, fingerprint_vectors, l) 147 | fingerprint_vectors = F.normalize(hs, 2, 1) 148 | """Attention layer""" 149 | molecular_vectors = self.attentions(fingerprint_vectors, adj) 150 | """Molecular vector by sum or mean of the fingerprint vectors.""" 151 | molecular_vectors = self.sum(molecular_vectors, molecular_sizes) 152 | return Smiles, molecular_vectors 153 | 154 | def mlp(self, vectors): 155 | """Regressor based on multilayer perceptron.""" 156 | for l in range(self.layer_output): 157 | 158 | vectors = torch.relu(self.W_output[l](vectors)) 159 | outputs = torch.sigmoid(self.W_property(vectors)) 160 | return outputs 161 | 162 | def forward_classifier(self, data_batch): 163 | 164 | inputs = data_batch[:-1] 165 | correct_labels = torch.cat(data_batch[-1]) 166 | 167 | 168 | with torch.no_grad(): 169 | Smiles, molecular_vectors = self.gnn(inputs) 170 | predicted_scores = self.mlp(molecular_vectors) 171 | 172 | predicted_scores = predicted_scores.to('cpu').data.numpy() 173 | predicted_scores = [s[1] for s in predicted_scores] 174 | correct_labels = correct_labels.to('cpu').data.numpy() 175 | 176 | return Smiles,predicted_scores, correct_labels 177 | 178 | 179 | class Tester(object): 180 | def __init__(self, model,batch_test): 181 | self.model = model 182 | self.batch_test=batch_test 183 | def test_classifier(self, dataset): 184 | N = len(dataset) 185 | SMILES, P, C = '', [], [] 186 | for i in range(0, N, self.batch_test): 187 | data_batch = list(zip(*dataset[i:i + self.batch_test])) 188 | Smiles, predicted_scores, correct_labels = self.model.forward_classifier( data_batch) 189 | SMILES += ' '.join(Smiles) + ' ' 190 | 191 | P.append(predicted_scores) 192 | C.append(correct_labels) 193 | SMILES = SMILES.strip().split() 194 | tru = np.concatenate(C) 195 | pre = np.concatenate(P) 196 | pred = [1 if i >0.15 else 0 for i in pre] 197 | #AUC = roc_auc_score(tru, pre) 198 | cnf_matrix=confusion_matrix(tru,pred) 199 | tn = cnf_matrix[0, 0] 200 | tp = cnf_matrix[1, 1] 201 | fn = cnf_matrix[1, 0] 202 | fp = cnf_matrix[0, 1] 203 | acc = (tp + tn) / (tp + fp + fn + tn) 204 | # Tru=map(str,np.concatenate(C)) 205 | # Pre=map(str,np.concatenate(P)) 206 | # predictions = '\n'.join(['\t'.join(x) for x in zip(SMILES, Tru, Pre)]) 207 | predictions = np.stack((tru, pred, pre)) 208 | return acc, predictions 209 | 210 | def save_result(self, 
result, filename): 211 | with open(filename, 'a') as f: 212 | f.write(result + '\n') 213 | 214 | def save_predictions(self, predictions, filename): 215 | with open(filename, 'w') as f: 216 | f.write('Smiles\tCorrect\tPredict\n') 217 | f.write(predictions + '\n') 218 | 219 | def dump_dictionary(dictionary, filename): 220 | with open('../DGCAN/model'+filename, 'wb') as f: 221 | pickle.dump(dict(dictionary), f) 222 | def metrics(cnd_matrix): 223 | '''Evaluation Metrics''' 224 | 225 | tn = cnd_matrix[0, 0] 226 | tp = cnd_matrix[1, 1] 227 | fn = cnd_matrix[1, 0] 228 | fp = cnd_matrix[0, 1] 229 | 230 | bacc = ((tp / (tp + fn)) + (tn / (tn + fp))) / 2 # balanced accuracy 231 | pre = tp / (tp + fp) # precision/q+ 232 | rec = tp / (tp + fn) # recall/sensitivity 233 | sp = tn / (tn + fp) 234 | q_ = tn / (tn + fn) 235 | f1 = 2 * pre * rec / (pre + rec) # F1 score 236 | mcc = ((tp * tn) - (fp * fn)) / math.sqrt( 237 | (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) # Matthews correlation coefficient 238 | acc = (tp + tn) / (tp + fp + fn + tn) # accuracy 239 | 240 | print('bacc:', bacc) 241 | print('pre:', pre) 242 | print('rec:', rec) 243 | print('f1:', f1) 244 | print('mcc:', mcc) 245 | print('sp:', sp) 246 | print('q_:', q_) 247 | print('acc:', acc) 248 | 249 | 250 | def predict(test_name, property, radius, dim, layer_hidden, layer_output, dropout, batch_train, 251 | batch_test, lr, lr_decay, decay_interval, iteration, N): 252 | ''' 253 | 254 | Parameters 255 | ---------- 256 | data_test = '../dataset/data_test.txt', #test set 257 | radius = 1, #radius of the subgraph (number of hops): 1 or 2 258 | dim = 52, #dimension of the graph convolution layers 259 | layer_hidden = 4, #number of graph convolution layers 260 | layer_output = 10, #number of dense layers 261 | dropout = 0.45, #dropout rate: 0-1 262 | batch_train = 8, #batch size of the training set 263 | batch_test = 8, #batch size of the test set 264 | lr =3e-4, #learning rate: 1e-5,1e-4,3e-4, 5e-4, 1e-3, 3e-3,5e-3 265 | lr_decay = 0.85, #learning-rate decay factor: 0.5, 0.75, 0.85, 0.9 266 | decay_interval = 25,#number of iterations between learning-rate decays: 10,25,30,50 267 | iteration = 140, #number of training iterations 268 | N = 5000, #length of the fingerprint embedding: 2000,3000,5000,7000 269 | dataset_train = ('../dataset/data_train.txt') #training set 270 | 271 | Returns 272 | ------- 273 | res_dev 274 | Prediction results 275 | 276 | ''' 277 | (radius, dim, layer_hidden, layer_output, 278 | batch_train, batch_test, decay_interval, 279 | iteration) = map(int, [radius, dim, layer_hidden, layer_output, 280 | batch_train, batch_test, 281 | decay_interval, iteration]) 282 | # dropout must stay a float; casting it to int (as the original did) silently sets it to 0 283 | dropout, lr, lr_decay = map(float, [dropout, lr, lr_decay]) 284 | if torch.cuda.is_available(): 285 | device = torch.device('cuda') 286 | print('The code uses a GPU!') 287 | else: 288 | device = torch.device('cpu') 289 | print('The code uses a CPU...') 290 | 291 | 292 | path = '' 293 | dataname = '' 294 | torch.manual_seed(0) 295 | model = MolecularGraphNeuralNetwork( 296 | N, dim, layer_hidden, layer_output, dropout).to(device) 297 | model.load_state_dict(torch.load(r'model/model.pth')) 298 | model.eval() 299 | tester = Tester(model,batch_test) 300 | dataset_dev=pp.create_testdataset(test_name, path, dataname,property) 301 | np.random.seed(0) 302 | #np.random.shuffle(dataset_dev) 303 | prediction_dev, dev_res = tester.test_classifier(dataset_dev) 304 | if property == True: 305 | res_dev = dev_res.T 306 | cnd_matrix=confusion_matrix(res_dev[:,0], res_dev[:,1]) 307 | cnd_matrix 308 | metrics(cnd_matrix)
309 | elif property == False: 310 | res_dev = dev_res.T[:,1] 311 | 312 | return res_dev 313 | -------------------------------------------------------------------------------- /dataset/bRo5.txt: -------------------------------------------------------------------------------- 1 | CC1=NC=C(N=C1)C(=O)NCCC1=CC=C(C=C1)S(=O)(=O)NC(=O)NC1CCCCC1 1 2 | CN1CCN(CC1)C(=O)O[C@@H]1N(C(=O)C2=NC=CN=C12)C1=NC=C(Cl)C=C1 1 3 | [H][C@@]12CCO[C@]1([H])OC[C@@H]2OC(=O)N[C@@H](CC1=CC=CC=C1)[C@H](O)CN(CC(C)C)S(=O)(=O)C1=CC=C(N)C=C1 1 4 | COC1=C(OC)C=C2[C@@H](CN(C)CCCN3CCC4=CC(OC)=C(OC)C=C4CC3=O)CC2=C1 1 5 | [H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)[C@H](C(O)=O)C1=CSC=C1)C(O)=O 1 6 | CC[N+](C)(CC)CCOC(=O)C1C2=CC=CC=C2OC2=CC=CC=C12 1 7 | CC(C)O 1 8 | COC1=CC2=C(NC(=N2)[S@@](=O)CC2=NC=C(C)C(OC)=C2C)C=C1 1 9 | COC1=C(OC)C=C2C(N)=NC(=NC2=C1)N1CCN(CC1)C(=O)C1CCCO1 1 10 | CCCCC(=O)N(CC1=CC=C(C=C1)C1=CC=CC=C1C1=NNN=N1)[C@@H](C(C)C)C(O)=O 1 11 | CN(CCCCCCCCCCN(C)C(=O)OC1=CC=CC(=C1)[N+](C)(C)C)C(=O)OC1=CC=CC(=C1)[N+](C)(C)C 1 12 | NC[C@H](O)C1=CC(O)=C(O)C=C1 1 13 | C[N+]1(C)CCC(C1)OC(=O)C(O)(C1CCCC1)C1=CC=CC=C1 1 14 | CCC1(CCC(=O)NC1=O)C1=CC=CC=C1 1 15 | OCC(O)COC1=CC=C(Cl)C=C1 1 16 | O=[N+]([O-])c1cc([N+](=O)[O-])c(O)c([N+](=O)[O-])c1 1 17 | NC1=NC(=O)C2=C(N1)N(CCC(CO)CO)C=N2 1 18 | CC1=CC(CN2CCC(CC2)=C2C3=CC=C(Cl)C=C3CCC3=C2N=CC=C3)=CN=C1 1 19 | CSCCNC1=C2N=CN([C@@H]3O[C@H](COP(O)(=O)OP(O)(=O)C(Cl)(Cl)P(O)(O)=O)[C@@H](O)[C@H]3O)C2=NC(SCCC(F)(F)F)=N1 1 20 | CCC(C1=CC=CC=C1)C1=C(O)C2=C(OC1=O)C=CC=C2 1 21 | CS(=O)(=O)OCCCCOS(C)(=O)=O 1 22 | CN1[C@@H](CNC2=CC=C(C=C2)C(=O)N[C@@H](CCC(O)=O)C(O)=O)CNC2=C1C(=O)N=C(N)N2 1 23 | CC(C)(C)C(=O)OCOP(=O)(COCCN1C=NC2=C(N)N=CN=C12)OCOC(=O)C(C)(C)C 1 24 | CNC[C@H](O)C1=CC(O)=C(O)C=C1 1 25 | CC(C)(C)NCC(O)C1=CC(Cl)=C(N)C(Cl)=C1 1 26 | CCCCOC1=CC=C(OCCCN2CCOCC2)C=C1 1 27 | CC1(C)C[C@@H]1C(=O)N\C(=C/CCCCSC[C@H](N)C(O)=O)C(O)=O 1 28 | CC(C)[C@@H](C)\C=C\[C@@H](C)[C@@]1([H])CC[C@@]2([H])\C(CCC[C@]12C)=C\C=C1\C[C@@H](O)CCC1=C 1 29 | CN1C[C@@H](C2=CC(=CC=C2)S(=O)(=O)NCCOCCOCCNC(=O)NCCCCNC(=O)NCCOCCOCCNS(=O)(=O)C2=CC(=CC=C2)[C@@H]2CN(C)CC3=C2C=C(Cl)C=C3Cl)C2=C(C1)C(Cl)=CC(Cl)=C2 1 30 | CN1CCOC(C2=CC=CC=C2)C2=CC=CC=C2C1 1 31 | CC(C(O)=O)C1=CC(OC2=CC=CC=C2)=CC=C1 1 32 | [H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)[C@H](NC(=O)N1CCN(CC)C(=O)C1=O)C1=CC=CC=C1)C(O)=O 1 33 | CC(C)NCC(O)C1=CC(O)=C(O)C=C1 1 34 | [H][C@@]12CCCN1C(=O)[C@H](CC(C)C)N1C(=O)[C@](NC(=O)[C@H]3CN(C)[C@]4([H])CC5=CNC6=CC=CC(=C56)[C@@]4([H])C3)(O[C@@]21O)C(C)C 1 35 | [H][C@]12CC[C@]([H])(C[C@@H](C1)OC(=O)C(O)C1=CC=CC=C1)N2C 1 36 | NC(N)=N 1 37 | OC1=C(CC2=C(O)C(Cl)=CC(Cl)=C2Cl)C(Cl)=C(Cl)C=C1Cl 1 38 | C\C(N(CC1=CN=C(C)NC1=N)C=O)=C(\CCO)SSCC1CCCO1 1 39 | ONC(=O)\C=C\C1=CC=CC(=C1)S(=O)(=O)NC1=CC=CC=C1 1 40 | CN1C(C(=O)NC2=CC=CC=N2)=C(O)C2=C(C=C(Cl)S2)S1(=O)=O 1 41 | CCOC1=NC2=C(N1CC1=CC=C(C=C1)C1=CC=CC=C1C1=NOC(=O)N1)C(=CC=C2)C(=O)OCC1=C(C)OC(=O)O1 1 42 | C[N+]1=C(\C=N\O)C=CC=C1 1 43 | NC(=N)NC(N)=N 1 44 | CC(C)=C[C@@H]1[C@@H](C(=O)OC2CC(=O)C(CC=C)=C2C)C1(C)C 1 45 | CCCCN1CCCCC1C(=O)NC1=C(C)C=CC=C1C 1 46 | FC(F)(F)CNC(=O)C1(CCCCN2CCC(CC2)NC(=O)C2=C(C=CC=C2)C2=CC=C(C=C2)C(F)(F)F)C2=CC=CC=C2C2=CC=CC=C12 1 47 | NC1=CC=C(C=C1)C(O)=O 1 48 | [H][C@@]12C[C@@H](C)[C@](OC(=O)CC)(C(=O)SCF)[C@@]1(C)C[C@H](O)[C@@]1(F)[C@@]2([H])C[C@H](F)C2=CC(=O)C=C[C@]12C 1 49 | CCCCCOC1=CC=C(C=C1)C1=CC(=NO1)C1=CC=C(C=C1)C(=O)N[C@H]1C[C@@H](O)[C@@H](O)NC(=O)[C@@H]2[C@@H](O)[C@@H](C)CN2C(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](NC1=O)[C@@H](C)O)[C@H](O)[C@@H](O)C1=CC(OS(O)(=O)=O)=C(O)C=C1)[C@H](O)CC(N)=O 1 50 | C=CC[C@@H](CCC)C(=O)O 1 51 | 
[H][C@@]12CC[C@](OC(=O)CCC)(C(=O)CO)[C@@]1(C)C[C@H](O)[C@@]1([H])[C@@]2([H])CCC2=CC(=O)CC[C@]12C 1 52 | OC(C(O)=O)C1=CC=CC=C1 1 53 | [H][C@@]12OC3=C(O)C=CC4=C3[C@@]11CCN(C)[C@]([H])(C4)[C@]1([H])C=C[C@@H]2O 1 54 | COC1=CC=CC(=C1)N1CCN(CC1)C1=NC2=C(C=CC=C2F)[C@H](CC(O)=O)N1C1=CC(=CC=C1OC)C(F)(F)F 1 55 | [H][C@]1(O)CO[C@]2([H])[C@]([H])(O)CO[C@]12[H] 1 56 | COc1cc2c(cc1OC)[C@@H]1C[C@H](O)[C@@H](CC(C)C)CN1CC2 1 57 | CC[C@@H](c1ccccc1)[C@H](c1ccc(OCCNC)cc1)c1ccc(O[C@@H]2O[C@H](C(=O)O)[C@@H](O)[C@@H](O)[C@@H]2O)cc1 1 58 | C[C@@H]1CNc2c(cccc2S(=O)(=O)N[C@@H](CCCNC(=N)N)C(=O)N2CC[C@@H](C)C[C@@H]2C(=O)O)C1 1 59 | O=C[C@H](O)[C@@H](O)[C@H](O)CO 1 60 | O=C1c2cccc(O)c2C(=O)c2c(O)cccc21 1 61 | C[C@]12CC[C@@H]3c4ccc(O)c(O)c4CC[C@H]3[C@@H]1CC[C@@H]2O 1 62 | CCNCC#CCOC(=O)[C@](O)(c1ccccc1)C1CCCCC1 1 63 | CCN(CC)CC(=O)Nc1c(C)ccc(O)c1C 1 64 | NC(=O)c1c[nH]c(=O)cn1 1 65 | C[C@H]1O[C@@H](n2cc(F)c(=O)[nH]c2=O)[C@H](O)[C@@H]1O 1 66 | C=C1CC[C@@]2(O)[C@H]3Cc4ccc(O)c5c4[C@@]2(CCN3CC2CC2)[C@H]1O5 1 67 | CCN(CC)C(=O)N1CC[N+](C)([O-])CC1 1 68 | COc1ccccc1O 1 69 | CCCCCCCCCCCC[N+](C)(C)CCOc1ccccc1 1 70 | C[N+]1([O-])CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1 1 71 | CCCCCc1cc(O)c2c(c1)OC(C)(C)[C@H]1CC[C@]3(C)O[C@@H]3[C@@H]21 1 72 | NC(=O)N1c2ccccc2[C@H](O)[C@@H](O)c2ccccc21 1 73 | CC(C)(O)CNc1nc(Nc2ccnc(C(F)(F)F)c2)nc(-c2cccc(C(F)(F)F)n2)n1 1 74 | CCCCCCCCCCC(=O)O[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(C)[C@H]3CC[C@]12C 1 75 | O[Si](O)(O)O 1 76 | CN1C(=O)NC(=O)[C@@](C)(C2=C[C@@H](O)CCC2)C1=O 1 77 | C[C@@H]([C@H](O)c1ccccc1)N(C)C 1 78 | O=C1N[C@H](O)[C@@H](c2ccccc2)CO1 1 79 | Nc1nc2c(c(=O)[nH]1)N[C@@H](CNc1ccc(C(=O)N[C@H](CCC(=O)O)C(=O)O)cc1)CN2 1 80 | O=[N+]([O-])OCC(CO[N+](=O)[O-])(CO[N+](=O)[O-])CO[N+](=O)[O-] 1 81 | C[C@H](NC(=O)[C@@H](Cc1ccc2ccccc2c1)NC(=O)[C@@H](C)N)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CCCCN)C(N)=O 1 82 | CC1=C(/C=C/C(C)=C/C=C/C(C)=C/C(=O)O)C(C)(C)CCC1=O 1 83 | OCCN1CCOCC1 1 84 | COC(=O)c1ccc(C)cc1O 1 85 | COc1cc(C[C@@](C)(O)C(=O)O)ccc1O 1 86 | CN1CCCN=C1/C=C/c1cccs1 1 87 | CCc1nc(C2CC2)c(C(N)=O)n1Cc1ccc2oc(-c3ccccc3NS(=O)(=O)C(F)(F)F)c(Br)c2c1 1 88 | NC[C@H](O)COc1ccccc1C(=O)CCc1ccccc1 1 89 | O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(OC(F)(F)[C@@H](F)C(F)(F)F)cc1Cl 1 90 | CC(C)C(=O)O[C@H](C)OC(=O)NCC1(CC(=O)O)CCCCC1 1 91 | CN(C(=O)CO)c1c(I)c(C(=O)NC[C@H](O)CO)c(I)c(C(=O)NC[C@H](O)CO)c1I 1 92 | C[C@]12C=CC(=O)C=C1CC[C@@H]1[C@@H]2CC[C@]2(C)C(=O)CC[C@@H]12 1 93 | CC(C)(C)[C@@H]1NC(=O)O[C@@H]2CCC[C@H]2OC/C=C/C(F)(F)c2nc3ccccc3nc2O[C@@H]2C[C@@H](C(=O)N[C@]3(C(=O)NS(=O)(=O)C4(C)CC4)C[C@H]3C(F)F)N(C2)C1=O 1 94 | CCN(CC)CCc1nc(-c2ccccc2)no1 1 95 | CN1C(=O)CC[C@H]1c1ccc[n+]([O-])c1 1 96 | CNCc1ccc(-c2[nH]c3cc(F)cc4c3c2CCNC4=O)cc1 1 97 | O=C(O[C@@H]1CN2CCC1CC2)c1ccccc1 1 98 | CC(C)[C@@H](C(=O)O)N(Cc1ccc(-c2ccccc2-c2nnn[nH]2)cc1)C(=O)CC[C@@H](C)O 1 99 | O=C1CN2Cc3c(ccc(Cl)c3Cl)NC2=N1 1 100 | CC#CC[C@@H](C)[C@H](O)/C=C/[C@@H]1[C@@H](O)C[C@@H]2C/C(=C/CCCC(=O)OCC(=O)c3ccccc3)C[C@H]21 1 101 | C[C@@H](c1cc2ccccc2s1)N(O[C@@H]1O[C@H](C(=O)O)[C@@H](O)[C@H](O)[C@H]1O)C(N)=O 1 102 | CCN(CC)CCOCCOC(=O)C(CC)(CC)c1ccccc1 1 103 | O=C(CSc1ccncc1)N[C@H]1C(=O)N2C(C(=O)O)=C(CO)CS[C@H]12 1 104 | COc1cc2c(cc1OC)[C@@H]1C[C@@H](O)[C@@H](CC(C)C)CN1CC2 1 105 | COc1ccc2[nH]c(C)c(CC(=O)O)c2c1 1 106 | CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](C(=O)Oc3ccc4c(c3)CCC4)c3ccccc3)C(=O)N2[C@H]1C(=O)O 1 107 | [O-][n+]1ccccc1SSc1cccc[n+]1[O-] 1 108 | Cc1cc(OS(=O)(=O)O)c2ccccc2c1OS(=O)(=O)O 1 109 | CCCCCCCCCCCC(=O)OCN1C(=O)CCc2ccc(OCCCCN3CCN(c4cccc(Cl)c4Cl)CC3)cc21 1 110 | CCCCCCc1ccc(O)cc1O 1 111 | C[C@H](I)[C@H]1OC[C@@H](CO)O1 1 112 | 
CN1CCC[C@@H]1c1ccc[n+](C)c1 1 113 | C[C@H](CCC(=O)O)[C@H]1CC[C@H]2[C@@H]3C(=O)C[C@@H]4CC(=O)CC[C@]4(C)[C@H]3CC(=O)[C@@]21C 1 114 | CCC[C@@H](Cc1ccccc1)N1CCCC1 1 115 | CCCNC(=O)NS(=O)(=O)c1ccc(Cl)c(O)c1 1 116 | O=C(O)COc1ccc(C(=O)c2cccs2)c(Cl)c1Cl 1 117 | CNCCN1C(=O)[C@H](OC(C)=O)[C@@H](c2ccc(OC)cc2)Sc2ccccc21 1 118 | CCCCCCCCCC[N+](C)(C)CCCCCCCCCC 1 119 | O=C(O)CCCCCCNC1c2ccccc2CCc2ccccc21 1 120 | NC(=O)N1c2ccccc2C=Cc2cc(O)ccc21 1 121 | C/C(=C(/CCO)SSC[C@H]1CCCO1)N(C=O)Cc1cnc(C)nc1N 1 122 | O=C1NC(=O)[C@@H](N2C(=O)c3ccccc3C2=O)C[C@@H]1O 1 123 | CC(C)(C)NC[C@H](O)COc1cccc2c1SCCC2 1 124 | CN1C(=O)NC(=O)[C@](C)([C@]23CCCC[C@@H]2O3)C1=O 1 125 | N[C@H](CO)C(=O)NNCc1ccc(O)c(O)c1O 1 126 | OC[C@H]1O[C@](O)(CO[C@]2(CO)O[C@H](CO)[C@@H](O)[C@@H]2O)[C@@H](O)[C@@H]1O 1 127 | Nc1nccs1 1 128 | CCn1cc(C(=O)O)c(=O)c2cnc(N3CCCC3)nc21 1 129 | CN(Cc1nc2c(N)[nH]c(=N)[nH]c-2nc1=O)c1ccc(C(=O)N[C@@H](CCC(=O)O)C(=O)O)cc1 1 130 | C[C@]12C[C@H]3C[C@@](N)(C1)C[C@@](CO)(C3)C2 1 131 | CNCCC[C@](C#N)(c1ccc(OC)c(OC)c1)C(C)C 1 132 | CN1C(=O)[C@H](O[C@@H]2O[C@H](C(=O)O)[C@@H](O)[C@H](O)[C@H]2O)C[C@H]1c1cccnc1 1 133 | CCOc1c(N)c2c(c(OCC)c1OCC)[C@H]([C@@H]1c3c(cc4c(c3OC)OCO4)CCN1C)OC2=O 1 134 | O=C(c1ccc(O)cc1)c1ccc2n1CC[C@@H]2C(=O)O 1 135 | CN(c1nccc(=O)[nH]1)C1CCN(c2nc3ccccc3n2Cc2ccc(F)cc2)CC1 1 136 | CCCc1nnc(NC(=O)[C@@]2(C)Sc3ccccc3N(C)C2=O)s1 0 137 | COc1ccccc1OC(=O)Cn1nc(C)c([N+](=O)[O-])c1C 0 138 | Cc1ccc2c(c1)C(=O)NC[C@]1(CC[C@H](N(C)C(=O)c3ccno3)CC1)O2 0 139 | CC(=O)N[C@@H](C)C(=O)Nc1ccccc1N1CCCCC1 0 140 | CCn1/c(=N/C(=O)c2ccc([N+](=O)[O-])s2)sc2cc([N+](=O)[O-])ccc21 0 141 | O=C(Nc1cccc(N2CCCC2)c1)c1ccccc1 0 142 | O=C([C@H]1CCCO1)N1CCN(Cc2cn3cc(Cl)ccc3n2)CC1 0 143 | CN(C(=O)C[N@H+]1CC[C@H](C(=O)NC2CCCCC2)CC1)c1ccccc1 0 144 | O=C(CN1CCN(Cc2ccc(Cl)s2)CC1)Nc1cccc(S(=O)(=O)N2CCCCC2)c1 0 145 | COc1ccc2sc(N(Cc3ccccc3)C(=O)C34C[C@H]5C[C@@H](C3)C[C@@H](C4)C5)nc2c1 0 146 | COc1ccc(OC)c2sc(N(CCCN(C)C)C(=O)[C@H]3CC(=O)N(c4ccc(F)cc4)C3)nc12 0 147 | Cc1nc(-c2ccc(F)cc2)n(CC(=O)N(C)C2CCCCC2)c(=O)c1CCO 0 148 | Cn1c(C[N@@H+]2CC[C@H](CNc3ncccc3C#N)C2)nc2ccccc21 0 149 | CCCNC(=O)[C@H]1CCCN1C(=O)/C=C/c1ccc(SC(F)(F)F)cc1 0 150 | COc1cc(C(=O)OCC(=O)NC2CC2)ccc1OCCC(C)C 0 151 | NC(=O)[C@H]1CC(C(=O)Nc2ncn[n-]2)=NN1c1ccc(F)cc1 0 152 | Cn1c(=O)c2ccccc2n(CC(=O)NCc2ccccc2Cl)c1=O 0 153 | C[C@](O)(CNC(=O)c1cc2ccccc2o1)C1CC1 0 154 | COc1ccccc1C(=O)OCC(=O)N[C@@H](C)c1ccccc1 0 155 | COc1cccc(-c2nn(-c3ccc(C)cc3)cc2C(=O)N2CCC(N(C)C)CC2)c1 0 156 | CC(=O)Nc1ccc(/C(C)=N/O[C@@H](C)C(=O)Nc2ccc(F)c(F)c2)cc1 0 157 | COC(=O)C[C@@H]1CN(Cc2c(Cl)nc3sccn23)CCO1 0 158 | Cc1ccc(N2C[C@H](C(=O)N3CCC[C@H]([C@@H](C)O)C3)CCC2=O)cc1C 0 159 | CC[C@H](C)c1ccc(NC(=O)CSc2nnc(-c3cccnc3)n2N)cc1 0 160 | CC(=O)Nc1ccc(-c2nnc3n(Cc4ccc(C)cc4)c(=O)c4ccccc4n23)cc1 0 161 | CC(C)CNC(=O)C[C@H]1CSc2nc3c(cnn3-c3ccc(Cl)cc3)c(=O)n21 0 162 | O[C@]1(c2ccccc2)CC[N@H+](CCOc2cccc(Cl)c2)CC1 0 163 | CCc1nc2n(n1)CCC[C@@H]2NC(=O)N[C@@H]1CCC(C)(C)c2ccccc21 0 164 | C/C(=N\NC(=O)COc1ccccc1[N+](=O)[O-])c1ccc(-c2ccccc2)cc1 0 165 | O=C(CSc1ccc(F)cc1)N1CCN(c2nc3ccc(Br)cc3s2)CC1 0 166 | COCc1ccc(C[N@H+]2CCCC[C@@H]2CNC(=O)c2ccccc2)cc1 0 167 | Cc1cccc(Cl)c1OCc1cc(=O)n(C)c(=O)n1C 0 168 | CCOC(=O)CSc1nnc(-c2ccccc2[N+](=O)[O-])o1 0 169 | O=C(CC1=CCCCC1)NC[C@@H]1Cc2cc(F)cc(-c3cncnc3)c2O1 0 170 | CN(C)CCN(C(=O)c1ccc([N+](=O)[O-])cc1)c1nc2ccc(OC(F)(F)F)cc2s1 0 171 | CC(C)N(C(=O)/C=C/c1cncc(F)c1)C1CCC1 0 172 | Cc1cc(C)n2nc(C(=O)N/N=C/c3ccc(Cl)c(Cl)c3)nc2n1 0 173 | O=C(Nc1ccccc1F)c1nc(SCc2ccc(Cl)cc2)ncc1Br 0 174 | CCOC(=O)C1=C(c2ccccc2)N=c2s/c(=C\c3cc4c(cc3C)OCO4)c(=O)n2[C@H]1c1ccc(Cl)cc1 0 175 | 
COc1ccc(Br)c(C(=O)NCCCN2CCc3ccccc3C2)c1 0 176 | COc1ccc(S[C@@H](C)C(=O)Nc2ccc(CN3CCCC3)cc2)cc1 0 177 | CC(C)C[C@@H](N[C@@H](C)C(=O)Nc1ccc(S(N)(=O)=O)cc1)c1ccccc1 0 178 | C[C@H](NC(=O)NC[C@H](O)c1cccc(F)c1)c1ccc(F)cc1 0 179 | COc1ccc([C@@H](C)NCc2cccc(OC)c2OC)c(F)c1 0 180 | Cn1c(SCC(=O)N(c2ccccc2)[C@H]2CCS(=O)(=O)C2)nnc1C1CC1 0 181 | Cn1c(CSc2ncccn2)nnc1SCC(=O)O 0 182 | CC1=C(C#N)C(=O)N(C2CCCC2)C(=O)/C1=C/Nc1ccc([N+](=O)[O-])cc1C 0 183 | COc1cc([C@H]2Nc3ccc(OCCC(C)C)cc3[C@@H]3C=CC[C@@H]23)ccc1OC(C)=O 0 184 | CC(=O)NCC(=O)N1CCN(CC(=O)NC(C)C)CC1 0 185 | CCCc1ccc(OCC(=O)c2ccc3c(c2)N(CC(=O)NCc2cccnc2)C(=O)CO3)cc1 0 186 | COc1ccc(S(=O)(=O)N(C)c2ccc(C(=O)Nc3ccccc3SC)cc2)cc1OC 0 187 | Cc1cc([C@H]2CCC[N@@H+]2Cc2ccccc2Br)on1 0 188 | O=[N+]([O-])c1ccccc1-c1ccc(CN2CCC(n3cncn3)CC2)s1 0 189 | COc1cc([N+](=O)[O-])ccc1NC(=O)[C@@H]1CCCC[C@H]1C(=O)O 0 190 | Cc1ccc(OCc2cccc(C(=O)N(C3CCCC3)C3CC3)c2)cn1 0 191 | COc1ccc(CNC(=O)NCc2ccc(Cn3cnc4ccccc43)cc2)cc1 0 192 | CC1(C)CCc2cc(S(=O)(=O)NCc3csc(-c4cccs4)n3)ccc2O1 0 193 | CCn1c(SCC(=O)Nc2ccc(Br)cc2C(=O)O)nnc1C1CC1 0 194 | CC(=O)c1cccc(NC(=O)[C@H](N[C@H](C)c2ccc(F)cc2)c2ccccc2)c1 0 195 | C[C@@H](NC(=O)COc1ccc2c3c(c(=O)oc2c1)CCCC3)C(=O)NCC(=O)NCC(=O)O 0 196 | CC(C)(C)c1n[nH]c([C@H]2CN(C(=O)CCn3cncn3)CCO2)n1 0 197 | Cc1nc(C(C)(C)NC(=O)NCCOC[C@H]2CCCO2)sc1C 0 198 | O=C(O)c1ccc2c(c1)=N[C@@H](c1ccccc1Cl)N=2 0 199 | CN(CC(=O)N(C)c1cccc2ncccc12)C(=O)OC(C)(C)C 0 200 | C[C@@H]1COCC[N@H+]1Cc1ccc(-c2c(F)cccc2F)o1 0 201 | COc1ccc(NC(=O)c2nc(-c3ccco3)n(-c3cccc(C)c3)n2)cc1 0 202 | COc1ccc(N2CCN(Cc3cc(OC)c(O)c([N+](=O)[O-])c3)CC2)cc1 0 203 | COCCNC(=O)[C@@H]1CCCN([C@H]2CC[N@H+](Cc3cc(C)cc(C)c3)CC2)C1 0 204 | N#C[C@@H]1C(=O)Nc2nc(NCc3ccc4c(c3)OCCO4)[nH]c(=O)c2[C@@H]1/C=C/c1ccccc1 0 205 | Cc1ccc(C(C)C)cc1OCC(=O)Nc1ccc2c(c1)OCC(=O)N2 0 206 | CCCCOc1c(Cl)cc(C(=O)Nc2ccc(S(=O)(=O)N3C[C@H](C)O[C@@H](C)C3)cc2)cc1Cl 0 207 | Cc1nccc(CN2CCC[C@H](c3cc(=O)[nH]c(-c4ccncc4)n3)C2)n1 0 208 | COCc1ccc(CN[C@H]2CCCN(c3ccccc3F)C2=O)cc1 0 209 | Cc1ccc([C@H](NC(=O)CS(=O)(=O)Cc2cccc(Br)c2)C2CC2)cc1 0 210 | COc1cccc([C@@H]2[C@@H]3CCCC=C3[C@H](C#N)C(=N)C2(C#N)C#N)c1 0 211 | Cc1n[nH]c(C)c1[C@H]1CCCN1C(=O)COc1ccccc1 0 212 | O=C(NCc1cccs1)c1cccc(NC(=O)c2cc3sccn3c2)c1 0 213 | CC(C)[C@@H]1CCc2ccccc2N1C(=O)c1ccc2c(c1)C(=O)N(C)C2=O 0 214 | Cc1nc([N+](=O)[O-])cn1CC(=O)NCc1cccnc1 0 215 | CCN(CC(=O)Nc1nc(-c2ccc(OC)cc2)cn1-c1ccc(C(C)C)cc1)C(=O)c1ccc(C)cc1 0 216 | COc1ccc([C@@H]2c3c(n[nH]c3-c3ccc(Cl)cc3)C(=O)N2c2ccc(C)cc2)c(OC)c1 0 217 | O=C(Cn1nc2nc(Nc3ccccc3)ccn2c1=O)NCc1ccc(F)cc1F 0 218 | Cc1cccc(OCCCCNc2ccc(C)cc2C(=O)O)c1 0 219 | CC(C)n1cc(S(=O)(=O)N2CCC(n3cccc3)CC2)cn1 0 220 | O=C1CCc2ccc(C(=O)NCc3ccc(CC(=O)NC4CCCCC4)cc3)nc2N1 0 221 | CCOc1cccc(C(=O)NCC23CC4(CNC(=O)c5cccc(OCC)c5)C[C@H](C2)C[C@H](C3)C4)c1 0 222 | C[C@H](Oc1cccc(C=O)c1)C(=O)Nc1ccc(N2CCCCC2)cc1 0 223 | Cc1ccc(-c2[nH]ncc2C[N@H+](C)Cc2cc(C)on2)cc1 0 224 | Cc1cccc(Cl)c1NC(=O)[C@@H](C)c1ccc(Cl)s1 0 225 | Cc1nnc(NC(=O)CSc2nnc(C)c(O)n2)s1 0 226 | O=C(OCCN1CCCC1=O)[C@H]1C[C@H]1c1ccc(OC(F)(F)F)cc1 0 227 | Cc1ccccc1NC(=O)Cc1cc(-c2ccccc2)on1 0 228 | Cc1cccc(NCC(=O)N/N=C/c2ccc3c(c2)OCO3)c1 0 229 | Cc1ccc(C)n1N1C(=O)/C(=C/c2c(F)cccc2Cl)SC1=S 0 230 | COc1ccc(/N=C2\NC(=O)/C(=C\C=C\c3ccccc3)S2)cc1 0 231 | COc1cccc(NC2=NN=C(c3cc(C)n(-c4ncn[nH]4)c3C)CS2)c1 0 232 | Cc1nn(-c2ccccc2)c(C)c1CN(C)C(=O)Cc1ccc2c(c1)CCCC2 0 233 | Cc1cc(C)c([N+](=O)[O-])cc1NC(=O)N1CCC(c2ccn[nH]2)CC1 0 234 | COC(=O)c1cc(C2OCCO2)ccn1 0 235 | Cc1cccc(OCCCCn2c([C@@H](C)NC(=O)c3cccc(Br)c3)nc3ccccc32)c1 0 236 | CCc1cccc(CC)c1NC(=O)CN(c1ccc(F)cc1)S(C)(=O)=O 0 237 | 
O=C(NCc1ccc2c(c1)OCO2)c1ccc2c(=O)n(C[C@H]3CCCO3)c(=S)[nH]c2c1 0 238 | O=C(O)CNC(=O)CCCCCN1C(=O)/C(=C/C=C/c2ccccc2)SC1=S 0 239 | COc1cc(/C=C(\C#N)C(=O)Nc2cc(Cl)ccc2Cl)cc2c1OCCO2 0 240 | NC(=O)c1ccc(C[N@H+](CC(=O)N2CCC(C(=O)Nc3ccccc3)CC2)C2CC2)cc1 0 241 | COCCOc1ccc(C#N)cc1NC(=O)c1ccsc1 0 242 | Cn1c(=O)c2c(nc(CN3CCOCC3)n2Cc2ccccc2F)n(C)c1=O 0 243 | CC(=O)[C@H]1C(=O)C(=O)N(CCC(=O)O)[C@H]1c1ccc(C(C)C)cc1 0 244 | CCN1CCN(c2ccc(NC(=O)[C@@H](NC(=O)c3ccc(Cl)cc3)C(C)C)cc2C)CC1 0 245 | C#CCNC(=O)N1CCN(S(=O)(=O)c2ccc3c(c2)OCCO3)CC1 0 246 | O=C(c1ccc(Cl)cc1Cl)N1CCCN(c2ccc(C(F)(F)F)cn2)CC1 0 247 | CC(C)[C@H](CNC(=O)CCS(C)(=O)=O)N1CCc2ccccc2C1 0 248 | O=C(NCCNC(=O)c1cccnc1OCC(F)F)c1cccnc1 0 249 | CCc1cc(C(F)(F)F)n2nc([C@H]3CCCN3C(=O)[C@H]3CCS(=O)(=O)C3)cc2n1 0 250 | CCCN(CCC)CCCNC(=O)c1oc(=O)c2ccccc2c1-c1ccccc1 0 251 | CN(C(=O)c1cccc(-c2ccoc2)c1)c1ccc(F)c(F)c1 0 252 | Cc1cc(C)cc(NC(=S)N(CCCn2ccnc2)C[C@H]2CC=CCC2)c1 0 253 | CCc1ccc([N+](=O)[O-])cc1S(=O)(=O)NC(C)(C)CC 0 254 | Nc1cc(N2CCC[C@@H](c3[nH+]ccn3Cc3cscn3)C2)ncn1 0 255 | O=c1[nH]cc(-c2cc(Cl)cc(Cl)c2)c(=O)[nH]1 0 256 | O=C(O)c1nc(O)n(-c2cccc(Cl)c2)n1 0 257 | CC/N=C(/NCc1cc(C)on1)N(C)Cc1ccc(OC)cc1 0 258 | Fc1ccc(NCl)c(S)c1 0 259 | CN(CCCO)C(=O)OC(C)(C)C 0 260 | O=C1C[C@H](C(=O)Nc2ccccc2C(F)(F)F)c2c(nc(Nc3cc(Cl)cc(Cl)c3)[nH]c2=O)N1 0 -------------------------------------------------------------------------------- /Discussion/GNN.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | import numpy as np 4 | import math 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | import pickle 10 | from sklearn.metrics import roc_auc_score,roc_curve 11 | from sklearn.metrics import confusion_matrix 12 | import preprocess as pp 13 | import pandas as pd 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | class MolecularGraphNeuralNetwork(nn.Module): 18 | def __init__(self, N_fingerprints, dim, layer_hidden, layer_output): 19 | super(MolecularGraphNeuralNetwork, self).__init__() 20 | self.embed_fingerprint = nn.Embedding(N_fingerprints, dim) 21 | self.W_fingerprint = nn.ModuleList([nn.Linear(dim, dim) 22 | for _ in range(layer_hidden)]) 23 | 24 | self.W_output = nn.ModuleList([nn.Linear(dim, dim) 25 | for _ in range(layer_output)]) 26 | self.W_property = nn.Linear(dim, 2) 27 | 28 | 29 | def pad(self, matrices, pad_value): 30 | """Pad the list of matrices 31 | with a pad_value (e.g., 0) for batch processing. 32 | For example, given a list of matrices [A, B, C], 33 | we obtain a new matrix [A00, 0B0, 00C], 34 | where 0 is the zero (i.e., pad value) matrix. 
35 | """ 36 | shapes = [m.shape for m in matrices] 37 | M, N = sum([s[0] for s in shapes]), sum([s[1] for s in shapes]) 38 | zeros = torch.FloatTensor(np.zeros((M, N))).to(device) 39 | pad_matrices = pad_value + zeros 40 | i, j = 0, 0 41 | for k, matrix in enumerate(matrices): 42 | m, n = shapes[k] 43 | pad_matrices[i:i+m, j:j+n] = matrix 44 | i += m 45 | j += n 46 | return pad_matrices 47 | 48 | def update(self, matrix, vectors, layer): 49 | hidden_vectors = torch.relu(self.W_fingerprint[layer](vectors)) 50 | 51 | return hidden_vectors + torch.matmul(matrix, hidden_vectors) 52 | 53 | def sum(self, vectors, axis): 54 | sum_vectors = [torch.sum(v, 0) for v in torch.split(vectors, axis)] 55 | return torch.stack(sum_vectors) 56 | def gnn(self, inputs): 57 | 58 | """Cat or pad each input data for batch processing.""" 59 | Smiles,fingerprints, adjacencies, molecular_sizes = inputs 60 | fingerprints = torch.cat(fingerprints) 61 | adjacencies = self.pad(adjacencies, 0) 62 | 63 | """GNN layer (update the fingerprint vectors).""" 64 | fingerprint_vectors = self.embed_fingerprint(fingerprints) 65 | for l in range(layer_hidden): 66 | hs = self.update(adjacencies, fingerprint_vectors, l) 67 | fingerprint_vectors = F.normalize(hs, 2, 1) # normalize. 68 | 69 | """Molecular vector by sum or mean of the fingerprint vectors.""" 70 | molecular_vectors = self.sum(fingerprint_vectors, molecular_sizes) 71 | 72 | return Smiles,molecular_vectors 73 | 74 | def mlp(self, vectors): 75 | """Classifier based on multilayer perceptron给予多层感知器的分类器.""" 76 | for l in range(layer_output): 77 | vectors = torch.relu(self.W_output[l](vectors)) 78 | outputs = torch.sigmoid(self.W_property(vectors)) 79 | return outputs 80 | 81 | 82 | def forward_classifier(self, data_batch, train): 83 | 84 | inputs = data_batch[:-1] 85 | correct_labels = torch.cat(data_batch[-1]) 86 | 87 | if train: 88 | Smiles,molecular_vectors = self.gnn(inputs) 89 | 90 | predicted_scores = self.mlp(molecular_vectors) 91 | 92 | loss = F.cross_entropy(predicted_scores, correct_labels.long()) 93 | predicted_scores = predicted_scores.to('cpu').data.numpy() 94 | predicted_scores = [s[1] for s in predicted_scores] 95 | 96 | 97 | correct_labels = correct_labels.to('cpu').data.numpy() 98 | return loss,predicted_scores, correct_labels 99 | else: 100 | with torch.no_grad(): 101 | Smiles,molecular_vectors = self.gnn(inputs) 102 | predicted_scores = self.mlp(molecular_vectors) 103 | loss = F.cross_entropy(predicted_scores, correct_labels.long()) 104 | predicted_scores = predicted_scores.to('cpu').data.numpy() 105 | predicted_scores = [s[1] for s in predicted_scores] 106 | correct_labels = correct_labels.to('cpu').data.numpy() 107 | 108 | return Smiles,loss,predicted_scores, correct_labels 109 | 110 | class Trainer(object): 111 | def __init__(self, model): 112 | self.model = model 113 | self.optimizer = optim.Adam(self.model.parameters(), lr=lr) 114 | 115 | def train(self, dataset): 116 | np.random.shuffle(dataset) 117 | N = len(dataset) 118 | loss_total = 0 119 | P, C = [], [] 120 | for i in range(0, N, batch_train): 121 | data_batch = list(zip(*dataset[i:i+batch_train])) 122 | loss,predicted_scores, correct_labels= self.model.forward_classifier(data_batch, train=True) 123 | 124 | P.append(predicted_scores) 125 | C.append(correct_labels) 126 | self.optimizer.zero_grad() 127 | loss.backward() 128 | self.optimizer.step() 129 | loss_total += loss.item() 130 | tru=np.concatenate(C) 131 | pre=np.concatenate(P) 132 | AUC = roc_auc_score(tru, pre) 133 | pred = [1 if i >0.4 else 
0 for i in pre] 134 | predictions =np.stack((tru,pred,pre)) 135 | return AUC, loss_total,predictions 136 | 137 | 138 | class Tester(object): 139 | def __init__(self, model): 140 | self.model = model 141 | 142 | def test_classifier(self, dataset): 143 | N = len(dataset) 144 | loss_total = 0 145 | SMILES,P, C ='', [], [] 146 | for i in range(0, N, batch_test): 147 | data_batch = list(zip(*dataset[i:i+batch_test])) 148 | (Smiles,loss,predicted_scores,correct_labels) = self.model.forward_classifier( 149 | data_batch, train=False) 150 | 151 | SMILES += ' '.join(Smiles) + ' ' 152 | 153 | loss_total += loss.item() 154 | P.append(predicted_scores) 155 | C.append(correct_labels) 156 | SMILES = SMILES.strip().split() 157 | tru=np.concatenate(C) 158 | 159 | pre=np.concatenate(P) 160 | AUC = roc_auc_score(tru, pre) 161 | pred = [1 if i >0.4 else 0 for i in pre] 162 | # Tru=map(str,np.concatenate(C)) 163 | # Pre=map(str,np.concatenate(P)) 164 | # predictions = '\n'.join(['\t'.join(x) for x in zip(SMILES, Tru, Pre)]) 165 | predictions =np.stack((tru,pred,pre)) 166 | return AUC, loss_total,predictions 167 | def save_result(self, result, filename): 168 | with open(filename, 'a') as f: 169 | f.write(result + '\n') 170 | def save_predictions(self, predictions, filename): 171 | with open(filename, 'w') as f: 172 | f.write('Smiles\tCorrect\tPredict\n') 173 | f.write(predictions + '\n') 174 | def save_model(self, model, filename): 175 | torch.save(model.state_dict(), filename) 176 | def split_dataset(dataset, ratio): 177 | """Shuffle and split a dataset.""" 178 | np.random.seed(111) # fix the seed for shuffle. 179 | np.random.shuffle(dataset) 180 | n = int(ratio * len(dataset)) 181 | return dataset[:n], dataset[n:] 182 | def edit_dataset(drug,non_drug,task): 183 | np.random.seed(111) # fix the seed for shuffle. 
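    # Note: with task == 'balance', the branch below truncates the non-drug list to
    # len(drug) so both classes are equally represented; any other task value keeps the
    # full, shuffled (imbalanced) non-drug set. Hypothetical usage, assuming drug and
    # non_drug are lists produced by preprocess.create_dataset:
    #     dataset_train, dataset_test = edit_dataset(drug, non_drug, 'balance')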
184 | 185 | if task =='balance': 186 | #np.random.shuffle(non_drug) 187 | non_drug=non_drug[0:len(drug)] 188 | 189 | else: 190 | np.random.shuffle(non_drug) 191 | np.random.shuffle(drug) 192 | dataset_train_drug, dataset_test_drug = split_dataset(drug, 0.9) 193 | # dataset_train_drug,dataset_dev_drug = split_dataset(dataset_train_drug, 0.9) 194 | dataset_train_no, dataset_test_no = split_dataset(non_drug, 0.9) 195 | # dataset_train_no,dataset_dev_no = split_dataset(dataset_train_no, 0.9) 196 | dataset_train = dataset_train_drug+dataset_train_no 197 | dataset_test= dataset_test_drug+dataset_test_no 198 | # dataset_dev = dataset_dev_drug+dataset_dev_no 199 | return dataset_train, dataset_test 200 | 201 | def dump_dictionary(dictionary, filename): 202 | with open(filename, 'wb') as f: 203 | pickle.dump(dict(dictionary), f) 204 | if __name__ == "__main__": 205 | 206 | radius=1 207 | dim=65 208 | layer_hidden=0 209 | layer_output=5 210 | 211 | batch_train=48 212 | batch_test=48 213 | lr=3e-4 214 | lr_decay=0.85 215 | decay_interval=10 # interval (in epochs) between learning-rate decays 216 | iteration=140 217 | N=5000 218 | (radius, dim, layer_hidden, layer_output, 219 | batch_train, batch_test, decay_interval, 220 | iteration) = map(int, [radius, dim, layer_hidden, layer_output, 221 | batch_train, batch_test, 222 | decay_interval, iteration]) 223 | lr, lr_decay = map(float, [lr, lr_decay]) 224 | if torch.cuda.is_available(): 225 | device = torch.device('cuda') 226 | print('The code uses a GPU!') 227 | else: 228 | device = torch.device('cpu') 229 | print('The code uses a CPU...') 230 | print('-'*100) 231 | 232 | # print('Preprocessing the', dataset, 'dataset.') 233 | print('Just a moment......') 234 | print('-'*100) 235 | path='E:/code/drug/drugnn/' 236 | dataname='' 237 | 238 | dataset_train = pp.create_dataset('data_train.txt',path,dataname) 239 | dataset_test = pp.create_dataset('data_test.txt',path,dataname) 240 | 241 | #dataset_train, dataset_test = edit_dataset(dataset_drug, dataset_nondrug,'balance') 242 | #dataset_train, dataset_dev = split_dataset(dataset_train, 0.9) 243 | print('The preprocessing has finished!') 244 | print('# of training data samples:', len(dataset_train)) 245 | #print('# of development data samples:', len(dataset_dev)) 246 | print('# of test data samples:', len(dataset_test)) 247 | print('-'*100) 248 | 249 | print('Creating a model.') 250 | torch.manual_seed(111) 251 | model = MolecularGraphNeuralNetwork( 252 | N, dim, layer_hidden, layer_output).to(device) 253 | trainer = Trainer(model) 254 | tester = Tester(model) 255 | print('# of model parameters:', 256 | sum([np.prod(p.size()) for p in model.parameters()])) 257 | print('-'*100) 258 | file_result = path+'AUC'+'.txt' 259 | # file_result = '../output/result--' + setting + '.txt' 260 | result = 'Epoch\tTime(sec)\tLoss_train\tLoss_test\tAUC_train\tAUC_test' 261 | file_test_result = path+ 'test_prediction'+ '.txt' 262 | file_predictions = path+'train_prediction' +'.txt' 263 | file_model = path+'model'+'.h5' 264 | with open(file_result, 'w') as f: 265 | f.write(result + '\n') 266 | 267 | print('Start training.') 268 | print('The result is saved in the output directory every epoch!') 269 | 270 | np.random.seed(111) 271 | 272 | start = timeit.default_timer() 273 | 274 | for epoch in range(iteration): 275 | 276 | epoch += 1 277 | if epoch % decay_interval == 0: 278 | trainer.optimizer.param_groups[0]['lr'] *= lr_decay 279 | # optimizer.param_groups keys: ['amsgrad', 'params', 'lr', 'betas', 'weight_decay', 'eps'] 280 | prediction_train,loss_train,train_res= trainer.train(dataset_train) 281 | 282 | 
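        # The decayed learning rate applied above can be inspected through the standard
        # PyTorch optimizer API, e.g. (illustrative only; current_lr is not used elsewhere):
        #     current_lr = trainer.optimizer.param_groups[0]['lr']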
283 | #prediction_dev,dev_res = tester.test_classifier(dataset_dev) 284 | prediction_test,loss_test,test_res = tester.test_classifier(dataset_test) 285 | 286 | 287 | time = timeit.default_timer() - start 288 | 289 | if epoch == 1: 290 | minutes = time * iteration / 60 291 | hours = int(minutes / 60) 292 | minutes = int(minutes - 60 * hours) 293 | print('The training will finish in about', 294 | hours, 'hours', minutes, 'minutes.') 295 | print('-'*100) 296 | print(result) 297 | 298 | result = '\t'.join(map(str, [epoch, time, loss_train, loss_test,prediction_train,prediction_test])) 299 | tester.save_result(result, file_result) 300 | 301 | print(result) 302 | 303 | 304 | 305 | loss = pd.read_table(file_result) 306 | plt.plot(loss['Loss_train'], color='r',label='Loss of train set') 307 | plt.plot(loss['Loss_test'], color='y',label='Loss of test set') 308 | plt.plot(loss['AUC_train'], color='g',label='AUC of train set') 309 | plt.plot(loss['AUC_test'], color='b',label='AUC of test set') 310 | # plt.plot(loss['AUC_test'], color='y',label='AUC of test set') 311 | plt.ylabel('Loss / AUC') 312 | plt.xlabel('Epoch') 313 | plt.legend() 314 | plt.savefig(path+'loss.tif',dpi=300) 315 | plt.show() 316 | colors = ['#00CED1','#DC143C' ] 317 | 318 | target_names=np.array(['drug-like','non-drug']) 319 | lw=2 320 | res_test = test_res.T 321 | 322 | for color,i,target_name in zip(colors,[1,0],target_names): 323 | 324 | plt.scatter((res_test[res_test[:,0]==i,0]),(res_test[res_test[:,0]==i,2]),color = color,alpha=.8,lw=lw,label=target_name) 325 | plt.legend(loc='best',shadow=False,scatterpoints=1) 326 | plt.title('The results of GNN classification') 327 | res_train = train_res.T 328 | cn_matrix=confusion_matrix(res_train[:,0], res_train[:,1]) 329 | cn_matrix 330 | 331 | tn1 = cn_matrix[0,0] 332 | tp1 = cn_matrix[1,1] 333 | fn1 = cn_matrix[1,0] 334 | fp1 = cn_matrix[0,1] 335 | 336 | 337 | bacc_train = ((tp1/(tp1+fn1))+(tn1/(tn1+fp1)))/2 # balanced accuracy 338 | pre_train = tp1/(tp1+fp1) # precision/q+ 339 | rec_train = tp1/(tp1+fn1) # recall/sensitivity 340 | sp_train=tn1/(tn1+fp1) 341 | q__train=tn1/(tn1+fn1) 342 | f1_train = 2*pre_train*rec_train/(pre_train+rec_train) # F1 score 343 | mcc_train = ((tp1*tn1) - (fp1*fn1))/math.sqrt((tp1+fp1)*(tp1+fn1)*(tn1+fp1)*(tn1+fn1)) # Matthews correlation coefficient 344 | acc_train=(tp1+tn1)/(tp1+fp1+fn1+tn1) # accuracy 345 | fpr_train, tpr_train, thresholds_train =roc_curve(res_train[:,0],res_train[:,1]) 346 | print('bacc_train:',bacc_train) 347 | print('pre_train:',pre_train) 348 | print('rec_train:',rec_train) 349 | print('f1_train:',f1_train) 350 | print('mcc_train:',mcc_train) 351 | print('sp_train:',sp_train) 352 | print('q__train:',q__train) 353 | print('acc_train:',acc_train) 354 | 355 | 356 | ''' 357 | res_dev = dev_res.T 358 | cn_matrix=confusion_matrix(res_dev[:,0], res_dev[:,1]) 359 | cn_matrix 360 | 361 | tn2 = cn_matrix[0,0] 362 | tp2 = cn_matrix[1,1] 363 | fn2 = cn_matrix[1,0] 364 | fp2 = cn_matrix[0,1] 365 | 366 | 367 | bacc_dev = ((tp2/(tp2+fn2))+(tn2/(tn2+fp2)))/2 # balanced accuracy 368 | pre_dev= tp2/(tp2+fp2) # precision/q+ 369 | rec_dev = tp2/(tp2+fn2) # recall/sensitivity 370 | sp_dev=tn2/(tn2+fp2) 371 | q__dev=tn2/(tn2+fn2) 372 | f1_dev = 2*pre_dev*rec_dev/(pre_dev+rec_dev) # F1 score 373 | mcc_dev = ((tp2*tn2) - (fp2*fn2))/math.sqrt((tp2+fp2)*(tp2+fn2)*(tn2+fp2)*(tn2+fn2)) # Matthews correlation coefficient 374 | acc_dev=(tp2+tn2)/(tp2+fp2+fn2+tn2) # accuracy 375 | fpr_dev, tpr_dev, thresholds_dev =roc_curve(res_dev[:,0],res_dev[:,1]) 376 | print('bacc_dev:',bacc_dev) 377 | 
print('pre_dev:',pre_dev) 378 | print('rec_dev:',rec_dev) 379 | print('f1_dev:',f1_dev) 380 | print('mcc_dev:',mcc_dev) 381 | print('sp_dev:',sp_dev) 382 | print('q__dev:',q__dev) 383 | print('acc_dev:',acc_dev) 384 | 385 | ''' 386 | 387 | cnf_matrix=confusion_matrix(res_test[:,0], res_test[:,1]) 388 | cnf_matrix 389 | 390 | tn = cnf_matrix[0,0] 391 | tp = cnf_matrix[1,1] 392 | fn = cnf_matrix[1,0] 393 | fp = cnf_matrix[0,1] 394 | 395 | bacc = ((tp/(tp+fn))+(tn/(tn+fp)))/2 # balanced accuracy 396 | pre = tp/(tp+fp) # precision/q+ 397 | rec = tp/(tp+fn) # recall/sensitivity 398 | sp=tn/(tn+fp) 399 | q_=tn/(tn+fn) 400 | f1 = 2*pre*rec/(pre+rec) # F1 score 401 | mcc = ((tp*tn) - (fp*fn))/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)) # Matthews correlation coefficient 402 | acc=(tp+tn)/(tp+fp+fn+tn) # accuracy 403 | fpr, tpr, thresholds =roc_curve(res_test[:,0], res_test[:,1]) 404 | print('bacc:',bacc) 405 | print('pre:',pre) 406 | print('rec:',rec) 407 | print('f1:',f1) 408 | print('mcc:',mcc) 409 | print('sp:',sp) 410 | print('q_:',q_) 411 | print('acc:',acc) 412 | print('auc:',prediction_test) 413 | 414 | -------------------------------------------------------------------------------- /dataset/withdrawn.txt: -------------------------------------------------------------------------------- 1 | O=C(Nc1ccc(Cl)c(Cl)c1)c1cc(Cl)cc(Cl)c1O 0 2 | CC(=O)NC1=CC=CC=C1 0 3 | CC(=O)NC1=C(C=CC(=C1)[As](=O)(O)O)O 0 4 | CC(=O)NC1=NC=C(N=N1)C=CC2=CC=C(O2)[N+](=O)[O-] 0 5 | CC(=O)OC1=CC=CC=C1C(=O)O 0 6 | CC1=CC(=C(C(=C1C=CC(=CC=CC(=CC(=O)O)C)C)C)C)OC 0 7 | C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O)O)O)N 0 8 | CC(C(=O)NC(C)C(=O)NC1C2C1CN(C2)C3=C(C=C4C(=O)C(=CN(C4=N3)C5=C(C=C(C=C5)F)F)C(=O)O)F)N 0 9 | C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl 0 10 | CC(C)C(CC1=CC(=C(C=C1)OC)OCCCOC)CC(C(CC(C(C)C)C(=O)NCC(C)(C)C(=O)N)O)N 0 11 | CC1=C(N=CN1)CN2CCC3=C(C2=O)C4=CC=CC=C4N3C 0 12 | CCC(C(CC(C)N(C)C)(C1=CC=CC=C1)C2=CC=CC=C2)OC(=O)C 0 13 | CCCN(CCC)C(=O)CC1=C(N=C2N1C=C(C=C2)Cl)C3=CC=C(C=C3)Cl 0 14 | CCN(CC)C(C)C(=O)C1=CC=CC=C1 0 15 | C1CC2=CC=CC=C2C(C3=CC=CC=C31)NCCCCCCC(=O)O 0 16 | CCC1(CCC(=O)NC1=O)C2=CC=C(C=C2)N 0 17 | Cc1c(N(C)C)c(=O)n(-c2ccccc2)n1C 0 18 | CCC1(C(=O)NC(=O)NC1=O)CCC(C)C 0 19 | CC(C)CCOCC(CN1CCOCC1)OC(=O)C2=CC(=C(C(=C2)OC)OC)OC 0 20 | CC(CC1=CC=CC=C1)N 0 21 | C1=CC=C(C=C1)C(C#N)OC2C(C(C(C(O2)COC3C(C(C(C(O3)CO)O)O)O)O)O)O 0 22 | CC1CC2C(CCC3(C2CCC3(C(=O)C)OC(=O)C)C)C4(C1=CCCC4)C 0 23 | C1CN(CCN1CCOC(=O)C2=CC=CC=C2NC3=C4C=CC(=CC4=NC=C3)C(F)(F)F)C5=CC=CC(=C5)C(F)(F)F 0 24 | CC(C)C1(C(=O)NC(=O)NC1=O)CC=C 0 25 | COC1=CC=CC2=C3C(=C(C=C21)[N+](=O)[O-])C(=CC4=C3OCO4)C(=O)O 0 26 | COC1=CC=C(C=C1)CCN2CCC(CC2)NC3=NC4=CC=CC=C4N3CC5=CC=C(C=C5)F 0 27 | CC(=O)OCC1C(C(C(O1)N2C(=O)NC(=O)C=N2)OC(=O)C)OC(=O)C 0 28 | CCC1(C(=O)NC(=O)NC1=O)CC 0 29 | CCC(C)(C(=O)OCC)OC1=CC=C(C=C1)CC2=CC=C(C=C2)Cl 0 30 | C1=CC=C(C=C1)CN2C3=CC=CC=C3C(=N2)OCC(=O)O 0 31 | CC(CC1=CC(=CC=C1)C(F)(F)F)NCCOC(=O)C2=CC=CC=C2 0 32 | CC(C1=CC2=C(C=C1)OC(=N2)C3=CC=C(C=C3)Cl)C(=O)O 0 33 | CCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC=C(C=C3)O 0 34 | CCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC(=C(C(=C3)Br)O)Br 0 35 | CCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC(=C(C(=C3)I)O)I 0 36 | CN(C)CCCOC1=NN(C2=CC=CC=C21)CC3=CC=CC=C3 0 37 | C1=CC=C(C=C1)CO 0 38 | CC(C)COCC(CN(CC1=CC=CC=C1)C2=CC=CC=C2)N3CCCC3 0 39 | CCC(=O)N1C2=CC=CC=C2N(C1=O)C3CCN(CC3)CCC(C#N)(C4=CC=CC=C4)C5=CC=CC=C5 0 40 | CC(CS(=O)(=O)C1=CC=C(C=C1)F)(C(=O)NC2=CC(=C(C=C2)C#N)C(F)(F)F)O 0 41 | C1=C(C=C(C(=C1SC2=C(C(=CC(=C2)Cl)Cl)O)O)Cl)Cl 0 42 | B(O)(O)O 0 43 | C1=CC(=C(C(=C1)C(=O)C2=CC=C(C=C2)Br)N)CC(=O)O 0 44 | CC(C)C(C(=O)NC(=O)N)Br 0 45 | 
CC1=NN=C2N1C3=C(C=C(S3)Br)C(=NC2)C4=CC=CC=C4Cl 0 46 | C1=CC2=C(C(=C(C=C2Br)Br)O)N=C1 0 47 | CCOC1=CC=C(C=C1)NC(=O)CC(C)O 0 48 | CC(C)(C)N1CCC(CC1)(C2=CC=CC=C2)C3=CC=CC=C3 0 49 | CCCCOC1=CC=C(C=C1)CC(=O)NO 0 50 | COC1=CC(=C(C(=C1)OC)C(=O)CCCN2CCCC2)OC 0 51 | CCCCN=C(N)N=C(N)N 0 52 | CCCCC(C(=O)N(C1=CC=CC=C1)NC2=CC=CC=C2)C(=O)O 0 53 | CCCC(=O)NC1=C(C=C(C(=C1I)C=C(CC)C(=O)O)I)I 0 54 | CC(C)(C)C(C)(C1CC23CCC1(C4C25CCN(C3CC6=C5C(=C(C=C6)O)O4)CC7CC7)OC)O 0 55 | CC(C(=O)C1=CC(=CC=C1)Cl)NC(C)(C)C 0 56 | CCCCOC(=O)C1=CC=C(C=C1)N 0 57 | CCN(CC(C)O)C1=NN=C(C=C1)NNC(=O)OCC 0 58 | CN1C2=C(C=C(C=C2)Cl)C(=NC(C1=O)OC(=O)N(C)C)C3=CC=CC=C3 0 59 | CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn[nH]2)cc1 0 60 | CC12CCC(=O)C=C1C=CC3C2CCC4(C3CCC45CCC(=O)O5)C 0 61 | CC1=C(C(CCC1=O)(C)C)C=CC(=CC=CC(=CC=CC=C(C)C=CC=C(C)C=CC2=C(C(=O)CCC2(C)C)C)C)C 0 62 | CCN(CC)CCOCCOC(=O)C1(c2ccccc2)CCCC1 0 63 | CN(C)CCOC(C1=CC=C(C=C1)Cl)C2=CC=CC=N2 0 64 | CCCC(C)(COC(=O)N)COC(=O)NC(C)C 0 65 | CCCCCCNC(=O)N1C=C(C(=O)NC1=O)F 0 66 | CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F 0 67 | C1C(=C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CS3)C(=O)[O-])C[N+]4=CC=CC=C4 0 68 | CC(C)C1=C(C(=C(C(=N1)C(C)C)COC)C2=CC=C(C=C2)F)C=CC(CC(CC(=O)O)O)O 0 69 | CC(CCC(=O)O)C1CCC2C1(CCC3C2C(CC4C3(CCC(C4)O)C)O)C 0 70 | C(C(Cl)(Cl)Cl)(O)O 0 71 | C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)[N+](=O)[O-] 0 72 | N=C(NCCCCCCNC(=N)NC(=N)Nc1ccc(Cl)cc1)NC(=N)Nc1ccc(Cl)cc1 0 73 | CC(=O)C1(CCC2C1(CCC3C2C=C(C4=CC(=O)CCC34C)Cl)C)O 0 74 | CC(=O)C1(CCC2C1(CCC3C2C=C(C4=CC(=O)CCC34C)Cl)C)OC(=O)C 0 75 | CN1C(S(=O)(=O)CCC1=O)C2=CC=C(C=C2)Cl 0 76 | C1=CC=C2C=C(C=CC2=C1)N(CCCl)CCCl 0 77 | C(Cl)(Cl)Cl 0 78 | CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl 0 79 | CC(C)(CC1=CC=C(C=C1)Cl)N 0 80 | C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)O 0 81 | C1=CC=C(C=C1)C2=NC3=CC=CC=C3C(=C2)C(=O)O 0 82 | COC1=CC(=CC(=C1OC)OC)C=CC(=O)N2CCN(CC2)CC(=O)N3CCCC3 0 83 | CC(C)(C(=O)O)OC1=CC=C(C=C1)C2CC2(Cl)Cl 0 84 | COC1CN(CCC1NC(=O)C2=CC(=C(C=C2OC)N)Cl)CCCOC3=CC=C(C=C3)F 0 85 | C1=CC2=C(C(=C(C=C2Cl)I)O)N=C1 0 86 | CC(CC1=CC=CC=C1)NCC2=CC=CC=C2Cl 0 87 | CC(CN(C)C)C(C)(CC1=CC=C(C=C1)Cl)O 0 88 | C1=CC(=CC=C1C(C2=CC=C(C=C2)Cl)C(Cl)(Cl)Cl)Cl 0 89 | CCOC(=O)C(C)(C)OC1=CC=C(C=C1)Cl 0 90 | CCOC(=O)NC(C)(C)CC1=CC=C(C=C1)Cl 0 91 | CN(C)CCCC1C2=CC=CC=C2NC3=C1C=C(C=C3)Cl 0 92 | CC1=C(C2=C(N1CC(=O)O)C=C(C=C2)OC)C(=O)C3=CC=C(C=C3)Cl 0 93 | C1=CC=C(C=C1)C2=NC(C(=O)NC3=C2C=C(C=C3)Cl)C(=O)O 0 94 | CN1CCN(CC1)C2=NC3=C(C=CC(=C3)Cl)NC4=CC=CC=C42 0 95 | [Co] 0 96 | CN1CCC23C4C1CC5=C2C(=C(C=C5)OC)OC3C(C=C4)O 0 97 | CCC(C1=CC=C(C=C1)OCCN(CC)CC)C(CC)C2=CC=C(C=C2)OCCN(CC)CC 0 98 | C1=CC=C2C(=C1)C=CC(=O)O2 0 99 | CC1CC(CC(C1)(C)C)OC(=O)C(C2=CC=CC=C2)O 0 100 | CCC1(C(=O)NC(=O)NC1=O)C2=CCCCC2 0 101 | CC(=O)OC1=CC=C(C=C1)C(=C2CCCCC2)C3=CC=C(C=C3)OC(=O)C 0 102 | CN1CCC(=C2C3=CC=CC=C3C=CC4=CC=CC=C42)CC1 0 103 | C1=CC2=C(C(=C1)O)C(=O)C3=C(C2=O)C=CC=C3O 0 104 | CC(CC1=CC=CC=C1)N 0 105 | CCNC(C)CC1=CC(=CC=C1)C(F)(F)F 0 106 | CCN(CC)CCOC1=CC2=C(C=C1)N=C(S2)N(C)C 0 107 | C1=CC(=CC=C1NC(=O)C2=C(C=CC(=C2)Br)O)Br 0 108 | CCN(CC)CCOC(=O)C1(CCCCC1)C2CCCCC2 0 109 | CC=C(C1=CC=C(C=C1)O)C(=CC)C2=CC=C(C=C2)O 0 110 | CCC(=C(CC)C1=CC=C(C=C1)O)C2=CC=C(C=C2)O 0 111 | CC(C)(COC(=O)C(C1=CC=CC=C1)(C2=CC=CC=C2)O)N(C)C 0 112 | C1CN(CCC1(C2=CC=CC=C2)C(=O)O)CCC(C#N)(C3=CC=CC=C3)C4=CC=CC=C4 0 113 | CC(C)[C@@]1(NC(=O)[C@@H]2C[C@@H]3c4cccc5[nH]cc(c45)C[C@H]3N(C)C2)O[C@@]2(O)[C@@H]3CCCN3C(=O)[C@H](Cc3ccccc3)N2C1=O 0 114 | CC(C)C[C@H]1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=O)[C@@H]3C[C@@H]4c5cccc6[nH]cc(c56)C[C@H]4N(C)C3)(C(C)C)C(=O)N12 0 115 | 
CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4C(=O)[C@H](Cc4ccccc4)N3C2=O)C[C@@H]2c3cccc4[nH]cc(c34)C[C@H]21 0 116 | CN[C@@H]1[C@H](O[C@H]2[C@H](O[C@@H]3[C@@H](NC(=N)N)[C@H](O)[C@@H](NC(=N)N)[C@H](O)[C@H]3O)O[C@@H](C)[C@]2(O)CO)O[C@@H](CO)[C@H](O)[C@H]1O 0 117 | C1=C(OC(=C1)[N+](=O)[O-])C=CC2=CN=C(N=N2)N(CO)CO 0 118 | I[Sn](CC)(CC)I 0 119 | CC(CCC1=CC=CC=C1)NCC(C2=CC(=C(C=C2)O)C(=O)N)O 0 120 | CCC(C)CC(C)N 0 121 | C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])O 0 122 | CCCCCC(C=CC1C(CC(=O)C1CC=CCCCC(=O)O)O)O 0 123 | CC(CC1=CC=CC=C1)N2CCN(CC2)C(C)CC3=CC=CC=C3 0 124 | CN(C)CCOC(C1=CC=CC=C1)C2=CC=CC=C2 0 125 | CCOC(=O)C1(CCN(CC1)CCC(C#N)(C2=CC=CC=C2)C3=CC=CC=C3)C4=CC=CC=C4 0 126 | Cc1c(N(C)CS(=O)(=O)O)c(=O)n(-c2ccccc2)n1C 0 127 | CCN1C(=CC=CC=Cc2sc3ccccc3[n+]2CC)Sc2ccccc21 0 128 | CN(CCC1=CC=C(C=C1)NS(=O)(=O)C)CCOC2=CC=C(C=C2)NS(=O)(=O)C 0 129 | O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c1c[nH]c2ccccc12 0 130 | C5=C(C(OC3CC1N2C(CC(C1)C(C2)=O)C3)=O)C4=C(C=CC=C4)[NH]5 0 131 | C1CN(CCC1N2C3=C(C=C(C=C3)Cl)NC2=O)CCCN4C5=CC=CC=C5NC4=O 0 132 | CC1C2C(C3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O 0 133 | C2=C(C(C1=NC=CC=C1)(OCCN(C)C)C)C=CC=C2 0 134 | C1CN(CC=C1N2C3=CC=CC=C3NC2=O)CCCC(=O)C4=CC=C(C=C4)F 0 135 | CN1C2=C(C3=CC=CC=C3S1(=O)=O)OC(=O)N(C2=O)C4=CC=CC=N4 0 136 | C1=CC(=CC=C1S(=O)(=O)NC=NCCSCC2=CSC(=N2)N=C(N)N)Br 0 137 | CCC1CN2CCC3=CC(=C(C=C3C2CC1CC4C5=CC(=C(C=C5CCN4)OC)OC)OC)OC 0 138 | CN1CCCCC1CCC2=CC=CC=C2NC(=O)C3=CC=C(C=C3)OC 0 139 | CN[C@@H](C)[C@H](O)c1:c:c:c:c:c:1 0 140 | CNCC(C1=CC(=C(C=C1)O)O)O 0 141 | C=C1CC[C@H](O)C/C1=C/C=C1\CCC[C@@]2(C)[C@H]1CC[C@@H]2[C@H](C)/C=C/[C@H](C)C(C)C 0 142 | C(C(C(CO[N+](=O)[O-])O[N+](=O)[O-])O[N+](=O)[O-])O[N+](=O)[O-] 0 143 | CCC1C(C(C(C(=O)C(CC(C(C(C(C(C(=O)O1)C)OC2CC(C(C(O2)C)O)(C)OC)C)OC3C(C(CC(O3)C)N(C)C)O)(C)O)C)C)O)(C)O 0 144 | COC(=O)CCc1ccc(OCC(O)CNC(C)C)cc1 0 145 | CCO 0 146 | C#CC(O)(/C=C/Cl)CC 0 147 | CC12CCC3C(C1CCC2(C#C)O)CCC4=C3C=CC(=C4)O 0 148 | CCON=O 0 149 | C(CCl)Cl 0 150 | CCC1(CCC2C1(CCC3C2CCC4=CCCCC34)C)O 0 151 | CCNC1=NC2=C(C=C(C=C2)Cl)C(O1)(C)C3=CC=CC=C3 0 152 | CCNCC(C1=CC(=CC=C1)O)O 0 153 | CCOC(=O)C1=CN=CN1C(C)C2=CC=CC=C2 0 154 | CCOC(=O)C=C(C)C=CC=C(C)C=CC1=C(C(=C(C=C1C)OC)C)C 0 155 | C1=CC(=C(C(=C1C(=O)C2=CC(=C(C(=C2)O)O)O)O)O)O 0 156 | CCCCOCC(CN1C(=O)C(C(=O)NC1=O)(CC)C2=CC=CC=C2)OC(=O)N 0 157 | C1=CC=C(C=C1)C(COC(=O)N)COC(=O)N 0 158 | CCC(C1=CC=CC=C1)C(=O)OCCN2CCOC(C2C)C3=CC=CC=C3 0 159 | C1=CC=C(C(=C1)CC(=O)O)OC2=C(C=C(C=C2)Cl)Cl 0 160 | C1=CC(=CC=C1C2=NC(=CS2)CC(=O)O)Cl 0 161 | CC(CC1=CC=CC=C1)NCCN2C=NC3=C2C(=O)N(C(=O)N3C)C 0 162 | CCNC(C)CC1=CC(=CC=C1)C(F)(F)F 0 163 | CC(CC1=CC=C(C=C1)O)NCC(C2=CC(=CC(=C2)O)O)O 0 164 | CC(CC1=CC=CC=C1)NCCC#N 0 165 | O=C1NCC2(CCN(CCc3ccccc3)CC2)O1 0 166 | CC(=CCC1C(=O)N(N(C1=O)C2=CC=CC=C2)C3=CC=CC=C3)C 0 167 | C1CN(CCN1CC2=CC3=C(C=C2)OCO3)C(=O)COC4=CC=C(C=C4)Cl 0 168 | C1=CC=C(C(=C1)C(=O)OCC(CO)O)NC2=C3C=CC=C(C3=NC=C2)C(F)(F)F 0 169 | CN1C=C(C(=O)C2=C1C=C(C=C2)F)S(=O)C 0 170 | CS(=O)(=O)NC1=C(C=C2C(=C1)CCC2=O)OC3=C(C=C(C=C3)F)F 0 171 | COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=C(C=C3)F 0 172 | CC1CCC2=C3N1C=C(C(=O)C3=CC(=C2)F)C(=O)O 0 173 | CN1C(=O)CN=C(C2=C1C=CC(=C2)[N+](=O)[O-])C3=CC=CC=C3F 0 174 | CCOC(=O)Nc1ccc(NCc2ccc(F)cc2)nc1N 0 175 | CC1=CC2=C(C=C1)C(=NC(=O)N2C(C)C)C3=CC=C(C=C3)F 0 176 | C(C(F)(F)F)OCC(F)(F)F 0 177 | COCCCCC(=NOCCN)C1=CC=C(C=C1)C(F)(F)F 0 178 | CN(CC1=C(C=CC=C1Cl)NC(=O)C2=CC=CC=C2)CC(=O)N3CCOCC3 0 179 | C1COC(=O)N1N=CC2=CC=C(O2)[N+](=O)[O-] 0 180 | CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC(=C(C(=C2)OC)OC)OC 0 181 | 
CC1CN(CCN1)C2=C(C=C3C(=C2OC)N(C=C(C3=O)C(=O)O)C4CC4)F 0 182 | CC1=CC(=C(C=C1)C)OCCCC(C)(C)C(=O)O 0 183 | CC(C(CN1C=NC=N1)(C2=C(C=C(C=C2)F)F)O)S(=O)(=O)C 0 184 | CC(C1CCC(C(O1)OC2C(CC(C(C2O)OC3C(C(C(CO3)(C)O)NC)O)N)N)N)NC 0 185 | C1=CC=C(C(=C1)C(=O)OCC(CO)O)NC2=C3C=CC(=CC3=NC=C2)Cl 0 186 | CCC1(CCC(=O)NC1=O)C2=CC=CC=C2 0 187 | CC1CN(CCN1)C2=C(C(=C3C(=C2)N(C=C(C3=O)C(=O)O)C4CC4)C)F 0 188 | C1CCCN(CCC1)CCN=C(N)N 0 189 | CCC1(C(=O)NC(=O)NC1=O)C2=CCCCCC2 0 190 | C1=C(C(=C(C(=C1Cl)Cl)CC2=C(C(=CC(=C2Cl)Cl)Cl)O)O)Cl 0 191 | CCC(C1=CC=C(C=C1)O)C(CC)C2=CC=C(C=C2)O 0 192 | CC1(C(=O)NC(=O)N(C1=O)C)C2=CCCCC2 0 193 | C1NC2=CC(=C(C=C2S(=O)(=O)N1)S(=O)(=O)N)Cl 0 194 | CN1CCC23C4C1CC5=C2C(=C(C=C5)O)OC3C(=O)CC4 0 195 | CC(C)CC1=CC=C(C=C1)CC(=O)O 0 196 | C1CNCCC1CCC2=CNC3=CC=CC=C32 0 197 | CC1=C(C2=C(N1C(=O)C3=CC=C(C=C3)Cl)C=CC(=C2)OC)CC(=O)O 0 198 | CC(C1=CC=C(C=C1)N2CC3=CC=CC=C3C2=O)C(=O)O 0 199 | C1CN(CCC1NC(=O)C2=CC=CC=C2)CCC3=CNC4=CC=CC=C43 0 200 | C/C=C(/C)C(=O)O[C@H]1C(C)=C[C@]23C(=O)[C@@H](C=C(CO)[C@@H](O)[C@]12O)[C@H]1[C@@H](C[C@H]3C)C1(C)C 0 201 | CC(=O)NCC1=C(C(=C(C(=C1I)C(=O)O)I)NC(=O)C)I 0 202 | CCOC(=O)CCCCCCCCC(C)C1=CC=CC=C1I 0 203 | CC(C)NNC(=O)C1=CC=NC=C1 0 204 | CC(C)Nc1ncccn1 0 205 | CC1=CC(=NO1)C(=O)NNCC2=CC=CC=C2 0 206 | CC(C)NCC(C1=CC(=C(C=C1)O)O)O 0 207 | CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C 0 208 | CC1=CC(=NO1)NC(=O)C2=C(C3=CC=CC=C3S(=O)(=O)N2C)O 0 209 | CC(=O)N1CCN(CC1)C2=CC=C(C=C2)OCC3COC(O3)(CN4C=CN=C4)C5=C(C=C(C=C5)Cl)Cl 0 210 | CC(C1=CC(=CC=C1)C(=O)C2=CC=CC=C2)C(=O)O 0 211 | C1CN2C(=CC=C2C(=O)C3=CC=CC=C3)C1C(=O)O 0 212 | CS(=O)(=O)C1=CC(=CC2=C1N(C3=C2CCC3CC(=O)O)CC4=CC=C(C=C4)Cl)F 0 213 | C1=CC(=CC=C1C#N)C(C2=CC=C(C=C2)C#N)N3C=NC=N3 0 214 | CCC(C(CC(C)N(C)C)(C1=CC=CC=C1)C2=CC=CC=C2)OC(=O)C 0 215 | CC(CC1=CC=CC=C1)N 0 216 | C1CSC2=NC(CN21)C3=CC=CC=C3 0 217 | C1=CC(=C(C=C1C(CN)O)O)O 0 218 | C1(C(C(C(C(C1Cl)Cl)Cl)Cl)Cl)Cl 0 219 | CN(C)C(=O)C(CCN1CCC(CC1)(C2=CC=C(C=C2)Cl)O)(C3=CC=CC=C3)C4=CC=CC=C4 0 220 | CC1=CC(=C(C=C1)NC2=C(C=CC=C2Cl)F)CC(=O)O 0 221 | CC12CCC3C(C1CCC2(C#C)O)CCC4=CCCCC34 0 222 | C1CN2C(=N1)C3=CC=CC=C3C2(C4=CC=C(C=C4)Cl)O 0 223 | CC(C1=CC=CC=C1)NN 0 224 | CC1=CC(=CC=C1)CN2CCN(CC2)C(C3=CC=CC=C3)C4=CC=C(C=C4)Cl 0 225 | CN(C)CC(OC1=CC=CC=C1)OC2=CC=CC=C2 0 226 | CC(CC1=CC=CC=C1)NCCCCl 0 227 | CC1=CC2C(CCC3(C2CCC3(C(=O)C)OC(=O)C)C)C4(C1=CC(=O)CC4)C 0 228 | CCN(CC)CCCC(C)NC1=C2C=C(C=CC2=NC3=C1C=CC(=C3)Cl)OC 0 229 | CN1CCCC(C1)CN2C3=CC=CC=C3SC4=CC=CC=C42 0 230 | CC1=CC=CC=C1OCC(CO)O 0 231 | CCCC(C)(COC(=O)N)COC(=O)N 0 232 | O=C(Nc1ccccc1)c1cc(Br)cc(Br)c1O 0 233 | CC(CC1=CC=CC=C1)NC 0 234 | [C@H]23[C@@H]([C@@]1(C(=CC(=O)C=C1)CC2)C)CC[C@]4([C@H]3CC[C@]4(C)O)C 0 235 | CN(C)CCN(CC1=CC=CS1)C2=CC=CC=N2 0 236 | CC1=CC=CC=C1N2C(=NC3=CC=CC=C3C2=O)C 0 237 | CN1CCC2=CC(=C(C=C2C1CCC3=CC=C(C=C3)Cl)OC)OC 0 238 | COC(F)(F)C(Cl)Cl 0 239 | COC(=O)C(C1CCCCN1)C2=CC=CC=C2 0 240 | CCC1(C(=O)C(CNC1=O)C)CC 0 241 | CC1=CC(=C(C(=C1OC(=O)C)C)C)OCC(CNC(C)C)O 0 242 | CC(=O)NC1=C(C(=C(C(=C1I)C(=O)NC2C(C(C(OC2O)CO)O)O)I)N(C)C(=O)C)I 0 243 | CN1CCN2C(C1)C3=CC=CC=C3CC4=CC=CC=C42 0 244 | CC(C)C1C2=C(CCC1(CCN(C)CCCC3=NC4=CC=CC=C4N3)OC(=O)COC)C=C(C=C2)F 0 245 | CCCCN1CC(C(C(C1CO)O)O)O 0 246 | CC1=CC(=NN=C1NCCN2CCOCC2)C3=CC=CC=C3 0 247 | CN(C)C1C2CC3CC4=C(C=CC(=C4C(=C3C(=O)C2(C(=C(C1=O)C(=O)N)O)O)O)O)N(C)C 0 248 | CCOC(=NC1=C[N+](=NO1)N2CCOCC2)[O-] 0 249 | CC1=CC(=C(C=C1OC(=O)C)C(C)C)OCCN(C)C 0 250 | CC(C1=CC(=C(C=C1)Cl)Cl)N2C(=O)CC(=N2)N 0 251 | CCCCCCCCCC(=O)OC1CCC2C1(CCC3C2CCC4=CC(=O)CCC34)C 0 252 | CC12CCC3C(C1CCC2OC(=O)CCC4=CC=CC=C4)CCC5=CC(=O)CCC35 0 253 | 
CCC1=NN(C(=O)N1CCOC2=CC=CC=C2)CCCN3CCN(CC3)C4=CC(=CC=C4)Cl 0 254 | CC1=C2C(=NC=C1)N(C3=C(C=CC=N3)C(=O)N2)C4CC4 0 255 | C1=CC=C(C=C1)CNC(=O)CCNNC(=O)C2=CC=NC=C2 0 256 | CO[C@]12C[C@@H](COC(=O)c3cncc(Br)c3)CN(C)[C@@H]1Cc1cn(C)c3cccc2c13 0 257 | CC1=C(C(C(=C(N1)C)C(=O)OC)C2=CC=CC=C2[N+](=O)[O-])C(=O)OC 0 258 | C1=CC(=CC=C1C(=O)NN=CC2=CC=C(O2)[N+](=O)[O-])O 0 259 | CCN(CC)C(=O)C1=CN=CC=C1 0 260 | CS(=O)(=O)NC1=C(C=C(C=C1)[N+](=O)[O-])OC2=CC=CC=C2 0 261 | CN(C)CC1=CC=C(O1)CSCCNC(=C[N+](=O)[O-])NCC2=CC3=C(C=C2)OCO3 0 262 | CC1=NC(=CN1C2=CC=C(C=C2)[N+](=O)[O-])[N+](=O)[O-] 0 263 | C1=C(OC(=C1)[N+](=O)[O-])C=NNC(=O)N 0 264 | C1=C(OC(=C1)[N+](=O)[O-])C=CC(=NN=C(N)N)C=CC2=CC=C(O2)[N+](=O)[O-] 0 265 | C1=CC2=C(C=CC(=C2N=C1)O)[N+](=O)[O-] 0 266 | CN1CC(C2=C(C1)C(=CC=C2)N)C3=CC=CC=C3 0 267 | CN1CCC2=CC3=C(C(=C2C1C4C5=C(C(=C(C=C5)OC)OC)C(=O)O4)OC)OCO3 0 268 | CC1=C(C=CC2=C1OC(=O)C(=C2O)NC(=O)C3=CC(=C(C=C3)O)CC=C(C)C)OC4C(C(C(C(O4)(C)C)OC)OC(=O)N)O 0 269 | CC1=CN=C(C(=C1OC)C)CS(=O)C2=NC3=C(N2)C=C(C=C3)OC 0 270 | CC(C)NCC(C1=CC(=CC(=C1)O)O)O 0 271 | CCC(CC)(C1=CC=CC=C1)C(=O)OCCOCCN(CC)CC 0 272 | CCN(CC)CCC1=NC(=NO1)C2=CC=CC=C2 0 273 | CC(CN1C2=CC=CC=C2S(=O)(=O)C3=CC=CC=C31)CN(C)C 0 274 | COc1ccc2c3c1O[C@H]1C(=O)CC[C@@]4(O)[C@@H](C2)N(C)CC[C@]314 0 275 | CCCCC1C(=O)N(N(C1=O)C2=CC=C(C=C2)O)C3=CC=CC=C3 0 276 | CC(=O)OC1=CC=C(C=C1)C2(C3=CC=CC=C3NC2=O)C4=CC=C(C=C4)OC(=O)C 0 277 | C1=CC=C2C(=C1)C(C(=O)N2)(C3=CC=C(C=C3)O)C4=CC=C(C=C4)O 0 278 | CN(C)CC(=O)OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(=O)O 0 279 | C1C(C(C(C(C1N)OC2C(C(C(C(O2)CO)O)O)N)OC3C(C(C(O3)CO)OC4C(C(C(C(O4)CN)O)O)N)O)O)N 0 280 | CCC(=O)NS(=O)(=O)C1=CC=C(C=C1)C2=C(ON=C2C3=CC=CC=C3)C 0 281 | CN(CC#C)CC1=CC=CC=C1 0 282 | C1=CC=C(C=C1)C2C(=O)N=C(O2)N 0 283 | CCCC(C)C1(C(=O)NC(=O)NC1=O)CC 0 284 | O[C@@H]1CO[C@@H](O[C@@H]2CO[C@@H](O)[C@H](OS(O)(=O)=O)[C@H]2OS(O)(=O)=O)[C@H](OS(O)(=O)=O)[C@H]1OS(O)(=O)=O 0 285 | C1CCC2=NN=NN2CC1 0 286 | CCCN1CC(CC2C1CC3=CNC4=CC=CC2=C34)CSC 0 287 | C1CCC(CC1)C(CC2CCCCN2)C3CCCCC3 0 288 | CCOC1=CC=C(C=C1)NC(=O)C 0 289 | CC1=CC(=O)N(N1C)C2=CC=CC=C2 0 290 | C1=CC=C(C=C1)N=NC2=C(N=C(C=C2)N)N 0 291 | CC1C(OCCN1C)C2=CC=CC=C2 0 292 | C1=CC=C(C=C1)CCN=C(N)N=C(N)N 0 293 | C1=CC=C(C=C1)NNC(=O)N 0 294 | CC1C(OCCN1)C2=CC=CC=C2 0 295 | CCC1(C(=O)NC(=O)NC1=O)C2=CC=CC=C2 0 296 | C1=CC=C(C=C1)O 0 297 | C1=CC=C2C(=C1)C(=O)OC2(C3=CC=C(C=C3)O)C4=CC=C(C=C4)O 0 298 | CC(COC1=CC=CC=C1)NN 0 299 | CC(C)(CC1=CC=CC=C1)N 0 300 | CCCCC1C(=O)N(N(C1=O)C2=CC=CC=C2)C3=CC=CC=C3 0 301 | CNCC(C1=CC(=CC=C1)O)O 0 302 | CC(C(C1=CC=CC=C1)O)N 0 303 | CC(C(C1=CC=CC=C1)O)N 0 304 | [H][C@@](C)(N)[C@]([H])(O)C1=CC=CC=C1 0 305 | C1=CC=C(C(=C1)C(=O)NC2=CC=C(C=C2)S(=O)(=O)NC3=NC=CS3)C(=O)O 0 306 | CC(=NO)C1=CC=C(C=C1)OCC(=O)N2CCCCC2 0 307 | CCC1=CN=C(C=C1)CCOC2=CC=C(C=C2)CC3C(=O)NC(=O)S3 0 308 | C1CN(CCC1C(=O)N)CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)Cl 0 309 | CC[N+]1(CCCC(C1)OC(=O)C(C2=CC=CC=C2)(C3=CC=CC=C3)O)C 0 310 | C1CNCCN1 0 311 | C1CCNC(C1)C(C2=CC=CC=C2)(C3=CC=CC=C3)O 0 312 | CC(C1=CC(=C(C=C1)N2CC=CC2)Cl)C(=O)O 0 313 | C1C(O1)CCl 0 314 | CCCCCCC(CC=CCCCCCCCOC(=O)OCCOCC(COCCOC(=O)OCCCCCCCC=CCC(CCCCCC)O)OCCOC(=O)OCCCCCCCC=CCC(CCCCCC)O)O 0 315 | C=CN1CCCC1=O 0 316 | CC(C)NCC(COC1=CC=C(C=C1)NC(=O)C)O 0 317 | CCCNC1CCC2=C(C1)SC(=N2)N 0 318 | CC(CC1=CC=CC=C1)NCCC(C2=CC=CC=C2)C3=CC=CC=C3 0 319 | CC(C)(C)C1=CC(=CC(=C1O)C(C)(C)C)SC(C)(C)SC2=CC(=C(C(=C2)C(C)(C)C)O)C(C)(C)C 0 320 | CCCN(CCC)C(=O)C(CCC(=O)O)NC(=O)C1=CC=CC=C1 0 321 | CC(C)NCC(C1=CC2=CC=CC=C2C=C1)O 0 322 | CCCOC(=O)CC1=CC(=C(C=C1)OCC(=O)N(CC)CC)OC 0 323 | CC(C)C1=C(C(=CC=C1)C(C)C)O 0 324 | 
CCC(=O)OC(CC1=CC=CC=C1)(C2=CC=CC=C2)C(C)CN(C)C 0 325 | CC1=C(C(=O)N(N1C)C2=CC=CC=C2)C(C)C 0 326 | CC(CC1(C(=O)NC(=O)NC1=O)CC=C)O 0 327 | CC(C(C1=CC=CC=C1)O)NC 0 328 | C(C1(C(C=CNC1=O)=O)CC)C 0 329 | CC1=NC=C(C(=C1O)CO)CSSCC2=CN=C(C(=C2CO)O)C 0 330 | CCCC(C(=O)C1=CC=C(C=C1)C)N2CCCC2 0 331 | C1CC2CCCN2C1 0 332 | CNC(=C[N+](=O)[O-])NCCSCC1=CC=C(O1)CN(C)C 0 333 | CCC(=O)OC1C(CC2C1(CCC3C2CCC4C3(CC(C(C4)OC(=O)C)N5CCCCC5)C)C)[N+]6(CCCCC6)CC=C 0 334 | CCN1CCCC1CNC(=O)C2=C(C=CC(=C2OC)Br)OC 0 335 | COC(=O)[C@H]1[C@H]2C[C@@H]3c4[nH]c5cc(OC)ccc5c4CCN3C[C@H]2C[C@@H](OC(=O)c2cc(OC)c(OC)c(OC)c2)[C@@H]1OC 0 336 | [C@@H]2([N]1N=C(C(N)=O)N=C1)O[C@H](CO)[C@H]([C@H]2O)O 0 337 | CC1=C(N(N=C1C(=O)NN2CCCCC2)C3=C(C=C(C=C3)Cl)Cl)C4=CC=C(C=C4)Cl 0 338 | CS(=O)(=O)C1=CC=C(C=C1)C2=C(C(=O)OC2)C3=CC=CC=C3 0 339 | CN(CCOC1=CC=C(C=C1)CC2C(=O)NC(=O)S2)C3=CC=CC=N3 0 340 | CC1C2CCC3(C=CC(=O)C(=C3C2OC1=O)C)C 0 341 | CCCC(C)C1(C(=O)NC(=O)NC1=O)CC=C 0 342 | CC(CC1=CC=CC=C1)N(C)CC#C 0 343 | C1CN(CCC1C2=CN(C3=C2C=C(C=C3)Cl)C4=CC=C(C=C4)F)CCN5CCNC5=O 0 344 | CC(C)CC(C1(CCC1)C2=CC=C(C=C2)Cl)N(C)C 0 345 | CC1=CC2=C(C=C1CC(=O)C3=C(C=CS3)S(=O)(=O)NC4=C(C(=NO4)C)Cl)OCO2 0 346 | C1=C(C(=O)NC(=O)N1C2C(C(C(O2)CO)O)O)C=CBr 0 347 | CC(C)NCC(C1=CC=C(C=C1)NS(=O)(=O)C)O 0 348 | CC1CN(CC(N1)C)C2=C(C(=C3C(=C2F)N(C=C(C3=O)C(=O)O)C4CC4)N)F 0 349 | C1CCN2CC3CC(C2C1)CN4C3CCCC4 0 350 | C1CN2CC3=CCOC4CC(=O)N5C6C4C3CC2C61C7=CC=CC=C75 0 351 | C1=CC(=CC=C1N)S(=O)(=O)NC(=O)N 0 352 | CC(=CC(=O)NS(=O)(=O)C1=CC=C(C=C1)N)C 0 353 | COC1=NC(=NC(=C1)NS(=O)(=O)C2=CC=C(C=C2)N)OC 0 354 | CC1=CC(=NC(=N1)NS(=O)(=O)C2=CC=C(C=C2)N)C 0 355 | C1=CC(=CC=C1N)S(=O)(=O)N=C(N)N 0 356 | CC1=NN=C(S1)NS(=O)(=O)C2=CC=C(C=C2)N 0 357 | COC1=NN=C(C=C1)NS(=O)(=O)C2=CC=C(C=C2)N 0 358 | COC1=CN=C(N=C1)NS(=O)(=O)C2=CC=C(C=C2)N 0 359 | C1=CC(=CC=C1N)S(=O)(=O)N 0 360 | C1=CC(=CC=C1N)S(=O)(=O)NC2=NC=CS2 0 361 | CC1=CC(=NC(=N1)C)NS(=O)(=O)C2=CC=C(C=C2)N 0 362 | CCCCCCCCNC(C)C(C1=CC=C(C=C1)SC(C)C)O 0 363 | CC(C(C1=CC=C(C=C1)O)O)NC 0 364 | CC(C1=CC=C(C=C1)C(=O)C2=CC=CS2)C(=O)O 0 365 | CCCCC1(C(=O)N(N(C1=O)C2=CC=CC=C2)C3=CC=CC=C3)COC(=O)CCC(=O)O 0 366 | CCCCCN=C(N)NN=CC1=CNC2=C1C=C(C=C2)OC 0 367 | CCC[C@H](NC(=O)[C@@H]1[C@H]2CCC[C@H]2CN1C(=O)[C@@H](NC(=O)[C@@H](NC(=O)c3cnccn3)C4CCCCC4)C(C)(C)C)C(=O)C(=O)NC5CC5 0 368 | CC1CN(CCN1)C2=C(C=C3C(=C2)N(C=C(C3=O)C(=O)O)C4=C(C=C(C=C4)F)F)F 0 369 | CN1C2=C(C=C(C=C2)Cl)C(=NC(C1=O)O)C3=CC=CC=C3 0 370 | CC(C)N1CCN(CC1)C2=CC=C(C=C2)OCC3COC(O3)(CN4C=NC=N4)C5=C(C=C(C=C5)Cl)Cl 0 371 | CC(C)(C)C1=CC=C(C=C1)C(CCCN2CCC(CC2)C(C3=CC=CC=C3)(C4=CC=CC=C4)O)O 0 372 | CC(CC(C1=CC=CC=C1)C2=CC=CC=C2)NC(C)(C)C 0 373 | CCC(=O)OC1CCC2C1(CCC3C2CCC4=CC(=O)CCC34C)C 0 374 | C1=CC=C(C=C1)NC(=O)C2=C(C(=C(C(=C2Cl)Cl)Cl)Cl)O 0 375 | CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O 0 376 | CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CCCCC3 0 377 | C1CC(=O)NC(=O)C1N2C(=O)C3=CC=CC=C3C2=O 0 378 | CN1CCC(CC1)N(CC2=CC=CS2)C3=CC=CC=C3 0 379 | CC[C@H](C)[C@]1(CC)C(=NC(=S)NC1=O)[O-] 0 380 | CN1CCCCC1CCN2C3=CC=CC=C3SC4=C2C=C(C=C4)SC 0 381 | C1=CSC(=C1)C(=O)C2=C(C(=C(C=C2)OCC(=O)O)Cl)Cl 0 382 | CC1=CC(=C(C2=C1C=CC=N2)O)Br 0 383 | C1C(NCS1)C(=O)O 0 384 | CC1=C(C(=CC=C1)C)NC(=O)C(C)N 0 385 | CC1=CC=C(C=C1)C(=O)C2=CC(=C(C(=C2)O)O)[N+](=O)[O-] 0 386 | CN(CC(=O)O)C(=S)C1=CC=CC2=C1C=CC(=C2C(F)(F)F)OC 0 387 | [C@H]2(C1=CC(=CC=C1N([C@H](CC)C2)C(=O)OCC)C(F)(F)F)N(C(=O)OC)CC3=CC(=CC(=C3)C(F)(F)F)C(F)(F)F 0 388 | C1C(C1N)C2=CC=CC=C2 0 389 | C1CN(CCN1CCCN2C(=O)N3C=CC=CC3=N2)C4=CC(=CC=C4)Cl 0 390 | CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C 0 391 | 
CC(=O)N1C2=CC=CC=C2C(C1=O)(C3=CC=C(C=C3)OC(=O)C)C4=CC=C(C=C4)OC(=O)C 0 392 | CC1=NN=C2N1C3=C(C=C(C=C3)Cl)C(=NC2)C4=CC=CC=C4Cl 0 393 | O=C(Nc1ccc(Br)cc1)c1cc(Br)cc(Br)c1O 0 394 | CC(Cl)(Cl)Cl 0 395 | CN(C)CCOC1=CC=C(C=C1)CNC(=O)C2=CC(=C(C(=C2)OC)OC)OC 0 396 | CCN(CC)CCOC1=CC=C(C=C1)C(CC2=CC=C(C=C2)Cl)(C3=CC=C(C=C3)C)O 0 397 | CC1=C(C2=C(CCC(O2)(C)COC3=CC=C(C=C3)CC4C(=O)NC(=O)S4)C(=C1O)C)C 0 398 | C1C2C(C2N)CN1C3=C(C=C4C(=O)C(=CN(C4=N3)C5=C(C=C(C=C5)F)F)C(=O)O)F 0 399 | C1=CC=C2C(=C1)C(=CN2)CC(C(=O)O)N 0 400 | CC(=O)O[C@]1(C(C)=O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3[C@@H](c3ccc(N(C)C)cc3)C[C@@]21C 0 401 | CCOC(=O)N 0 402 | CC1=C(C(=NO1)C2=CC=CC=C2)C3=CC=C(C=C3)S(=O)(=O)N 0 403 | COC1=CC(=CC(=C1OC)C(=O)NCC2CCCN2CC=C)S(=O)(=O)N 0 404 | CCC=C(C)C1(C(=O)NC(=O)NC1=O)CC 0 405 | CCC12CCCN3C1C4=C(CC3)C5=CC=CC=C5N4C(C2)(C(=O)OC)O 0 406 | C=CCl 0 407 | [C@]2(OC1=C(C(=C(C(=C1CC2)C)O)C)C)(CCC[C@@H](CCC[C@@H](CCCC(C)C)C)C)C 0 408 | CCOC(C(=O)C1=CC=C(C=C1)C2=CC=CC=C2)NC3=CC=C(C=C3)C(=O)O 0 409 | CCOC(=O)CNC(C1CCCCC1)C(=O)N2CCC2C(=O)NCC3=CC=C(C=C3)C(=NO)N 0 410 | CN(C)CC=C(C1=CC=C(C=C1)Br)C2=CN=CC=C2 0 411 | COC(CN1CCN(CC1)CC(C(C2=CC=CC=C2)OC)O)C3=CC=CC=C3 0 412 | CC1=C(N(C(=C1)CC(=O)O)C)C(=O)C2=CC=C(C=C2)Cl 0 413 | CN1CCN(CC1)C(=O)OC2C3=NC=CN=C3C(=O)N2C4=NC=C(C=C4)Cl 0 414 | 415 | 416 | -------------------------------------------------------------------------------- /Tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fa1cbc05", 6 | "metadata": {}, 7 | "source": [ 8 | "# D-GCAN Deep Dive" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "d4a486bf", 14 | "metadata": {}, 15 | "source": [ 16 | "In this tutorial, we take a deep dive into D-GCAN and show how it builds a drug-likeness prediction model from scratch.\n", 17 | "\n", 18 | "Let's start!" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "d7cacf2e", 24 | "metadata": {}, 25 | "source": [ 26 | "## Part I: Overview of D-GCAN and Data" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "43a44a94", 32 | "metadata": {}, 33 | "source": [ 34 | "Drug-likeness has been widely used as a criterion to distinguish drug-like molecules from non-drugs. Developing reliable computational methods to predict the drug-likeness of compounds is crucial to triage unpromising molecules and accelerate the drug discovery process. In this study, a deep learning method was developed to predict drug-likeness based on the graph convolutional attention network (D-GCAN) directly from molecular structures. The model combines the advantages of graph convolution and the attention mechanism. Results showed that D-GCAN outperformed other state-of-the-art models for drug-likeness prediction. The molecular graph was used as the encoding method for drug-likeness prediction.\n", 35 | "\n", 36 | "A dataset with enough drugs and non-drugs is a prerequisite for training accurate deep neural network models for drug-likeness prediction. In this study, the D-GCAN model was trained on the dataset released by Beker, which consists of drug and non-drug sets (abbreviated as Drugs and Non-drugs). The Drugs set includes 2136 FDA small-molecule drugs assembled from Drugbank. The Non-drugs set was chosen from ZINC15. Compounds with a maximum fingerprint-based Tanimoto similarity to drugs above 0.85 were removed, and standard binary classification was used to iteratively refine a reliable negative set. 
Since the negative set is much larger than the positive set, it was randomly down-sampled to create a balanced dataset for model training. The dataset was randomly divided into training, validation, and test sets at a ratio of 8:1:1. In addition, two independent datasets, the non-US dataset and the bRo5 dataset, were used to test the performance of the model. The non-US dataset comprises 1281 worldwide drugs from Drugbank and an equal number of non-drugs from ZINC15. The bRo5 dataset includes 135 FDA and non-US drugs beyond Ro5 space (bRo5). The GDB-13 database was used to test the ability of D-GCAN to screen large-scale data. It consists of about 977 million drug-like small molecules according to Lipinski’s rule. All molecules contain up to 13 heavy atoms and are stored as canonical SMILES. None of the independent test datasets or the validation set was used in the training process.\n" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "e597ea85", 42 | "metadata": {}, 43 | "source": [ 44 | "## Part II: Training the model" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 1, 50 | "id": "07e8ed86", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "import train" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "id": "dcca4e9f", 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "The code uses a GPU!\n", 68 | "----------------------------------------------------------------------------------------------------\n", 69 | "Just a moment......\n", 70 | "----------------------------------------------------------------------------------------------------\n", 71 | "../dataset/data_train.txt\n", 72 | "../dataset/data_test.txt\n", 73 | "The preprocess has finished!\n", 74 | "# of training data samples: 3802\n", 75 | "# of test data samples: 428\n", 76 | "----------------------------------------------------------------------------------------------------\n", 77 | "Creating a model.\n", 78 | "# of model parameters: 311698\n", 79 | "----------------------------------------------------------------------------------------------------\n", 80 | "Start training.\n", 81 | "The result is saved in the output directory every epoch!\n", 82 | "The training will finish in about 0 hours 21 minutes.\n", 83 | "----------------------------------------------------------------------------------------------------\n", 84 | "Epoch\tTime(sec)\tLoss_train\tLoss_test\tAUC_train\tAUC_test\n", 85 | "1\t9.334350300000011\t318.02376973629\t33.23613902926445\t0.6330387783115992\t0.5116822429906542\n", 86 | "2\t16.232699300000007\t275.59704649448395\t28.36697283387184\t0.7726390395642837\t0.5233644859813084\n", 87 | "3\t22.533227600000004\t258.0953543186188\t26.692712754011154\t0.8227923594659818\t0.530373831775701\n", 88 | "4\t30.035969300000005\t244.29262351989746\t25.99587020277977\t0.8555440089003034\t0.5934579439252337\n", 89 | "5\t36.20996820000002\t235.61571648716927\t26.206634640693665\t0.8711811167445258\t0.544392523364486\n", 90 | "6\t42.15839070000001\t235.10905063152313\t24.74921104311943\t0.8782201330617486\t0.5771028037383178\n", 91 | "7\t48.091756000000004\t230.60763642191887\t24.482909947633743\t0.8858224754798858\t0.6308411214953271\n", 92 | "8\t53.9427063\t225.9473716020584\t25.168260991573334\t0.894445889698231\t0.5794392523364486\n", 93 | "9\t59.887297399999994\t220.88472372293472\t23.143073588609695\t0.9038094737308211\t0.6845794392523364\n", 94 | 
"10\t65.88160690000001\t220.29008296132088\t23.10640263557434\t0.9080117951159034\t0.6542056074766355\n", 95 | "11\t71.7950055\t220.04156962037086\t23.67304638028145\t0.905873895764607\t0.7289719626168224\n", 96 | "12\t77.81804110000002\t214.85031658411026\t23.34747040271759\t0.9159177330794608\t0.8200934579439252\n", 97 | "13\t83.67658890000001\t212.33444252610207\t23.14932319521904\t0.9178444716275156\t0.735981308411215\n", 98 | "14\t89.6064955\t211.54040449857712\t22.778073489665985\t0.9219235005645715\t0.7593457943925234\n", 99 | "15\t95.5943882\t208.26400744915009\t22.901916056871414\t0.9267551530985012\t0.7663551401869159\n", 100 | "16\t101.6206004\t209.3945328295231\t23.913705557584763\t0.9246417461863752\t0.6915887850467289\n", 101 | "17\t107.65320110000002\t206.03158766031265\t23.282782286405563\t0.930114906901056\t0.7313084112149533\n", 102 | "18\t113.61952920000002\t207.53857171535492\t22.225304275751114\t0.9226543992295261\t0.8037383177570093\n", 103 | "19\t120.54484979999998\t204.79183167219162\t23.475462794303894\t0.926265719441185\t0.735981308411215\n", 104 | "20\t126.48899880000002\t205.36031165719032\t22.78501933813095\t0.9291473863661524\t0.8107476635514018\n", 105 | "21\t132.3617932\t202.3321330845356\t23.385528802871704\t0.9302150906635376\t0.7897196261682243\n", 106 | "22\t138.31940950000003\t202.55410113930702\t23.08760157227516\t0.9293607611751943\t0.7686915887850467\n", 107 | "23\t144.17849070000003\t198.95897144079208\t22.36356022953987\t0.9377836694932141\t0.8294392523364486\n", 108 | "24\t150.04248400000003\t197.13710144162178\t23.3654263317585\t0.9351247869019417\t0.8084112149532711\n", 109 | "25\t155.89731129999998\t256.36723348498344\t22.31619429588318\t0.7880461675559591\t0.8411214953271028\n", 110 | "26\t161.73105990000002\t199.1333883702755\t22.19395723938942\t0.9381437221865521\t0.8200934579439252\n", 111 | "27\t167.57376660000003\t195.61116680502892\t21.885735362768173\t0.9357744592290832\t0.8621495327102804\n", 112 | "28\t173.4351409\t196.21020331978798\t21.808892458677292\t0.937718217946731\t0.8808411214953271\n", 113 | "29\t179.35259220000003\t196.93134278059006\t22.267054110765457\t0.9385652135408595\t0.8481308411214953\n", 114 | "30\t185.28482499999998\t195.89555063843727\t22.040870487689972\t0.9386104622844113\t0.8434579439252337\n", 115 | "31\t191.16947059999998\t194.0237057507038\t22.781775504350662\t0.9417760754533178\t0.8387850467289719\n", 116 | "32\t197.0987102\t193.68072113394737\t22.449314266443253\t0.9423293001527663\t0.8294392523364486\n", 117 | "33\t203.01129849999998\t192.5338954925537\t22.377066612243652\t0.9452480516748955\t0.8621495327102804\n", 118 | "34\t209.6004434\t192.58278796076775\t23.285291463136673\t0.9402174153696283\t0.8714953271028038\n", 119 | "35\t216.03077720000002\t196.01435166597366\t24.061037868261337\t0.9359749651294087\t0.8014018691588785\n", 120 | "36\t222.4218536\t193.9636361002922\t22.313345968723297\t0.936061864857086\t0.8434579439252337\n", 121 | "37\t228.7779303\t192.51033294200897\t22.285043627023697\t0.9435566896185268\t0.8317757009345794\n", 122 | "38\t235.1833634\t188.01407945156097\t22.830698162317276\t0.94882975955897\t0.8014018691588785\n", 123 | "39\t241.73452799999998\t193.91294729709625\t22.711496233940125\t0.9402640478668054\t0.8294392523364486\n", 124 | "40\t248.1350879\t192.2110168337822\t21.79123494029045\t0.9453036785706378\t0.8785046728971962\n", 125 | "41\t254.51809260000002\t189.3926584124565\t23.183754086494446\t0.9464263178869528\t0.822429906542056\n", 126 | 
"42\t260.67076799999995\t197.67854461073875\t24.210958123207092\t0.9356472922709057\t0.8714953271028038\n", 127 | "43\t266.8244926\t195.8016073703766\t22.462971657514572\t0.9351026468439347\t0.8107476635514018\n", 128 | "44\t273.0223882\t191.97943636775017\t23.224840223789215\t0.940738813735692\t0.8247663551401869\n", 129 | "45\t279.0850431\t190.848837941885\t22.82283341884613\t0.9461044567936768\t0.8060747663551402\n", 130 | "46\t285.1555958\t190.04618108272552\t22.433310955762863\t0.9440021199105542\t0.7967289719626168\n", 131 | "47\t291.1587971\t190.5216095149517\t22.426137387752533\t0.9457633615250072\t0.822429906542056\n", 132 | "48\t297.2169235\t185.92078268527985\t22.22221177816391\t0.9480603925432284\t0.8247663551401869\n", 133 | "49\t303.3877255\t187.782156676054\t22.887968957424164\t0.94423528239644\t0.8364485981308412\n", 134 | "50\t309.40409650000004\t187.28414443135262\t21.483285009860992\t0.9458896982310093\t0.8714953271028038\n", 135 | "51\t315.44450670000003\t185.18417713046074\t21.38184556365013\t0.9481489527752565\t0.8785046728971962\n", 136 | "52\t321.7301946\t182.16105404496193\t24.673764526844025\t0.9509743009276684\t0.7920560747663551\n", 137 | "53\t327.84551209999995\t188.7527618408203\t23.513393253087997\t0.9457229559191445\t0.8154205607476636\n", 138 | "54\t334.0279252\t185.709531635046\t21.631520986557007\t0.9463949066796555\t0.8808411214953271\n", 139 | "55\t340.16017209999995\t185.12931755185127\t22.429152816534042\t0.944120984346979\t0.8294392523364486\n", 140 | "56\t346.29319970000006\t182.88407680392265\t21.58257967233658\t0.9498509697345405\t0.8761682242990654\n", 141 | "57\t357.14253529999996\t182.04424741864204\t21.475889027118683\t0.9525705991099698\t0.8808411214953271\n", 142 | "58\t363.8143368\t182.934487760067\t21.883195608854294\t0.949658766355968\t0.8714953271028038\n", 143 | "59\t370.1603794\t184.17358297109604\t21.290808767080307\t0.9471534804171187\t0.883177570093458\n", 144 | "60\t376.4684089\t181.42354640364647\t21.597694754600525\t0.949361674452587\t0.9042056074766355\n", 145 | "61\t382.88365710000005\t187.25566163659096\t21.6785786151886\t0.9452393340270551\t0.866822429906542\n", 146 | "62\t389.3225355000001\t181.59250125288963\t21.666670441627502\t0.9521657127991676\t0.8598130841121495\n", 147 | "63\t396.0132287\t179.8839019536972\t22.01644539833069\t0.9538880709367459\t0.852803738317757\n", 148 | "64\t402.58914460000005\t182.93770709633827\t22.33838379383087\t0.9484886642903003\t0.8504672897196262\n", 149 | "65\t409.2413564000001\t181.58496183156967\t22.23741576075554\t0.950153319901698\t0.8785046728971962\n", 150 | "66\t415.7737618\t182.49673774838448\t21.934344708919525\t0.9507491642128103\t0.8785046728971962\n", 151 | "67\t422.29757770000003\t180.1727076768875\t22.335491836071014\t0.9534794484911551\t0.852803738317757\n", 152 | "68\t428.7057098\t182.3468733727932\t21.559545934200287\t0.953210031660283\t0.8855140186915887\n", 153 | "69\t435.1430776000001\t177.6970148384571\t21.813909739255905\t0.9542144984169858\t0.8785046728971962\n", 154 | "70\t441.59579740000004\t179.4230616092682\t21.47458705306053\t0.9503143888236987\t0.8925233644859814\n", 155 | "71\t447.9559312\t187.4236896932125\t21.997405976057053\t0.9434714503952\t0.8714953271028038\n", 156 | "72\t454.36015280000004\t185.64457353949547\t21.80859535932541\t0.9493346912568912\t0.8995327102803738\n", 157 | "73\t460.8502926\t183.28448390960693\t22.7678345143795\t0.942662231275046\t0.8504672897196262\n", 158 | 
"74\t467.7673221\t181.35295176506042\t21.99883532524109\t0.9524445391546925\t0.8317757009345794\n", 159 | "75\t474.20574880000004\t180.42559936642647\t22.20555028319359\t0.9545189242145815\t0.8621495327102804\n", 160 | "76\t480.7348668000001\t177.4391260445118\t21.684407979249954\t0.9532330019704651\t0.8878504672897196\n", 161 | "77\t488.22190610000007\t187.34195244312286\t22.080274641513824\t0.9482340536232203\t0.8738317757009346\n", 162 | "78\t498.7163915\t184.663908213377\t22.014076620340347\t0.9470819403546837\t0.8691588785046729\n" 163 | ] 164 | }, 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "79\t507.72747260000006\t178.98830798268318\t21.47413921356201\t0.9541025527486882\t0.8598130841121495\n", 170 | "80\t516.8672275\t178.3373854458332\t21.658688694238663\t0.9525628500896672\t0.8901869158878505\n", 171 | "81\t526.2272582\t176.8597036600113\t22.024581998586655\t0.9535130737042532\t0.8644859813084113\n", 172 | "82\t537.2631751\t177.2030012011528\t21.766023725271225\t0.957143212965218\t0.8691588785046729\n", 173 | "83\t548.0884053000001\t176.38141465187073\t21.79708757996559\t0.9572966712422787\t0.8714953271028038\n", 174 | "84\t559.8119843000001\t174.46399101614952\t21.843281388282776\t0.956292066110213\t0.8574766355140186\n", 175 | "85\t569.1060591\t175.65917918086052\t21.409487038850784\t0.9559244027719352\t0.9018691588785047\n", 176 | "86\t576.5913633\t176.77976202964783\t21.49503728747368\t0.9538743717758542\t0.8785046728971962\n", 177 | "87\t584.3266782000001\t179.85141596198082\t21.427332252264023\t0.9514658102154229\t0.9018691588785047\n", 178 | "88\t595.5166383000001\t178.52282038331032\t21.403560250997543\t0.9539066132353267\t0.8644859813084113\n", 179 | "89\t605.9234084000001\t177.05544209480286\t21.411171078681946\t0.9535158412115041\t0.9042056074766355\n", 180 | "90\t613.9635365\t176.31908676028252\t21.366370409727097\t0.955243734363584\t0.8878504672897196\n", 181 | "91\t623.9597695\t176.20382365584373\t21.893729746341705\t0.9558952055704386\t0.8761682242990654\n", 182 | "92\t634.0385619\t175.1233125925064\t22.482848435640335\t0.9574447328802002\t0.8621495327102804\n", 183 | "93\t644.2752579\t176.93210792541504\t21.549375027418137\t0.9527253027652932\t0.8948598130841121\n", 184 | "94\t652.8900821000001\t173.00296890735626\t21.5932075381279\t0.9579943598202227\t0.8855140186915887\n", 185 | "95\t667.6088093000001\t179.9282302260399\t23.50808882713318\t0.9537645017379945\t0.8341121495327103\n", 186 | "96\t676.2838057\t174.56020081043243\t22.648311734199524\t0.9556187315960767\t0.8714953271028038\n", 187 | "97\t685.9453472\t176.7171704173088\t21.891200184822083\t0.9524900646489693\t0.897196261682243\n", 188 | "98\t694.2790274\t178.38612964749336\t22.245244562625885\t0.9540746009254544\t0.8808411214953271\n", 189 | "99\t702.7636145\t177.73075929284096\t24.06598174571991\t0.9527554685943277\t0.897196261682243\n", 190 | "100\t713.3440201000001\t177.71725061535835\t23.09346652030945\t0.9563492151349435\t0.8434579439252337\n", 191 | "101\t722.6864448\t173.766254901886\t22.383565932512283\t0.9588301470099851\t0.8785046728971962\n", 192 | "102\t731.0422685000001\t173.67894527316093\t21.84225881099701\t0.9581018774769188\t0.8855140186915887\n", 193 | "103\t739.0269975\t175.92625331878662\t22.260210156440735\t0.956732791639914\t0.8691588785046729\n", 194 | "104\t746.7896567\t174.88757956027985\t21.68474268913269\t0.95813674806828\t0.8901869158878505\n", 195 | 
"105\t754.1008847\t176.1177335381508\t22.008816480636597\t0.9573001306263422\t0.8785046728971962\n", 196 | "106\t761.2548227000001\t174.49301874637604\t21.83292892575264\t0.9588928310492174\t0.897196261682243\n", 197 | "107\t768.4392893\t172.7093889117241\t21.64242872595787\t0.9596258053446101\t0.8878504672897196\n", 198 | "108\t775.9572361\t177.16424638032913\t21.55477637052536\t0.9525133117098767\t0.9065420560747663\n", 199 | "109\t784.4066813000001\t178.4102607667446\t21.33096119761467\t0.9554701164567051\t0.9042056074766355\n", 200 | "110\t794.4927042\t174.0306807756424\t21.91256058216095\t0.9572584796422166\t0.8714953271028038\n", 201 | "111\t802.6228819\t174.55561447143555\t21.62583690881729\t0.9593071268846725\t0.9088785046728972\n", 202 | "112\t810.7530621000001\t172.49658674001694\t22.11896824836731\t0.9582462029800518\t0.8901869158878505\n", 203 | "113\t818.6795158\t173.48215851187706\t21.262643307447433\t0.9584646976775077\t0.897196261682243\n", 204 | "114\t827.9049124000001\t174.29942700266838\t21.171885669231415\t0.9568379569154472\t0.9088785046728972\n", 205 | "115\t837.2002482\t171.870591878891\t21.214154481887817\t0.9590825436712644\t0.9158878504672897\n", 206 | "116\t845.4472041\t176.74994710087776\t22.71759131550789\t0.9544714614652291\t0.8995327102803738\n", 207 | "117\t853.6806366000001\t184.73798117041588\t21.845041394233704\t0.9432161478513075\t0.8995327102803738\n", 208 | "118\t868.1707754\t177.2356958091259\t21.59891825914383\t0.9571596796333606\t0.9112149532710281\n", 209 | "119\t875.8412158\t176.63044354319572\t21.46969723701477\t0.9547083600859034\t0.9088785046728972\n", 210 | "120\t884.2734249\t172.92364439368248\t21.97585704922676\t0.9586079161777404\t0.9065420560747663\n", 211 | "121\t892.1201223\t180.26964315772057\t21.951832473278046\t0.948673395399296\t0.9018691588785047\n", 212 | "122\t899.5396582999999\t173.81644931435585\t21.459405571222305\t0.9592527453671928\t0.8925233644859814\n", 213 | "123\t906.7295165\t176.33700492978096\t21.565876573324203\t0.9538397779352181\t0.8761682242990654\n", 214 | "124\t914.7522263999999\t172.69166892766953\t21.18627032637596\t0.9598193924768083\t0.9112149532710281\n", 215 | "125\t923.6481627000001\t171.38375091552734\t21.528594940900803\t0.9606258440897115\t0.8995327102803738\n", 216 | "126\t932.1785648\t171.18208953738213\t22.116858184337616\t0.9604675426749618\t0.8785046728971962\n", 217 | "127\t939.5199154\t171.14211875200272\t22.178175538778305\t0.9611412923151859\t0.883177570093458\n", 218 | "128\t946.4239123\t169.92953234910965\t21.887178242206573\t0.9611555449775278\t0.8808411214953271\n", 219 | "129\t954.5059573000001\t174.96560329198837\t22.6447791159153\t0.9555907797728429\t0.8761682242990654\n", 220 | "130\t961.8763295000001\t178.70918104052544\t22.300085812807083\t0.9529475335975381\t0.8808411214953271\n", 221 | "131\t969.1789554\t174.0145247578621\t22.113102048635483\t0.9579599043549494\t0.8808411214953271\n", 222 | "132\t976.3019077000001\t172.59970355033875\t21.501632899045944\t0.9596043571634159\t0.9018691588785047\n", 223 | "133\t984.5187846000001\t172.19679167866707\t22.107417851686478\t0.9594648747979719\t0.8808411214953271\n", 224 | "134\t993.8344443000001\t173.3250037431717\t23.72211918234825\t0.9588380344056502\t0.8341121495327103\n", 225 | "135\t1002.1266977\t173.15374860167503\t22.544156223535538\t0.9600561527221201\t0.8738317757009346\n", 226 | "136\t1009.9465580000001\t171.82129180431366\t22.224039256572723\t0.9606888048796689\t0.8738317757009346\n", 227 | 
"137\t1018.5052494000001\t171.67953670024872\t22.0196373462677\t0.9615298503332077\t0.8901869158878505\n", 228 | "138\t1028.1396465\t172.37993958592415\t22.44371086359024\t0.9589339285318929\t0.8808411214953271\n", 229 | "139\t1037.3752773\t174.99391075968742\t21.463502824306488\t0.9590761784045874\t0.9018691588785047\n", 230 | "140\t1046.0815085\t174.17848363518715\t21.192844033241272\t0.959691256891093\t0.9042056074766355\n", 231 | "auc: 0.9042056074766356\n", 232 | "bacc: 0.9042056074766356\n", 233 | "pre: 0.9303482587064676\n", 234 | "rec: 0.8738317757009346\n", 235 | "f1: 0.9012048192771085\n", 236 | "mcc: 0.8099069874057296\n", 237 | "sp: 0.9345794392523364\n", 238 | "q_: 0.8810572687224669\n", 239 | "acc: 0.9042056074766355\n" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "\n", 245 | "\n", 246 | "tes = train.train('../dataset/data_test.txt', #test set \n", 247 | " radius = 1, #hops of radius subgraph: 1, 2 \n", 248 | " dim = 52, #dimension of graph convolution layers\n", 249 | " layer_hidden = 4, #Number of graph convolution layers\n", 250 | " layer_output = 10, #Number of dense layers\n", 251 | " dropout = 0.45, #drop out rate :0-1\n", 252 | " batch_train = 8, # batch of training set\n", 253 | " batch_test = 8, #batch of test set\n", 254 | " lr =3e-4, #learning rate: 1e-5,1e-4,3e-4, 5e-4, 1e-3, 3e-3,5e-3\n", 255 | " lr_decay = 0.85, #Learning rate decay:0.5, 0.75, 0.85, 0.9\n", 256 | " decay_interval = 25,#Number of iterations for learning rate decay:10,25,30,50\n", 257 | " iteration = 140, #Number of iterations \n", 258 | " N = 5000, #length of embedding: 2000,3000,5000,7000 \n", 259 | " dataset_train='../dataset/data_train.txt') #training set\n", 260 | "\n" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "id": "d4a50302", 266 | "metadata": {}, 267 | "source": [ 268 | "## Part III: To test the performance of the D-GCAN on independent model" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "id": "581857d7", 274 | "metadata": {}, 275 | "source": [ 276 | "We have provided the trained model. 
It can be used directly as follows:\n", 277 | "\n", 278 | "We test the trained model on the bRo5 dataset.\n", 279 | "\n" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "id": "3d3ee166", 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 4, 293 | "id": "68722dcd", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "import predict" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 5, 303 | "id": "97d307e5", 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "The code uses a GPU!\n", 311 | "../dataset/bRo5.txt\n", 312 | "SMILESis error\n", 313 | "bacc: 0.9580740740740741\n", 314 | "pre: 0.9696969696969697\n", 315 | "rec: 0.9481481481481482\n", 316 | "f1: 0.9588014981273408\n", 317 | "mcc: 0.9155786319049269\n", 318 | "sp: 0.968\n", 319 | "q_: 0.9453125\n", 320 | "acc: 0.9576923076923077\n" 321 | ] 322 | } 323 | ], 324 | "source": [ 325 | "test = predict.predict('../dataset/bRo5.txt',\n", 326 | " radius = 1,\n", 327 | " property = True, # True if drug-likeness labels are known\n", 328 | " dim = 52,\n", 329 | " layer_hidden = 4,\n", 330 | " layer_output = 10,\n", 331 | " dropout = 0.45,\n", 332 | " batch_train = 8,\n", 333 | " batch_test = 8,\n", 334 | " lr = 3e-4,\n", 335 | " lr_decay = 0.85,\n", 336 | " decay_interval = 25,\n", 337 | " iteration = 140,\n", 338 | " N = 5000)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "id": "5fe34a0c", 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "# Feedback would also be appreciated; you can send me an email (jinyusun@csu.edu.cn)!" 349 | ] 350 | } 351 | ], 352 | "metadata": { 353 | "kernelspec": { 354 | "display_name": "Python 3 (ipykernel)", 355 | "language": "python", 356 | "name": "python3" 357 | }, 358 | "language_info": { 359 | "codemirror_mode": { 360 | "name": "ipython", 361 | "version": 3 362 | }, 363 | "file_extension": ".py", 364 | "mimetype": "text/x-python", 365 | "name": "python", 366 | "nbconvert_exporter": "python", 367 | "pygments_lexer": "ipython3", 368 | "version": "3.8.8" 369 | } 370 | }, 371 | "nbformat": 4, 372 | "nbformat_minor": 5 373 | } 374 | --------------------------------------------------------------------------------
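A note on the data layout used throughout the repository: the dataset files (for example dataset/withdrawn.txt above) store one molecule per line as a SMILES string followed by a 0/1 drug-likeness label, separated by whitespace. The following is a minimal loading sketch, assuming only that two-column layout; the column names are ours, not the repository's.

import pandas as pd

# Load a whitespace-separated "SMILES label" file; the layout matches
# dataset/withdrawn.txt shown above, and the column names are our own choice.
data = pd.read_csv('dataset/withdrawn.txt', sep=r'\s+', header=None,
                   names=['smiles', 'label'])
print(len(data), 'molecules,', int(data['label'].sum()), 'labeled as drug-like')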
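Part I of the tutorial describes how the negative set was built: ZINC15 compounds whose maximum fingerprint-based Tanimoto similarity to any drug exceeds 0.85 were removed, and the remainder was randomly down-sampled to balance the positive set. The sketch below illustrates that filter with RDKit; it is not the repository's actual preprocessing script, and the fingerprint settings (Morgan, radius 2, 2048 bits) as well as the file names drugs.smi and zinc_candidates.smi are illustrative assumptions, since the tutorial does not specify them.

import random
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

def load_fingerprints(path):
    # One SMILES per line; unparsable entries are skipped.
    pairs = []
    for smi in open(path).read().split():
        mol = Chem.MolFromSmiles(smi)
        if mol is not None:
            pairs.append((smi, AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)))
    return pairs

drugs = load_fingerprints('drugs.smi')                 # positive set (hypothetical file name)
candidates = load_fingerprints('zinc_candidates.smi')  # raw ZINC15 pool (hypothetical file name)
drug_fps = [fp for _, fp in drugs]

# Keep only candidates whose maximum Tanimoto similarity to any drug is <= 0.85.
negatives = [smi for smi, fp in candidates
             if max(DataStructs.BulkTanimotoSimilarity(fp, drug_fps)) <= 0.85]

# Randomly down-sample the negatives to the size of the positive set.
random.seed(0)
negatives = random.sample(negatives, min(len(negatives), len(drugs)))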
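Part III evaluates the trained model on the labeled bRo5 set with property = True. Going by the parameter comment ("True if drug-likeness labels are known"), the same entry point should cover the large-scale screening scenario from Part I (e.g. a GDB-13 subset) by passing property = False when no labels are available. A sketch of that call; my_candidates.txt is a placeholder path, and property = False is our reading of the flag rather than a documented example.

import predict

# Screen an unlabeled SMILES file ('my_candidates.txt' is a placeholder path).
# property=False because no drug-likeness labels are available; the remaining
# hyperparameters mirror the values used for the provided trained model.
screen = predict.predict('my_candidates.txt',
                         radius = 1,
                         property = False,
                         dim = 52,
                         layer_hidden = 4,
                         layer_output = 10,
                         dropout = 0.45,
                         batch_train = 8,
                         batch_test = 8,
                         lr = 3e-4,
                         lr_decay = 0.85,
                         decay_interval = 25,
                         iteration = 140,
                         N = 5000)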