├── DGCAN
    ├── __init__.py
    ├── dict
    │   ├── -atom_dict.pickle
    │   ├── -bond_dict.pickle
    │   ├── -edge_dict.pickle
    │   └── -fingerprint_dict.pickle
    ├── run.py
    ├── train.py
    ├── preprocess.py
    ├── DGCAN.py
    ├── results
    │   └── AUC.txt
    └── predict.py
├── solgan.png
├── screening
    ├── process.png
    ├── COVIDVS-3.ipynb
    ├── Dataset
    │   ├── testset.csv
    │   └── finetunev1.csv
    ├── README.md
    └── DTI.ipynb
├── LICENSE
├── .gitignore
├── README.md
├── Discussion
    ├── preprocess.py
    ├── GPC.py
    ├── svc.py
    ├── RF.py
    ├── CNN.py
    └── GNN.py
├── Test
    └── test.ipynb
├── dataset
    ├── bRo5.txt
    └── withdrawn.txt
└── Tutorial.ipynb


/DGCAN/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/solgan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinYSun/D-GCAN/HEAD/solgan.png


--------------------------------------------------------------------------------
/screening/process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinYSun/D-GCAN/HEAD/screening/process.png


--------------------------------------------------------------------------------
/DGCAN/dict/-atom_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinYSun/D-GCAN/HEAD/DGCAN/dict/-atom_dict.pickle


--------------------------------------------------------------------------------
/DGCAN/dict/-bond_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinYSun/D-GCAN/HEAD/DGCAN/dict/-bond_dict.pickle


--------------------------------------------------------------------------------
/DGCAN/dict/-edge_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinYSun/D-GCAN/HEAD/DGCAN/dict/-edge_dict.pickle


--------------------------------------------------------------------------------
/DGCAN/dict/-fingerprint_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinYSun/D-GCAN/HEAD/DGCAN/dict/-fingerprint_dict.pickle


--------------------------------------------------------------------------------
/DGCAN/run.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Thu Apr 28 09:19:10 2022
 4 | 
 5 | @author: BM109X32G-10GPU-02
 6 | """
 7 | import torch
 8 | import pandas as pd
 9 | import train
10 | import predict
11 | import numpy as np
12 | import rdkit
13 | from rdkit import Chem
14 | from rdkit.Chem import QED 
15 | from rdkit import rdBase, Chem
16 | from rdkit.Chem import PandasTools, QED, Descriptors, rdMolDescriptors
17 | from rdkit.Chem import Lipinski
18 | 
19 | tes = train.train('../dataset/data_test.txt',   
20 |     radius = 1,         
21 |     dim = 52,           
22 |     layer_hidden = 4,   
23 |     layer_output = 10,  
24 |     dropout = 0.45,    
25 |     batch_train = 8, 
26 |     batch_test = 8,     
27 |     lr =3e-4,           
28 |     lr_decay = 0.85,    
29 |     decay_interval = 25, 
30 |     iteration = 140,     
31 |     N = 5000,          
32 |     dataset_train='../dataset/data_train.txt')  
33 | 
34 | 
35 | test = predict.predict('../dataset/nonUS.txt',
36 |     radius = 1,
37 |     property = True,  
38 |     dim = 52 ,
39 |     layer_hidden = 4,
40 |     layer_output = 10,
41 |     dropout = 0.45,
42 |     batch_train = 8,
43 |     batch_test = 8,
44 |     lr = 3e-4,
45 |     lr_decay = 0.85,
46 |     decay_interval = 25 ,
47 |     iteration = 140,
48 |     N = 5000)
49 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2022, JinyuSun
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | *.pyc
  6 | 
  7 | # C extensions
  8 | *.so
  9 | 
 10 | # Distribution / packaging
 11 | .Python
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | pip-wheel-metadata/
 25 | share/python-wheels/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | MANIFEST
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .nox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | *.py,cover
 52 | .hypothesis/
 53 | .pytest_cache/
 54 | 
 55 | # Translations
 56 | *.mo
 57 | *.pot
 58 | 
 59 | # Django stuff:
 60 | *.log
 61 | local_settings.py
 62 | db.sqlite3
 63 | db.sqlite3-journal
 64 | 
 65 | # Flask stuff:
 66 | instance/
 67 | .webassets-cache
 68 | 
 69 | # Scrapy stuff:
 70 | .scrapy
 71 | 
 72 | # Sphinx documentation
 73 | docs/_build/
 74 | 
 75 | # PyBuilder
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | .python-version
 87 | 
 88 | # pipenv
 89 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 90 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 91 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 92 | #   install all needed dependencies.
 93 | #Pipfile.lock
 94 | 
 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 96 | __pypackages__/
 97 | 
 98 | # Celery stuff
 99 | celerybeat-schedule
100 | celerybeat.pid
101 | 
102 | # SageMath parsed files
103 | *.sage.py
104 | 
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 | 
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 | 
118 | # Rope project settings
119 | .ropeproject
120 | 
121 | # mkdocs documentation
122 | /site
123 | 
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 | 
129 | # Pyre type checker
130 | .pyre/


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center"><img src="https://user-images.githubusercontent.com/62410732/165705895-77c97081-7df2-402d-8199-29d1c33027d2.png" alt="logo" width="400px" /></p>
  2 | <h3 align="center">
  3 | <p> Prediction of Drug-likeness using Graph Convolutional Attention Network.<br></h3>
  4 | <h4 align="center">
  5 | -----------------------------------------------------------------------------------------------------------------
  6 | 
  7 | 
  8 | 
  9 | A deep learning method was developed to predict **d**rug-likeness based on the **g**raph **c**onvolutional **a**ttention **n**etwork (D-GCAN) directly from molecular structures. The model combined the advantages of graph convolution and attention mechanism. D-GCAN is a promising tool to predict drug-likeness for selecting potential candidates and accelerating the process of drug discovery by excluding unpromising candidates and avoiding unnecessary biological and clinical testing. 
 10 | 
 11 | 
 12 | 
 13 | ![图片](https://user-images.githubusercontent.com/62410732/143736741-05e00f97-b01c-4130-8faa-562b51c0a4b4.png)
 14 | 
 15 | 
 16 | 
 17 | 
 18 | ## Motivation
 19 | 
 20 | The drug-likeness has been widely used as a criterion to distinguish drug-like molecules from non-drugs. Developing reliable computational methods to predict drug-likeness of candidate compounds is crucial to triage unpromising molecules and accelerate the drug discovery process.
 21 | 
 22 | 
 23 | 
 24 | 
 25 | ## Depends
 26 | 
 27 | [Anaconda for python 3.8](https://www.python.org/)
 28 | 
 29 | [conda install pytorch](https://pytorch.org/)
 30 | 
 31 | [conda install -c conda-forge rdkit](https://rdkit.org/)
 32 | 
 33 | 
 34 | 
 35 | 
 36 | ## Discussion
 37 | 
 38 | The [Discussion](https://github.com/JinYSun/D-GCAN/tree/main/Discussion) folder contains the scripts for evaluating the classification performance.  We compared sevaral common methods widely used in drug-likeness prediction, such as [GNN](https://github.com/JinYSun/D-GCAN/tree/main/Discussion/GNN.py),[RF](https://github.com/JinYSun/D-GCAN/tree/main/Discussion/GNN.py), [CNN](https://github.com/JinYSun/D-GCAN/tree/main/Discussion/RF.py),[SVC](https://github.com/JinYSun/D-GCAN/tree/main/Discussion/SVC.py),and [GPC](https://github.com/JinYSun/D-GCAN/tree/main/Discussion/GPC.py).
 39 | 
 40 | 
 41 | 
 42 | 
 43 | ## Usage
 44 | 
 45 | If you want to retrain the model, please put the molecule's SMILES files in to data directory and run [D-GCAN](https://github.com/JinYSun/D-GCAN/tree/main/DGCAN/DGCAN.py). The test set can be replaced by changing the path. It is recommended to retrain the model before predicting. The process will take less than 15 minutes. It is as simple as
 46 | 
 47 | ```
 48 | import train
 49 | test = train.train('../dataset/bRo5.txt',  
 50 |     radius = 1,         
 51 |     dim = 52,         
 52 |     layer_hidden = 4,  
 53 |     layer_output = 10, 
 54 |     dropout = 0.45,   
 55 |     batch_train = 8,   
 56 |     batch_test = 8,   
 57 |     lr =3e-4,          
 58 |     lr_decay = 0.85,   
 59 |     decay_interval = 25,
 60 |     iteration = 140,    
 61 |     N = 5000,           
 62 |     dataset_train='../dataset/data_train.txt') 
 63 | ```
 64 | 
 65 | If you want to make the prediction of druglikeness of unknown molecule, it can be made as follow
 66 | 
 67 | ```
 68 | import predict
 69 | test = predict.predict('../dataset/bRo5.txt',
 70 |     radius = 1,
 71 |     property = True,   #True if drug-likeness is known 
 72 |     dim = 52 ,
 73 |     layer_hidden = 4,
 74 |     layer_output = 10,
 75 |     dropout = 0.45,
 76 |     batch_train = 8,
 77 |     batch_test = 8,
 78 |     lr = 3e-4,
 79 |     lr_decay = 0.85,
 80 |     decay_interval = 25 ,
 81 |     iteration = 140,
 82 |     N = 5000)
 83 | 
 84 | ```
 85 | 
 86 | or you can run [run.py](https://github.com/JinYSun/D-GCAN/blob/main/DGCAN/run.py) and modify the hyperparameters of the neural network to optimize the model .
 87 | 
 88 | The D-GCAN-screened GDB-13 database [(S-GDB13)](https://doi.org/10.5281/zenodo.7054367) is a more drug-like database and can be used to find new drug candidates.
 89 | 
 90 | #### -Notice-
 91 | 
 92 | As described in paper, the prediction of drug-likeness was deeply influenced by the dataset, especially the negative set. If necessary, retrain the model on your dataset.
 93 | 
 94 | 
 95 | 
 96 | # Contact
 97 | 
 98 | Jinyu Sun E-mail: jinyusun@csu.edu.cn
 99 | 
100 | 
101 | 
102 | # Cite
103 | 
104 | 
105 | 	@article{10.1093/bioinformatics/btac676,
106 | 	author = {Sun, Jinyu and Wen, Ming and Wang, Huabei and Ruan, Yuezhe and Yang, Qiong and Kang, Xiao and Zhang, Hailiang and Zhang, Zhimin and Lu, Hongmei},
107 | 	title = "{Prediction of Drug-likeness using Graph Convolutional Attention Network}",
108 | 	journal = {Bioinformatics},
109 | 	year = {2022},
110 | 	month = {10}
111 | 	}
112 | 


--------------------------------------------------------------------------------
/screening/COVIDVS-3.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "95a57d71",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Virtual Screening for anti-SARS-CoV-2 drugs by COVIDVS"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "4c2a3472",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "COVIDVS models are Chemprop models trained with anti-beta-coronavirus actives/inactives collected from published papers and fine-tuned with anti-SARS-CoV-2 actives/inactives."
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "id": "ebc1985d",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "## Installation"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "markdown",
 29 |    "id": "4ee9c28c",
 30 |    "metadata": {},
 31 |    "source": [
 32 |     "Requirement\n",
 33 |     "conda install -c conda-forge rdkit\n",
 34 |     "pip install git+https://github.com/bp-kelley/descriptastorus\n",
 35 |     "pip install chemprop\n",
 36 |     "model: download https://github.com/pkuwangsw/COVIDVS"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "id": "3e00b470",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "## Dataset"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "id": "f4231a29",
 50 |    "metadata": {},
 51 |    "source": [
 52 |     "traindata.csv: A set of inhibitors against HCoV-OC43, SARS-CoV and MERS-CoV collected from literatures. All the inhibitors were identified by screening libraries including FDA-approved drugs and pharmacologically active compounds. This primary training dataset (Training Set 1) contains 90 positive data and 1862 negative data."
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "id": "8b1e47f5",
 58 |    "metadata": {},
 59 |    "source": [
 60 |     "testset.csv: This dataset (Test Set 1) was derived from Fine-tuning Set 1 by removing repeated molecules in Training Set 1."
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "id": "def004af",
 66 |    "metadata": {},
 67 |    "source": [
 68 |     "finetunev1.csv: This dataset (Fine-tuning Set 1) contains 154 data collocted from literatures, including 70 positive data and 84 negative data. The molecular activities against SARS-CoV-2 of these molecules have been experimentally tested."
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "markdown",
 73 |    "id": "b81c3941",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "## Pretraining"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "id": "fd6fa29f",
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "python train.py --gpu 0 --data_path ./dataset/traindata.csv --features_path ./dataset/traindata-feat.npy --no_features_scaling --save_dir covidvs1/ --dataset_type classification --split_sizes 0.9 0.1 0.0 --num_folds 20 --config_path hyperopt_it20.json "
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "markdown",
 91 |    "id": "da60e7de",
 92 |    "metadata": {},
 93 |    "source": [
 94 |     "## Fine-tuning"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "id": "7fb02ab2",
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "python finetune.py --gpu 0 --data_path ../data/finetunev1.csv --features_path ./dataset/finetunev1-feat.npy --save_dir covidvs2/ --checkpoint_path covidvs1/fold_0/model_0/model.pt --split_sizes 0.9 0.1 0.0 --config_path hyperopt_it20.json --dataset_type classification --init_lr 1e-4 --batch_size 20 --epochs 30"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "id": "6a3f6b8f",
110 |    "metadata": {},
111 |    "source": [
112 |     "## Prediction"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": null,
118 |    "id": "50824d76",
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "python predict.py --gpu 0 --test_path ./dataset/dataset.csv --features_path ./dataset/launched-feat.npy --preds_path preds_covidvs1_launched.csv --checkpoint_dir covidvs1/ --use_compound_names"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "markdown",
127 |    "id": "0ceaff0a",
128 |    "metadata": {},
129 |    "source": [
130 |     "## Acknowledgement"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "markdown",
135 |    "id": "cbceee2b",
136 |    "metadata": {},
137 |    "source": [
138 |     "This project incorporates code from the following repo:\n",
139 |     "    https://github.com/pkuwangsw/COVIDVS"
140 |    ]
141 |   }
142 |  ],
143 |  "metadata": {
144 |   "kernelspec": {
145 |    "display_name": "Python 3 (ipykernel)",
146 |    "language": "python",
147 |    "name": "python3"
148 |   },
149 |   "language_info": {
150 |    "codemirror_mode": {
151 |     "name": "ipython",
152 |     "version": 3
153 |    },
154 |    "file_extension": ".py",
155 |    "mimetype": "text/x-python",
156 |    "name": "python",
157 |    "nbconvert_exporter": "python",
158 |    "pygments_lexer": "ipython3",
159 |    "version": "3.8.8"
160 |   }
161 |  },
162 |  "nbformat": 4,
163 |  "nbformat_minor": 5
164 | }
165 | 


--------------------------------------------------------------------------------
/screening/Dataset/testset.csv:
--------------------------------------------------------------------------------
 1 | name,smiles,isactive
 2 | Salinomycin sodium,CCC(C(=O)O)C1CCC(C)C(C(C)C(O)C(C)C(=O)C(CC)C2OC3(C=CC(O)C4(CCC(C)(C5CCC(O)(CC)C(C)O5)O4)O3)C(C)CC2C)O1,1
 3 | LDK378,Cc1cc(Nc2ncc(Cl)c(Nc3ccccc3S(=O)(=O)C(C)C)n2)c(OC(C)C)cc1C1CCNCC1,1
 4 | Isoosajin,CC1(C)C=Cc2c(c3c(c4c(=O)c(-c5ccc(O)cc5)coc24)OC(C)(C)CC3)O1,1
 5 | Osimertinib mesylate,C=CC(=O)Nc1cc(Nc2nccc(-c3cn(C)c4ccccc34)n2)c(OC)cc1N(C)CCN(C)C,1
 6 | Bazedoxifene,Cc1c(-c2ccc(O)cc2)n(Cc2ccc(OCCN3CCCCCC3)cc2)c2ccc(O)cc12,1
 7 | Lusutrombopag,CCCCCCOC(C)c1cccc(-c2csc(NC(=O)c3cc(Cl)c(C=C(C)C(=O)O)c(Cl)c3)n2)c1OC,1
 8 | Osajin,CC(C)=CCc1c2c(c3occ(-c4ccc(O)cc4)c(=O)c3c1O)C=CC(C)(C)O2,1
 9 | Dronedarone HCl,CCCCc1oc2ccc(NS(C)(=O)=O)cc2c1C(=O)c1ccc(OCCCN(CCCC)CCCC)cc1,1
10 | Ciclesonide,CC(C)C(=O)OCC(=O)C12OC(C3CCCCC3)OC1CC1C3CCC4=CC(=O)C=CC4(C)C3C(O)CC12C,1
11 | Isopomiferin,CC1(C)C=Cc2c(c3c(c4c(=O)c(-c5ccc(O)c(O)c5)coc24)OC(C)(C)CC3)O1,1
12 | Anidulafungin,CCCCCOc1ccc(-c2ccc(-c3ccc(C(=O)NC4CC(O)C(O)NC(=O)C5C(O)C(C)CN5C(=O)C(C(C)O)NC(=O)C(C(O)C(O)c5ccc(O)cc5)NC(=O)C5CC(O)CN5C(=O)C(C(C)O)NC4=O)cc3)cc2)cc1,1
13 | Ivacaftor,CC(C)(C)c1cc(C(C)(C)C)c(NC(=O)c2c[nH]c3ccccc3c2=O)cc1O,1
14 | Droloxifene,CCC(=C(c1ccc(OCCN(C)C)cc1)c1cccc(O)c1)c1ccccc1,1
15 | Abemaciclib,CCN1CCN(Cc2ccc(Nc3ncc(F)c(-c4cc(F)c5nc(C)n(C(C)C)c5c4)n3)nc2)CC1,1
16 | Gilteritinib,CCc1nc(C(N)=O)c(Nc2ccc(N3CCC(N4CCN(C)CC4)CC3)c(OC)c2)nc1NC1CCOCC1,1
17 | Ebastine,CC(C)(C)c1ccc(C(=O)CCCN2CCC(OC(c3ccccc3)c3ccccc3)CC2)cc1,1
18 | Mequitazine,c1ccc2c(c1)Sc1ccccc1N2CC1CN2CCC1CC2,1
19 | Eltrombopag,Cc1ccc(-n2[nH]c(C)c(N=Nc3cccc(-c4cccc(C(=O)O)c4)c3O)c2=O)cc1C,1
20 | Atazanavir,COC(=O)NC(C(=O)NC(Cc1ccccc1)C(O)CN(Cc1ccc(-c2ccccn2)cc1)NC(=O)C(NC(=O)OC)C(C)(C)C)C(C)(C)C,1
21 | Benztropine Mesylate,CN1C2CCC1CC(OC(c1ccccc1)c1ccccc1)C2,1
22 | Terconazole,CC(C)N1CCN(c2ccc(OCC3COC(Cn4cncn4)(c4ccc(Cl)cc4Cl)O3)cc2)CC1,1
23 | Oxprenolol hydrochloride,C=CCOc1ccccc1OCC(O)CNC(C)C,1
24 | Alprostadil,CCCCCC(O)C=CC1C(O)CC(=O)C1CCCCCCC(=O)O,1
25 | Dolutegravir,CC1CCOC2Cn3cc(C(=O)NCc4ccc(F)cc4F)c(=O)c(O)c3C(=O)N12,1
26 | Opipramol dihydrochloride,OCCN1CCN(CCCN2c3ccccc3C=Cc3ccccc32)CC1,1
27 | Arbidol,CCOC(=O)c1c(CSc2ccccc2)n(C)c2cc(Br)c(O)c(CN(C)C)c12,1
28 | Ritonavir,CC(C)c1nc(CN(C)C(=O)NC(C(=O)NC(Cc2ccccc2)CC(O)C(Cc2ccccc2)NC(=O)OCc2cncs2)C(C)C)cs1,1
29 | Nelfinavir,Cc1c(O)cccc1C(=O)NC(CSc1ccccc1)C(O)CN1CC2CCCCC2CC1C(=O)NC(C)(C)C,1
30 | Saquinavir,CC(C)(C)NC(=O)C1CC2CCCCC2CN1CC(O)C(Cc1ccccc1)NC(=O)C(CC(N)=O)NC(=O)c1ccc2ccccc2n1,1
31 | Tipranavir,CCCC1(CCc2ccccc2)CC(O)=C(C(CC)c2cccc(NS(=O)(=O)c3ccc(C(F)(F)F)cn3)c2)C(=O)O1,1
32 | Amprenavir,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(=O)c1ccc(N)cc1,1
33 | Darunavir,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S(=O)(=O)c1ccc(N)cc1,1
34 | Indinavir,CC(C)(C)NC(=O)C1CN(Cc2cccnc2)CCN1CC(O)CC(Cc1ccccc1)C(=O)NC1c2ccccc2CC1O,1
35 | Camostat,CN(C)C(=O)COC(=O)Cc1ccc(OC(=O)c2ccc(N=C(N)N)cc2)cc1,0
36 | Favipiravir,NC(=O)c1nc(F)c[nH]c1=O,0
37 | Acetyl spiramycin,COC1C(OC(C)=O)CC(=O)OC(C)CC=CC=CC(OC2CCC(N(C)C)C(C)O2)C(C)CC(CC=O)C1OC1OC(C)C(OC2CC(C)(O)C(O)C(C)O2)C(N(C)C)C1O,0
38 | Alfadolone,CC12CC(=O)C3C(CCC4CC(O)CCC43C)C1CCC2C(=O)CO,0
39 | Ambrisentan,COC(c1ccccc1)(c1ccccc1)C(Oc1nc(C)cc(C)n1)C(=O)O,0
40 | Artenimol,CC1CCC2C(C)C(O)OC3OC4(C)CCC1C32OO4,0
41 | Benoxinate hydrochloride,CCCCOc1cc(C(=O)OCCN(CC)CC)ccc1N,0
42 | Benzathine,c1ccc(CNCCNCc2ccccc2)cc1,0
43 | Bisbentiamine,CC(=C(CCOC(=O)c1ccccc1)SSC(CCOC(=O)c1ccccc1)=C(C)N(C=O)Cc1cnc(C)nc1N)N(C=O)Cc1cnc(C)nc1N,0
44 | Budralazine,CC(C)=CC(C)=NNc1nncc2ccccc12,0
45 | Candesartan,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1,0
46 | Cisatracurium besylate,COc1ccc(CC2c3cc(OC)c(OC)cc3CC[N+]2(C)CCC(=O)OCCCCCOC(=O)CC[N+]2(C)CCc3cc(OC)c(OC)cc3C2Cc2ccc(OC)c(OC)c2)cc1OC,0
47 | Cyclofenil,CC(=O)Oc1ccc(C(=C2CCCCC2)c2ccc(OC(C)=O)cc2)cc1,0
48 | Dipivefrin hydrochloride,CNCC(O)c1ccc(OC(=O)C(C)(C)C)c(OC(=O)C(C)(C)C)c1,0
49 | Eperisone HCl,CCc1ccc(C(=O)C(C)CN2CCCCC2)cc1,0
50 | Fenoldopam,Oc1ccc(C2CNCCc3c2cc(O)c(O)c3Cl)cc1,0
51 | Formoterol,COc1ccc(CC(C)NCC(O)c2ccc(O)c(NC=O)c2)cc1,0
52 | Fosinopril,CCC(=O)OC(OP(=O)(CCCCc1ccccc1)CC(=O)N1CC(C2CCCCC2)CC1C(=O)O)C(C)C,0
53 | Fursultiamine hydrochloride,CC(=C(CCO)SSCC1CCCO1)N(C=O)Cc1cnc(C)nc1N,0
54 | Ibudilast,CC(C)C(=O)c1c(C(C)C)nn2ccccc12,0
55 | Meprylcaine hydrochloride,CCCNC(C)(C)COC(=O)c1ccccc1,0
56 | Meptazinol,CCC1(c2cccc(O)c2)CCCCN(C)C1,0
57 | Mirtazapine,CN1CCN2c3ncccc3Cc3ccccc3C2C1,0
58 | Misoprostol,CCCCC(C)(O)CC=CC1C(O)CC(=O)C1CCCCCCC(=O)OC,0
59 | Nabumetone,COc1ccc2cc(CCC(C)=O)ccc2c1,0
60 | Nalmefene hydrochloride,C=C1CCC2(O)C3Cc4ccc(O)c5c4C2(CCN3CC2CC2)C1O5,0
61 | Olanzapine,Cc1cc2c(s1)Nc1ccccc1N=C2N1CCN(C)CC1,0
62 | Olmesartan,CCCc1nc(C(C)(C)O)c(C(=O)O)n1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1,0
63 | Ondansetron hydrochloride,Cc1nccn1CC1CCc2c(c3ccccc3n2C)C1=O,0
64 | Oxymetholone,CC12CC(=CO)C(=O)CC1CCC1C2CCC2(C)C1CCC2(C)O,0
65 | Oxyphenisatin,O=C1Nc2ccccc2C1(c1ccc(O)cc1)c1ccc(O)cc1,0
66 | Prednicarbate,CCOC(=O)OC1(C(=O)COC(=O)CC)CCC2C3CCC4=CC(=O)C=CC4(C)C3C(O)CC21C,0
67 | Sparfloxacin,CC1CN(c2c(F)c(N)c3c(=O)c(C(=O)O)cn(C4CC4)c3c2F)CC(C)N1,0
68 | Tolterodine tartrate,Cc1ccc(O)c(C(CCN(C(C)C)C(C)C)c2ccccc2)c1,0
69 | Vonoprazan,CNCc1cc(-c2ccccc2F)n(S(=O)(=O)c2cccnc2)c1,0
70 | Zoledronic acid hydrate,O=P(O)(O)C(O)(Cn1ccnc1)P(=O)(O)O,0
71 | Merbromin,O=C1OC2(c3cc(Br)c([O-])cc3Oc3c2cc(Br)c([O-])c3[Hg])c2ccccc21,0
72 | Methandrostenolone,CC12C=CC(=O)C=C1CCC1C2CCC2(C)C1CCC2(C)O,0
73 | 


--------------------------------------------------------------------------------
/screening/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | <h3 align="center">
 3 | <p> A deep learning-based process related to the screening of SARS-CoV2 3CL inhibitors.<br></h3>
 4 | 
 5 | 
 6 | ## process
 7 | 
 8 | ![图片](https://user-images.githubusercontent.com/62410732/176149139-b96f2edd-b66b-4007-a0f4-73259b319cb6.png)
 9 | 
10 | Coronavirus disease 2019 (COVID-19) is a highly infectious disease caused by severe acute respiratory syndrome coronavirus-2 (SARS-CoV-2). It is urgent to find potential antiviral drugs against SARS-CoV-2 in a short time. Deep learning-based virtual screening is one of the approaches that can rapidly search against large molecular libraries. Here, SARS-CoV-2 3C-like protease (SARS-CoV-2 3CLpro) was chosen as the target. As shown in Figure bellow, the utility of D-GCAN is evaluated by comparing the screening results on the GDB-13 and S-GDB13 databases. The process was carried out with the help of the transfer learning method (Wang et al., 2021), DeepPurpose (Huang et al., 2020), and ADMETLab2.0 (Xiong et al., 2021). 
11 | 
12 | These databases were firstly screened by using a transfer learning method (COVIDVS) proposed by Wang et al (Wang et al., 2021), which was reported for screening inhibitors against SARS-CoV-2. The model was trained on the dataset containing inhibitors against HCoV-OC43, SARS-CoV and MERS-CoV. All of these viruses as well as SARS-CoV-2 belong to β-coronaviruses. They have high consistency in essential functional proteins (Wu et al., 2020; Shen et al.; Pillaiyar et al., 2020). Then, the trained model was fine-tuned by the transfer learning approach with the dataset containing drugs against SARS-CoV-2. In this way, 107 million drug-like molecules were screened out. Then, drug-target interaction prediction (DTI) was carried out based on DeepPurpose (Huang et al., 2020), which provided pretrained model for the interaction prediction between drugs and SARS-CoV-2 3CLpro target. The interaction binding score was evaluated by the dissociation equilibrium constant (Kd). After this step, 17 thousand molecules with high affinity were obtained. Finally, ADMET properties were widely chosen and used for screening SARS-CoV-2 inhibitors (Gajjar et al., 2021; Roy et al., 2021; Dhameliya et al., 2022). These properties were calculated by using ADMETLab2.0 (Xiong et al., 2021), and 65 candidates with good properties were selected.
13 | 
14 | ## COVIDVS
15 | 
16 | COVIDVS models are Chemprop models trained with anti-beta-coronavirus actives/inactives collected from published papers and fine-tuned with anti-SARS-CoV-2 actives/inactives.
17 | 
18 | 
19 | 
20 | ## DeepPurpose
21 | 
22 | DeepPurpose has provied the pretrained model by predicting the interaction between a target (SARS-CoV2 3CL Protease) and a list of repurposing drugs from a curated drug library of 81 antiviral drugs. The Binding Score is the Kd values. Results aggregated from five pretrained model on BindingDB dataset.
23 | 
24 | 
25 | 
26 | ## AMETLab2.0
27 | 
28 | Undesirable pharmacokinetics and toxicity of candidate compounds are the main reasons for the                    failure of drug development, and it has been widely recognized that absorption, distribution,                    metabolism, excretion and toxicity (ADMET) of chemicals should be evaluated as early as possible.                    ADMETlab 2.0 is an enhanced version of the widely used [ADMETlab](http://admet.scbdd.com/) for systematical evaluation of ADMET properties, as well as some physicochemical properties and medicinal chemistry friendliness. With significant updates to functional modules, predictive models, explanations, and                    the user interface, ADMETlab 2.0 has greater capacity to assist medicinal chemists in accelerating                    the drug research and development process.                
29 | 
30 | 
31 | 
32 | ## Acknowledgement
33 | 
34 | Dhameliya,T.M. *et al.* (2022) Systematic virtual screening in search of SARS CoV-2 inhibitors against spike glycoprotein: pharmacophore screening, molecular docking, ADMET analysis and MD simulations. *Mol Divers*.
35 | 
36 | Gajjar,N.D. *et al.* (2021) In search of RdRp and Mpro inhibitors against SARS CoV-2: Molecular docking, molecular dynamic simulations and ADMET analysis. *Journal of Molecular Structure*, **1239**, 130488.
37 | 
38 | Huang,K. *et al.* (2020) DeepPurpose: a deep learning library for drug–target interaction prediction. *Bioinformatics*, **36**, 5545–5547.
39 | 
40 | Pillaiyar,T. *et al.* (2020) Recent discovery and development of inhibitors targeting coronaviruses. *Drug Discovery Today*, **25**, 668–688.
41 | 
42 | Roy,R. *et al.* (2021) Finding potent inhibitors against SARS-CoV-2 main protease through virtual screening, ADMET, and molecular dynamics simulation studies. *Journal of Biomolecular Structure and Dynamics*, **0**, 1–13.
43 | 
44 | Shen,L. *et al.* High-Throughput Screening and Identification of Potent Broad-Spectrum Inhibitors of Coronaviruses. *Journal of Virology*, **93**, e00023-19.
45 | 
46 | Wang,S. *et al.* (2021) A transferable deep learning approach to fast screen potential antiviral drugs against SARS-CoV-2. *Briefings in Bioinformatics*.
47 | 
48 | Wu,F. *et al.* (2020) A new coronavirus associated with human respiratory disease in China. *Nature*, **579**, 265–269.
49 | 
50 | Xiong,G. *et al.* (2021) ADMETlab 2.0: an integrated online platform for accurate and comprehensive predictions of ADMET properties. *Nucleic Acids Research*, **49**, W5–W14.
51 | 
52 | 
53 | 
54 | 


--------------------------------------------------------------------------------
/Discussion/preprocess.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | 
  3 | import numpy as np
  4 | 
  5 | from rdkit import Chem
  6 | 
  7 | import torch
  8 | atom_dict = defaultdict(lambda: len(atom_dict))
  9 | bond_dict = defaultdict(lambda: len(bond_dict))
 10 | fingerprint_dict = defaultdict(lambda: len(fingerprint_dict))
 11 | edge_dict = defaultdict(lambda: len(edge_dict))
 12 | radius=1
 13 | if torch.cuda.is_available():
 14 |         device = torch.device('cuda')
 15 |         print('The code uses a GPU!')
 16 | else:
 17 |         device = torch.device('cpu')
 18 |         print('The code uses a CPU...')
 19 | def create_atoms(mol, atom_dict):
 20 |     """Transform the atom types in a molecule (e.g., H, C, and O)
 21 |     into the indices (e.g., H=0, C=1, and O=2).
 22 |     Note that each atom index considers the aromaticity.
 23 |     """
 24 |     atoms = [a.GetSymbol() for a in mol.GetAtoms()]
 25 |     for a in mol.GetAromaticAtoms():
 26 |         i = a.GetIdx()
 27 |         atoms[i] = (atoms[i], 'aromatic')
 28 |     atoms = [atom_dict[a] for a in atoms]
 29 |     return np.array(atoms)
 30 | 
 31 | 
 32 | def create_ijbonddict(mol, bond_dict):
 33 |     """Create a dictionary, in which each key is a node ID
 34 |     and each value is the tuples of its neighboring node
 35 |     and chemical bond (e.g., single and double) IDs.
 36 | 
 37 |     """
 38 |     i_jbond_dict = defaultdict(lambda: [])
 39 |     for b in mol.GetBonds():
 40 |         i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
 41 |         bond = bond_dict[str(b.GetBondType())]
 42 |         i_jbond_dict[i].append((j, bond))
 43 |         i_jbond_dict[j].append((i, bond))
 44 |     return i_jbond_dict
 45 | 
 46 | 
 47 | def extract_fingerprints(radius, atoms, i_jbond_dict,
 48 |                          fingerprint_dict, edge_dict):
 49 |     """Extract the fingerprints from a molecular graph
 50 |     based on Weisfeiler-Lehman algorithm.
 51 | 
 52 |     """
 53 | 
 54 |     if (len(atoms) == 1) or (radius == 0):
 55 |         nodes = [fingerprint_dict[a] for a in atoms]
 56 | 
 57 |     else:
 58 |         nodes = atoms
 59 |         i_jedge_dict = i_jbond_dict
 60 | 
 61 |         for _ in range(radius):
 62 | 
 63 |             """Update each node ID considering its neighboring nodes and edges.
 64 |             The updated node IDs are the fingerprint IDs.。
 65 |             """
 66 |             nodes_ = []
 67 |             for i, j_edge in i_jedge_dict.items():
 68 |                 neighbors = [(nodes[j], edge) for j, edge in j_edge]
 69 |                 fingerprint = (nodes[i], tuple(sorted(neighbors)))
 70 |                 nodes_.append(fingerprint_dict[fingerprint])
 71 | 
 72 |             """Also update each edge ID considering
 73 |             its two nodes on both sides.
 74 |             """
 75 |             i_jedge_dict_ = defaultdict(lambda: [])
 76 |             for i, j_edge in i_jedge_dict.items():
 77 |                 for j, edge in j_edge:
 78 |                     both_side = tuple(sorted((nodes[i], nodes[j])))
 79 |                     edge = edge_dict[(both_side, edge)]
 80 |                     i_jedge_dict_[i].append((j, edge))
 81 | 
 82 |             nodes = nodes_
 83 |             i_jedge_dict = i_jedge_dict_
 84 | 
 85 |     return np.array(nodes)
 86 | 
 87 | 
 88 | def split_dataset(dataset, ratio):
 89 |     """Shuffle and split a dataset.洗牌和拆分数据集"""
 90 |     np.random.seed(1234)  # fix the seed for shuffle为洗牌修正种子.
 91 |     np.random.shuffle(dataset)
 92 |     n = int(ratio * len(dataset))
 93 |     return dataset[:n], dataset[n:]
 94 | 
 95 | 
 96 | def create_dataset(filename,path,dataname):
 97 |     dir_dataset = path+dataname
 98 |     print(filename)
 99 |     """Load a dataset."""
100 |     with open(dir_dataset + filename, 'r') as f:
101 |         smiles_property = f.readline().strip().split()
102 |         data_original = f.read().strip().split('\n')
103 | 
104 |         """Exclude the data contains '.' in its smiles.排除含.的数据"""
105 |     data_original = [data for data in data_original
106 |                         if '.' not in data.split()[0]]
107 |     dataset = []
108 |     for data in data_original:
109 | 
110 |             smiles, property = data.strip().split()
111 | 
112 |             """Create each data with the above defined functions."""
113 |             mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
114 |             atoms = create_atoms(mol, atom_dict)
115 |             molecular_size = len(atoms)
116 |             i_jbond_dict = create_ijbonddict(mol, bond_dict)
117 |             fingerprints = extract_fingerprints(radius, atoms, i_jbond_dict,
118 |                                                 fingerprint_dict, edge_dict)
119 |             adjacency = Chem.GetAdjacencyMatrix(mol)
120 | 
121 |             """Transform the above each data of numpy
122 |             to pytorch tensor on a device (i.e., CPU or GPU).
123 |             """
124 |             fingerprints = torch.LongTensor(fingerprints).to(device)
125 |             adjacency = torch.FloatTensor(adjacency).to(device)
126 |             property = torch.FloatTensor([int(property)]).to(device)
127 |           
128 |             dataset.append((smiles,fingerprints, adjacency, molecular_size, property))
129 | 
130 |     return dataset
131 | 
132 |     dataset_train = create_dataset('data_train.txt')
133 |     dataset_train, dataset_dev = split_dataset(dataset_train, 0.9)
134 |     dataset_test = create_dataset('data_test.txt')
135 | 
136 |     N_fingerprints = len(fingerprint_dict)
137 | 
138 |     return dataset_train, dataset_dev, dataset_test, N_fingerprints
139 | 


--------------------------------------------------------------------------------
/Test/test.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "65b363bc",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import predict"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 3,
 16 |    "id": "4cc4f418",
 17 |    "metadata": {},
 18 |    "outputs": [
 19 |     {
 20 |      "name": "stdout",
 21 |      "output_type": "stream",
 22 |      "text": [
 23 |       "The code uses a GPU!\n",
 24 |       "../dataset/data_test.txt\n",
 25 |       "bacc_dev: 0.5119539230602043\n",
 26 |       "pre_dev: 0.5080213903743316\n",
 27 |       "rec_dev: 0.8837209302325582\n",
 28 |       "f1_dev: 0.6451612903225807\n",
 29 |       "mcc_dev: 0.03575604067764825\n",
 30 |       "sp_dev: 0.14018691588785046\n",
 31 |       "q__dev: 0.5454545454545454\n",
 32 |       "acc_dev: 0.5128205128205128\n"
 33 |      ]
 34 |     }
 35 |    ],
 36 |    "source": [
 37 |     "test1 = predict.predict('../dataset/data_test.txt',property=True)#Drugs from FDA"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 2,
 43 |    "id": "8f0eba5e",
 44 |    "metadata": {},
 45 |    "outputs": [
 46 |     {
 47 |      "name": "stdout",
 48 |      "output_type": "stream",
 49 |      "text": [
 50 |       "The code uses a GPU!\n",
 51 |       "../dataset/world_wide.txt\n",
 52 |       "bacc_dev: 0.46604215456674475\n",
 53 |       "pre_dev: 0.47987043035631655\n",
 54 |       "rec_dev: 0.8095238095238095\n",
 55 |       "f1_dev: 0.6025566531086578\n",
 56 |       "mcc_dev: -0.09345868862125822\n",
 57 |       "sp_dev: 0.12256049960967993\n",
 58 |       "q__dev: 0.3915211970074813\n",
 59 |       "acc_dev: 0.46604215456674475\n"
 60 |      ]
 61 |     }
 62 |    ],
 63 |    "source": [
 64 |     "test2 = predict.predict('../dataset/world_wide.txt',property=True)#Drugs from non-US"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 4,
 70 |    "id": "75c3a192",
 71 |    "metadata": {},
 72 |    "outputs": [
 73 |     {
 74 |      "name": "stdout",
 75 |      "output_type": "stream",
 76 |      "text": [
 77 |       "The code uses a GPU!\n",
 78 |       "../dataset/beyondRo5.txt\n",
 79 |       "1\n",
 80 |       "1\n",
 81 |       "1\n",
 82 |       "1\n",
 83 |       "1\n",
 84 |       "1\n",
 85 |       "1\n",
 86 |       "1\n",
 87 |       "1\n",
 88 |       "1\n",
 89 |       "1\n",
 90 |       "1\n",
 91 |       "1\n",
 92 |       "1\n",
 93 |       "1\n",
 94 |       "1\n",
 95 |       "1\n",
 96 |       "1\n",
 97 |       "1\n",
 98 |       "1\n",
 99 |       "1\n",
100 |       "1\n",
101 |       "1\n",
102 |       "1\n",
103 |       "1\n",
104 |       "1\n",
105 |       "1\n",
106 |       "1\n",
107 |       "1\n",
108 |       "1\n",
109 |       "1\n",
110 |       "1\n",
111 |       "1\n",
112 |       "1\n",
113 |       "1\n",
114 |       "1\n",
115 |       "1\n",
116 |       "1\n",
117 |       "1\n",
118 |       "1\n",
119 |       "1\n",
120 |       "1\n",
121 |       "1\n",
122 |       "1\n",
123 |       "1\n",
124 |       "1\n",
125 |       "1\n",
126 |       "1\n",
127 |       "1\n",
128 |       "1\n",
129 |       "1\n",
130 |       "1\n",
131 |       "1\n",
132 |       "1\n",
133 |       "1\n",
134 |       "1\n",
135 |       "1\n",
136 |       "1\n",
137 |       "1\n",
138 |       "1\n",
139 |       "1\n",
140 |       "1\n",
141 |       "1\n",
142 |       "1\n",
143 |       "1\n",
144 |       "1\n",
145 |       "1\n",
146 |       "1\n",
147 |       "1\n",
148 |       "1\n",
149 |       "1\n",
150 |       "1\n",
151 |       "1\n",
152 |       "1\n",
153 |       "1\n",
154 |       "1\n",
155 |       "1\n",
156 |       "1\n",
157 |       "1\n",
158 |       "1\n",
159 |       "1\n",
160 |       "1\n",
161 |       "1\n",
162 |       "1\n",
163 |       "1\n",
164 |       "1\n",
165 |       "1\n",
166 |       "1\n",
167 |       "1\n",
168 |       "1\n",
169 |       "1\n",
170 |       "1\n",
171 |       "1\n",
172 |       "1\n",
173 |       "1\n",
174 |       "1\n",
175 |       "1\n",
176 |       "1\n",
177 |       "1\n",
178 |       "1\n",
179 |       "1\n",
180 |       "1\n",
181 |       "1\n",
182 |       "1\n",
183 |       "1\n",
184 |       "1\n",
185 |       "1\n",
186 |       "1\n",
187 |       "1\n",
188 |       "1\n",
189 |       "1\n",
190 |       "1\n",
191 |       "1\n",
192 |       "1\n",
193 |       "1\n",
194 |       "1\n",
195 |       "1\n",
196 |       "1\n",
197 |       "1\n",
198 |       "1\n",
199 |       "1\n",
200 |       "1\n",
201 |       "1\n",
202 |       "1\n",
203 |       "1\n",
204 |       "1\n",
205 |       "1\n",
206 |       "1\n",
207 |       "1\n",
208 |       "1\n",
209 |       "1\n",
210 |       "1\n",
211 |       "1\n",
212 |       "1\n",
213 |       "1\n",
214 |       "1\n"
215 |      ]
216 |     }
217 |    ],
218 |    "source": [
219 |     "test3 = predict.predict('../dataset/beyondRo5.txt',property=False)#Drugs beyond Ro5"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": null,
225 |    "id": "147a96e5",
226 |    "metadata": {},
227 |    "outputs": [],
228 |    "source": []
229 |   }
230 |  ],
231 |  "metadata": {
232 |   "kernelspec": {
233 |    "display_name": "Python 3",
234 |    "language": "python",
235 |    "name": "python3"
236 |   },
237 |   "language_info": {
238 |    "codemirror_mode": {
239 |     "name": "ipython",
240 |     "version": 3
241 |    },
242 |    "file_extension": ".py",
243 |    "mimetype": "text/x-python",
244 |    "name": "python",
245 |    "nbconvert_exporter": "python",
246 |    "pygments_lexer": "ipython3",
247 |    "version": "3.8.8"
248 |   }
249 |  },
250 |  "nbformat": 4,
251 |  "nbformat_minor": 5
252 | }
253 | 


--------------------------------------------------------------------------------
/DGCAN/train.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Wed Apr 27 20:09:31 2022
  4 | 
  5 | @author:Jinyu-Sun
  6 | """
  7 | 
  8 | import timeit
  9 | import sys
 10 | import numpy as np
 11 | import math
 12 | import torch
 13 | import torch.nn as nn
 14 | import torch.nn.functional as F
 15 | import torch.optim as optim
 16 | import pickle
 17 | from sklearn.metrics import roc_auc_score, roc_curve,auc
 18 | from sklearn.metrics import confusion_matrix
 19 | import preprocess as pp
 20 | import pandas as pd
 21 | import matplotlib.pyplot as plt
 22 | from DGCAN import MolecularGraphNeuralNetwork,Trainer,Tester
 23 | 
 24 | def metrics(cnf_matrix):
 25 |     '''Evaluation Metrics'''
 26 |     tn = cnf_matrix[0, 0]
 27 |     tp = cnf_matrix[1, 1]
 28 |     fn = cnf_matrix[1, 0]
 29 |     fp = cnf_matrix[0, 1]
 30 | 
 31 |     bacc = ((tp / (tp + fn)) + (tn / (tn + fp))) / 2  # balance accurance
 32 |     pre = tp / (tp + fp)  # precision/q+
 33 |     rec = tp / (tp + fn)  # recall/se
 34 |     sp = tn / (tn + fp)
 35 |     q_ = tn / (tn + fn)
 36 |     f1 = 2 * pre * rec / (pre + rec)  # f1score
 37 |     mcc = ((tp * tn) - (fp * fn)) / math.sqrt(
 38 |         (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))  # Matthews correlation coefficient
 39 |     acc = (tp + tn) / (tp + fp + fn + tn)  # accurancy
 40 | 
 41 |     print('bacc:', bacc)
 42 |     print('pre:', pre)
 43 |     print('rec:', rec)
 44 |     print('f1:', f1)
 45 |     print('mcc:', mcc)
 46 |     print('sp:', sp)
 47 |     print('q_:', q_)
 48 |     print('acc:', acc)
 49 |     
 50 |     
 51 | def train (test_name, radius, dim, layer_hidden, layer_output, dropout, batch_train,
 52 |     batch_test, lr, lr_decay, decay_interval, iteration, N , dataset_train):
 53 |     '''
 54 |     
 55 |     Parameters
 56 |     ----------
 57 |     data_test='../dataset/data_test.txt', #test set   
 58 |     radius = 1,        #hops of radius subgraph: 1, 2 
 59 |     dim = 64,          #dimension of graph convolution layers
 60 |     layer_hidden = 4,  #Number of graph convolution layers
 61 |     layer_output = 10, #Number of dense layers
 62 |     dropout = 0.45,    #drop out rate :0-1
 63 |     batch_train = 8,   # batch of training set
 64 |     batch_test = 8,    #batch of test set
 65 |     lr =3e-4,          #learning rate: 1e-5,1e-4,3e-4, 5e-4, 1e-3, 3e-3,5e-3
 66 |     lr_decay = 0.85,   #Learning rate decay:0.5, 0.75, 0.85, 0.9
 67 |     decay_interval = 25,#Number of iterations for learning rate decay:10,25,30,50
 68 |     iteration = 140,    #Number of iterations 
 69 |     N = 5000,           #length of embedding: 2000,3000,5000,7000 
 70 |     dataset_train='../dataset/data_train.txt') #training set
 71 | 
 72 |     Returns
 73 |     -------
 74 |     res_test : results
 75 |         Predicting results.
 76 | 
 77 |     '''
 78 |     dataset_test = test_name
 79 |     (radius, dim, layer_hidden, layer_output,
 80 |      batch_train, batch_test, decay_interval,
 81 |      iteration, dropout) = map(int, [radius, dim, layer_hidden, layer_output,
 82 |                                      batch_train, batch_test,
 83 |                                      decay_interval, iteration, dropout])                
 84 |     lr, lr_decay = map(float, [lr, lr_decay])
 85 |     if torch.cuda.is_available():
 86 |         device = torch.device('cuda')
 87 |         print('The code uses a GPU!')
 88 |     else:
 89 |         device = torch.device('cpu')
 90 |         print('The code uses a CPU...')
 91 | 
 92 |     lr, lr_decay = map(float, [lr, lr_decay])
 93 | 
 94 |     print('-' * 100)
 95 |     print('Just a moment......')
 96 |     print('-' * 100)
 97 |     path = ''
 98 |     dataname = ''
 99 |     
100 |     dataset_train=   pp.create_dataset(dataset_train,path,dataname)
101 |     #dataset_train,dataset_test = pp.split_dataset(dataset_train,0.9)
102 |     #dataset_test=   pp.create_dataset(dataset_dev,path,dataname)    
103 |     dataset_test= pp.create_dataset(dataset_test,path,dataname)
104 |     np.random.seed(0)
105 |     np.random.shuffle(dataset_train)
106 |     print('The preprocess has finished!')
107 |     print('-' * 100)
108 | 
109 |     print('Creating a model.')
110 |     torch.manual_seed(0)
111 |     model = MolecularGraphNeuralNetwork(
112 |         N, dim, layer_hidden, layer_output, dropout).to(device)
113 |     trainer = Trainer(model,lr,batch_train)
114 |     tester = Tester(model,batch_test)
115 |     print('# of model parameters:',
116 |           sum([np.prod(p.size()) for p in model.parameters()]))
117 |     print('-' * 100)
118 |     file_result = path + '../DGCAN/results/AUC' + '.txt'
119 |     #    file_result = '../output/result--' + setting + '.txt'
120 |     result = 'Epoch\tTime(sec)\tLoss_train\tLoss_test\tAUC_train\tAUC_test'
121 |     file_test_result = path + 'test_prediction' + '.txt'
122 |     file_predictions = path + 'train_prediction' + '.txt'
123 |     file_model = '../DGCAN/model/model' + '.pth'
124 |     with open(file_result, 'w') as f:
125 |         f.write(result + '\n')
126 | 
127 |     print('Start training.')
128 |     print('The result is saved in the output directory every epoch!')
129 | 
130 |     np.random.seed(0)
131 | 
132 |     start = timeit.default_timer()
133 | 
134 |     for epoch in range(iteration):
135 |         epoch += 1
136 |         if epoch % decay_interval == 0:
137 |             trainer.optimizer.param_groups[0]['lr'] *= lr_decay
138 |         # [‘amsgrad’, ‘params’, ‘lr’, ‘betas’, ‘weight_decay’, ‘eps’]
139 |         prediction_train, loss_train, train_res = trainer.train(dataset_train)
140 |         prediction_test, loss_test, test_res = tester.test_classifier(dataset_test)
141 | 
142 |         time = timeit.default_timer() - start
143 | 
144 |         if epoch == 1:
145 |             minutes = time * iteration / 60
146 |             hours = int(minutes / 60)
147 |             minutes = int(minutes - 60 * hours)
148 |             print('The training will finish in about',
149 |                   hours, 'hours', minutes, 'minutes.')
150 |             print('-' * 100)
151 |             print(result)
152 | 
153 |         result = '\t'.join(map(str, [epoch, time, loss_train, loss_test, prediction_train, prediction_test]))
154 |         tester.save_result(result, file_result)
155 |         tester.save_model(model, file_model)
156 |         print(result)
157 |     model.eval()
158 |     prediction_test, loss_test, test_res = tester.test_classifier(dataset_test)
159 |     res_test = test_res.T
160 | 
161 |     cnf_matrix = confusion_matrix(res_test[:, 0], res_test[:, 1])
162 |     fpr, tpr, thresholds = roc_curve(res_test[:, 0], res_test[:, 1])
163 |     AUC = auc(fpr, tpr)
164 |     print('auc:',AUC)
165 |     metrics(cnf_matrix)
166 |     return res_test


--------------------------------------------------------------------------------
/screening/DTI.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "c40219cd",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "#  Drug Target Interaction Prediction by using DeepPurpose"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "23b43a6f",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "DeepPurpose has provied the convinient way for DTI prediction especially for SARS_CoV2_Protease. "
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "id": "38ab44af",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "## Installation"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "id": "1ffbb4b3",
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "pip\n",
 35 |     "conda create -n DeepPurpose python=3.6\n",
 36 |     "conda activate DeepPurpose\n",
 37 |     "conda install -c conda-forge rdkit\n",
 38 |     "conda install -c conda-forge notebook\n",
 39 |     "pip install git+https://github.com/bp-kelley/descriptastorus \n",
 40 |     "pip install DeepPurpose\n",
 41 |     "\n",
 42 |     "or  Build from Source\n",
 43 |     "\n",
 44 |     "git clone https://github.com/kexinhuang12345/DeepPurpose.git ## Download code repository\n",
 45 |     "cd DeepPurpose ## Change directory to DeepPurpose\n",
 46 |     "conda env create -f environment.yml  ## Build virtual environment with all packages installed using conda\n",
 47 |     "conda activate DeepPurpose ## Activate conda environment (use \"source activate DeepPurpose\" for anaconda 4.4 or earlier) \n",
 48 |     "jupyter notebook ## open the jupyter notebook with the conda env\n",
 49 |     "\n",
 50 |     "## run our code, e.g. click a file in the DEMO folder\n",
 51 |     "... ...\n",
 52 |     "\n",
 53 |     "conda deactivate ## when done, exit conda environment "
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "markdown",
 58 |    "id": "26e590fe",
 59 |    "metadata": {},
 60 |    "source": [
 61 |     "## Run"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "id": "55207f2c",
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "import os\n",
 72 |     "os.chdir('../')\n",
 73 |     "from DeepPurpose import utils\n",
 74 |     "from DeepPurpose import DTI as models\n",
 75 |     "X_drug, X_target, y  = process_BindingDB(download_BindingDB(SAVE_PATH),\n",
 76 |     "                                        y = 'Kd', \n",
 77 |     "                                        binary = False, \n",
 78 |     "                                        convert_to_log = True)\n",
 79 |     "\n",
 80 |     "# Type in the encoding names for drug/protein.\n",
 81 |     "drug_encoding, target_encoding = 'MPNN', 'CNN'\n",
 82 |     "\n",
 83 |     "# Data processing, here we select cold protein split setup.\n",
 84 |     "train, val, test = data_process(X_drug, X_target, y, \n",
 85 |     "                                drug_encoding, target_encoding, \n",
 86 |     "                                split_method='cold_protein', \n",
 87 |     "                                frac=[0.7,0.1,0.2])\n",
 88 |     "\n",
 89 |     "# Generate new model using default parameters; also allow model tuning via input parameters.\n",
 90 |     "config = generate_config(drug_encoding, target_encoding, transformer_n_layer_target = 8)\n",
 91 |     "net = models.model_initialize(**config)\n",
 92 |     "\n",
 93 |     "# Train the new model.\n",
 94 |     "# Detailed output including a tidy table storing validation loss, metrics, AUC curves figures and etc. are stored in the ./result folder.\n",
 95 |     "net.train(train, val, test)\n",
 96 |     "\n",
 97 |     "# or simply load pretrained model from a model directory path or reproduced model name such as DeepDTA\n",
 98 |     "net = models.model_pretrained(MODEL_PATH_DIR or MODEL_NAME)\n",
 99 |     "\n",
100 |     "X_repurpose, drug_name, drug_cid = load_broad_repurposing_hub(SAVE_PATH)\n",
101 |     "target, target_name = load_SARS_CoV2_Protease_3CL()\n",
102 |     "\n",
103 |     "_ = models.virtual_screening(smiles, target, net, drug_name, target_name)"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "markdown",
108 |    "id": "bab0c86d",
109 |    "metadata": {},
110 |    "source": [
111 |     "## Results"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "id": "1136255f",
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": [
121 |     "+-------+-----------+------------------------+---------------+\n",
122 |     "|  Rank | Drug Name |      Target Name       | Binding Score |\n",
123 |     "+-------+-----------+------------------------+---------------+\n",
124 |     "|   1   | Drug 4565 | SARS-CoV2 3CL Protease |      8.96     |\n",
125 |     "|   2   | Drug 4570 | SARS-CoV2 3CL Protease |     12.42     |\n",
126 |     "|   3   | Drug 3690 | SARS-CoV2 3CL Protease |     12.86     |\n",
127 |     "|   4   | Drug 3068 | SARS-CoV2 3CL Protease |     13.36     |\n",
128 |     "|   5   | Drug 8387 | SARS-CoV2 3CL Protease |     13.47     |\n",
129 |     "|   6   | Drug 5176 | SARS-CoV2 3CL Protease |     14.47     |\n",
130 |     "|   7   |  Drug 438 | SARS-CoV2 3CL Protease |     14.67     |\n",
131 |     "|   8   | Drug 4507 | SARS-CoV2 3CL Protease |     16.11     |\n",
132 |     "```\n",
133 |     "```\n",
134 |     "|  9978 | Drug 1377 | SARS-CoV2 3CL Protease |   460788.11   |\n",
135 |     "|  9979 | Drug 3768 | SARS-CoV2 3CL Protease |   479737.13   |\n",
136 |     "|  9980 | Drug 5106 | SARS-CoV2 3CL Protease |   485684.14   |\n",
137 |     "|  9981 | Drug 3765 | SARS-CoV2 3CL Protease |   505994.35   |\n",
138 |     "|  9982 | Drug 2207 | SARS-CoV2 3CL Protease |   510293.39   |\n",
139 |     "|  9983 | Drug 1161 | SARS-CoV2 3CL Protease |   525921.93   |\n",
140 |     "|  9984 | Drug 2477 | SARS-CoV2 3CL Protease |   533613.12   |\n",
141 |     "|  9985 | Drug 3320 | SARS-CoV2 3CL Protease |   538902.46   |\n",
142 |     "|  9986 | Drug 3783 | SARS-CoV2 3CL Protease |   542639.17   |\n",
143 |     "|  9987 | Drug 4834 | SARS-CoV2 3CL Protease |   603510.00   |\n",
144 |     "|  9988 | Drug 9653 | SARS-CoV2 3CL Protease |   611796.89   |\n",
145 |     "|  9989 | Drug 6606 | SARS-CoV2 3CL Protease |   671138.31   |\n",
146 |     "|  9990 |  Drug 160 | SARS-CoV2 3CL Protease |   697775.04   |\n",
147 |     "|  9991 | Drug 3851 | SARS-CoV2 3CL Protease |   792134.96   |\n",
148 |     "|  9992 | Drug 5208 | SARS-CoV2 3CL Protease |   832708.75   |\n",
149 |     "|  9993 | Drug 2786 | SARS-CoV2 3CL Protease |   905739.10   |\n",
150 |     "|  9994 | Drug 6612 | SARS-CoV2 3CL Protease |   968825.66   |\n",
151 |     "|  9995 | Drug 6609 | SARS-CoV2 3CL Protease |   1088788.87  |\n",
152 |     "|  9996 |  Drug 801 | SARS-CoV2 3CL Protease |   1186364.21  |\n",
153 |     "|  9997 | Drug 3844 | SARS-CoV2 3CL Protease |   1199274.11  |\n",
154 |     "|  9998 | Drug 3842 | SARS-CoV2 3CL Protease |   1559694.06  |\n",
155 |     "|  9999 | Drug 4486 | SARS-CoV2 3CL Protease |   1619297.87  |\n",
156 |     "| 10000 |  Drug 800 | SARS-CoV2 3CL Protease |   1623061.65  |\n",
157 |     "+-------+-----------+------------------------+---------------+"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "markdown",
162 |    "id": "83ff4364",
163 |    "metadata": {},
164 |    "source": [
165 |     "## Acknowledgement"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "id": "ab0dd49f",
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "This project incorporates code from the following repo:\n",
176 |     "     \n",
177 |     "    https://github.com/kexinhuang12345/DeepPurpose\n",
178 |     "    "
179 |    ]
180 |   }
181 |  ],
182 |  "metadata": {
183 |   "kernelspec": {
184 |    "display_name": "Python 3 (ipykernel)",
185 |    "language": "python",
186 |    "name": "python3"
187 |   },
188 |   "language_info": {
189 |    "codemirror_mode": {
190 |     "name": "ipython",
191 |     "version": 3
192 |    },
193 |    "file_extension": ".py",
194 |    "mimetype": "text/x-python",
195 |    "name": "python",
196 |    "nbconvert_exporter": "python",
197 |    "pygments_lexer": "ipython3",
198 |    "version": "3.8.8"
199 |   }
200 |  },
201 |  "nbformat": 4,
202 |  "nbformat_minor": 5
203 | }
204 | 


--------------------------------------------------------------------------------
/screening/Dataset/finetunev1.csv:
--------------------------------------------------------------------------------
  1 | smiles,new_label
  2 | CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12,1
  3 | Cc1cccc(C)c1OCC(=O)NC(Cc1ccccc1)C(O)CC(Cc1ccccc1)NC(=O)C(C(C)C)N1CCCNC1=O,1
  4 | O=C(Nc1ccc([N+](=O)[O-])cc1Cl)c1cc(Cl)ccc1O,1
  5 | CN(C)C(=O)C(CCN1CCC(O)(c2ccc(Cl)cc2)CC1)(c1ccccc1)c1ccccc1,1
  6 | CC1OC(OC2CC(O)C3(CO)C4C(O)CC5(C)C(C6=CC(=O)OC6)CCC5(O)C4CCC3(O)C2)C(O)C(O)C1O,1
  7 | CCN(CC)CCOc1ccc(C(O)(Cc2ccc(Cl)cc2)c2ccc(C)cc2)cc1,1
  8 | COc1ccc2cc1Oc1ccc(cc1)CC1c3cc(c(OC)cc3CCN1C)Oc1c(OC)c(OC)cc3c1C(C2)N(C)CC3,1
  9 | CCN(CC)Cc1cc(Nc2ccnc3cc(Cl)ccc23)ccc1O,1
 10 | O=C1NCN(c2ccccc2)C12CCN(CCCC(c1ccc(F)cc1)c1ccc(F)cc1)CC2,1
 11 | Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc(-c2cccnc2)n1,1
 12 | CCN(CC)CCOc1ccc2c(c1)C(=O)c1cc(OCCN(CC)CC)ccc1-2,1
 13 | COc1cc2c3cc1Oc1c(OC)c(OC)cc4c1C(Cc1ccc(O)c(c1)Oc1ccc(cc1)CC3N(C)CC2)N(C)CC4,1
 14 | OC(c1cc(C(F)(F)F)nc2c(C(F)(F)F)cccc12)C1CCCCN1,1
 15 | OCCN1CCN(CCCN2c3ccccc3Sc3ccc(C(F)(F)F)cc32)CC1,1
 16 | O=C(Nc1cc(Cl)cc(Cl)c1O)c1c(O)c(Cl)cc(Cl)c1Cl,1
 17 | CCN(CCO)CCCC(C)Nc1ccnc2cc(Cl)ccc12,1
 18 | CC(CN1c2ccccc2Sc2ccccc21)N(C)C,1
 19 | CCSc1ccc2c(c1)N(CCCN1CCN(C)CC1)c1ccccc1S2,1
 20 | CC1OC(OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C(CCC6C5CC(O)C5(C)C(C7=CC(=O)OC7)CCC65O)C4)OC3C)OC2C)CC(O)C1O,1
 21 | CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1
 22 | CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21,1
 23 | CCN(CC)CCOc1ccc(C(=C(Cl)c2ccccc2)c2ccccc2)cc1,1
 24 | CCC1OC(=O)C(C)C(OC2CC(C)(OC)C(O)C(C)O2)C(C)C(OC2OC(C)CC(N(C)C)C2O)C(C)(O)CC(C)CN(C)C(C)C(O)C1(C)O,1
 25 | COc1ncnc(NS(=O)(=O)c2ccc(N)cc2)c1OC,1
 26 | COc1ccc2nc(S(=O)Cc3ncc(C)c(OC)c3C)[nH]c2c1,1
 27 | CN(C)CCOc1ccc(C(=C(CCCl)c2ccccc2)c2ccccc2)cc1,1
 28 | CSc1ccc2c(c1)N(CCC1CCCCN1C)c1ccccc1S2,1
 29 | CC=CCC(C)C(O)C1C(=O)NC(CC)C(=O)N(C)CC(=O)N(C)C(CC(C)C)C(=O)NC(C(C)C)C(=O)N(C)C(CC(C)C)C(=O)NC(C)C(=O)NC(C)C(=O)N(C)C(CC(C)C)C(=O)N(C)C(CC(C)C)C(=O)N(C)C(C(C)C)C(=O)N1C,1
 30 | CC(C)=CCCC1(C)C=Cc2c(O)c3c(c(CC=C(C)C)c2O1)OC12C(=CC4CC1C(C)(C)OC2(CC=C(C)C(=O)O)C4O)C3=O,1
 31 | C=CC1CN2CCC1CC2C(O)c1ccnc2ccc(OC)cc12,1
 32 | CCCCCC(=O)OC1(C(C)=O)CCC2C3CCC4=CC(=O)CCC4(C)C3CCC21C,1
 33 | COc1ccc2cc1Oc1ccc(cc1)CC1c3c(cc4c(c3Oc3cc5c(cc3OC)CCN(C)C5C2)OCO4)CCN1C,1
 34 | CCC(=C(c1ccccc1)c1ccc(OCCN(C)C)cc1)c1ccccc1,1
 35 | OC1(c2ccc(Cl)c(C(F)(F)F)c2)CCN(CCCC(c2ccc(F)cc2)c2ccc(F)cc2)CC1,1
 36 | Clc1ccc(Cn2c(CN3CCCC3)nc3ccccc32)cc1,1
 37 | C1CCC(C(CC2CCCCN2)C2CCCCC2)CC1,1
 38 | Oc1c(Cl)cc(Cl)c(Cl)c1Cc1c(O)c(Cl)cc(Cl)c1Cl,1
 39 | CCC(C(=O)O)C1CCC(C)C(C(C)C(O)C(C)C(=O)C(CC)C2OC3(C=CC(O)C4(CCC(C)(C5CCC(O)(CC)C(C)O5)O4)O3)C(C)CC2C)O1,1
 40 | Cc1cc(Nc2ncc(Cl)c(Nc3ccccc3S(=O)(=O)C(C)C)n2)c(OC(C)C)cc1C1CCNCC1,1
 41 | CC1(C)C=Cc2c(c3c(c4c(=O)c(-c5ccc(O)cc5)coc24)OC(C)(C)CC3)O1,1
 42 | C=CC(=O)Nc1cc(Nc2nccc(-c3cn(C)c4ccccc34)n2)c(OC)cc1N(C)CCN(C)C,1
 43 | Cc1c(-c2ccc(O)cc2)n(Cc2ccc(OCCN3CCCCCC3)cc2)c2ccc(O)cc12,1
 44 | CCCCCCOC(C)c1cccc(-c2csc(NC(=O)c3cc(Cl)c(C=C(C)C(=O)O)c(Cl)c3)n2)c1OC,1
 45 | CC(C)=CCc1c2c(c3occ(-c4ccc(O)cc4)c(=O)c3c1O)C=CC(C)(C)O2,1
 46 | CCCCc1oc2ccc(NS(C)(=O)=O)cc2c1C(=O)c1ccc(OCCCN(CCCC)CCCC)cc1,1
 47 | CC(C)C(=O)OCC(=O)C12OC(C3CCCCC3)OC1CC1C3CCC4=CC(=O)C=CC4(C)C3C(O)CC12C,1
 48 | CC1(C)C=Cc2c(c3c(c4c(=O)c(-c5ccc(O)c(O)c5)coc24)OC(C)(C)CC3)O1,1
 49 | CCCCCOc1ccc(-c2ccc(-c3ccc(C(=O)NC4CC(O)C(O)NC(=O)C5C(O)C(C)CN5C(=O)C(C(C)O)NC(=O)C(C(O)C(O)c5ccc(O)cc5)NC(=O)C5CC(O)CN5C(=O)C(C(C)O)NC4=O)cc3)cc2)cc1,1
 50 | CC(C)(C)c1cc(C(C)(C)C)c(NC(=O)c2c[nH]c3ccccc3c2=O)cc1O,1
 51 | CCC(=C(c1ccc(OCCN(C)C)cc1)c1cccc(O)c1)c1ccccc1,1
 52 | CCN1CCN(Cc2ccc(Nc3ncc(F)c(-c4cc(F)c5nc(C)n(C(C)C)c5c4)n3)nc2)CC1,1
 53 | CCc1nc(C(N)=O)c(Nc2ccc(N3CCC(N4CCN(C)CC4)CC3)c(OC)c2)nc1NC1CCOCC1,1
 54 | CC(C)(C)c1ccc(C(=O)CCCN2CCC(OC(c3ccccc3)c3ccccc3)CC2)cc1,1
 55 | c1ccc2c(c1)Sc1ccccc1N2CC1CN2CCC1CC2,1
 56 | CC1=NN(c2ccc(C)c(C)c2)C(=O)C1=NNc1cccc(-c2cccc(C(=O)O)c2)c1O,1
 57 | COC(=O)NC(C(=O)NC(Cc1ccccc1)C(O)CN(Cc1ccc(-c2ccccn2)cc1)NC(=O)C(NC(=O)OC)C(C)(C)C)C(C)(C)C,1
 58 | CN1C2CCC1CC(OC(c1ccccc1)c1ccccc1)C2,1
 59 | CC(C)N1CCN(c2ccc(OCC3COC(Cn4cncn4)(c4ccc(Cl)cc4Cl)O3)cc2)CC1,1
 60 | C=CCOc1ccccc1OCC(O)CNC(C)C,1
 61 | CCCCCC(O)C=CC1C(O)CC(=O)C1CCCCCCC(=O)O,1
 62 | CC1CCOC2Cn3cc(C(=O)NCc4ccc(F)cc4F)c(=O)c(O)c3C(=O)N12,1
 63 | OCCN1CCN(CCCN2c3ccccc3C=Cc3ccccc32)CC1,1
 64 | CCOC(=O)c1c(CSc2ccccc2)n(C)c2cc(Br)c(O)c(CN(C)C)c12,1
 65 | CC(C)c1nc(CN(C)C(=O)NC(C(=O)NC(Cc2ccccc2)CC(O)C(Cc2ccccc2)NC(=O)OCc2cncs2)C(C)C)cs1,1
 66 | Cc1c(O)cccc1C(=O)NC(CSc1ccccc1)C(O)CN1CC2CCCCC2CC1C(=O)NC(C)(C)C,1
 67 | CC(C)(C)NC(=O)C1CC2CCCCC2CN1CC(O)C(Cc1ccccc1)NC(=O)C(CC(N)=O)NC(=O)c1ccc2ccccc2n1,1
 68 | CCCC1(CCc2ccccc2)CC(O)=C(C(CC)c2cccc(NS(=O)(=O)c3ccc(C(F)(F)F)cn3)c2)C(=O)O1,1
 69 | CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(=O)c1ccc(N)cc1,1
 70 | CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S(=O)(=O)c1ccc(N)cc1,1
 71 | CC(C)(C)NC(=O)C1CN(Cc2cccnc2)CCN1CC(O)CC(Cc1ccccc1)C(=O)NC1c2ccccc2CC1O,1
 72 | Nc1ccn(C2OC(CO)C(O)C2(F)F)c(=O)n1,0
 73 | CCC1CN2CCc3cc(OC)c(OC)cc3C2CC1CC1NCCc2cc(OC)c(OC)cc21,0
 74 | CN(C)CCCN1c2ccccc2Sc2ccccc21,0
 75 | CC(=O)OC1CC(OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C(CCC6C5CC(O)C5(C)C(C7=CC(=O)OC7)CCC65O)C4)OC3C)OC2C)OC(C)C1OC1OC(CO)C(O)C(O)C1O,0
 76 | COc1ccc(CC2NCC(O)C2OC(C)=O)cc1,0
 77 | Nc1ccc(N=Nc2ccccc2)c(N)n1,0
 78 | CC(C)NCC(O)COc1cccc2ccccc12,0
 79 | CN(C)CCCSC(=N)N,0
 80 | CN1CCCC1Cc1c[nH]c2ccc(CCS(=O)(=O)c3ccccc3)cc12,0
 81 | NC1CONC1=O,0
 82 | Nc1c2c(nc3ccccc13)CCCC2,0
 83 | CC(=O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C,0
 84 | O=C1CC2(CCCC2)CC(=O)N1CCCCN1CCN(c2ncccn2)CC1,0
 85 | CCN(CC)CCCC(C)Nc1c2ccc(Cl)cc2nc2ccc(OC)cc12,0
 86 | CCCCOc1ccc(C(=O)CCN2CCCCC2)cc1,0
 87 | O=C(CCCN1CCC2(CC1)C(=O)NCN2c1ccccc1)c1ccc(F)cc1,0
 88 | CC(C(O)c1ccc(O)cc1)N1CCC(Cc2ccccc2)CC1,0
 89 | Cn1nnnc1SCC1=C(C(=O)[O-])N2C(=O)C(NC(=O)C(O)c3ccccc3)C2SC1,0
 90 | CCOc1ccc2nc(S(N)(=O)=O)sc2c1,0
 91 | CC=Cc1ccc(OC)cc1,0
 92 | CC(CCc1ccccc1)NCC(O)c1ccc(O)c(C(N)=O)c1,0
 93 | CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,0
 94 | CC(C)(N)Cc1ccccc1,0
 95 | CC12CCC3C(=CCc4cc(O)ccc43)C1CCC2=O,0
 96 | COc1cc2c(cc1OC)C(=O)C(CC1CCN(Cc3ccccc3)CC1)C2,0
 97 | CC(CCc1ccccc1)NC(C)C(O)c1ccc(O)cc1,0
 98 | C=C1CC2C(CCC3(C)C(=O)CCC23)C2(C)C=CC(=O)C=C12,0
 99 | NC(CCC(=O)NC(CSSCC(NC(=O)CCC(N)C(=O)O)C(=O)NCC(=O)O)C(=O)NCC(=O)O)C(=O)O,0
100 | CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC(=O)O,0
101 | CC12CCC3C(CCC4CC(O)CCC43C)C1CCC2=O,0
102 | Cc1ccc(C(=O)C(C)CN2CCCCC2)cc1,0
103 | CN(C)CCN(Cc1ccccc1)c1ccccn1,0
104 | COc1ccccc1OCC(O)CN1CCN(CC(=O)Nc2c(C)cccc2C)CC1,0
105 | COc1cc(N)c(Cl)cc1C(=O)NC1CCN(Cc2ccccc2)CC1,0
106 | CC1(C)OC2CC3C4CCC5=CC(=O)C=CC5(C)C4C(O)CC3(C)C2(C(=O)CO)O1,0
107 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)nc21,0
108 | Nc1c(Br)cc(Br)cc1CNC1CCC(O)CC1,0
109 | CC12CCC3C(CCC4=C(O)C(=O)CCC43C)C1CCC2=O,0
110 | NCCc1ccc(O)c(O)c1,0
111 | CC(C)(C)NCC(O)c1ccc(O)c(CO)c1,0
112 | CCCN1CC(CSC)CC2c3cccc4[nH]cc(c34)CC21,0
113 | CN(C)CCOC(=O)C(c1ccccc1)C1(O)CCCC1,0
114 | c1ccc2c(c1)OCC(C1=NCCN1)O2,0
115 | CCCc1cc(C(N)=S)ccn1,0
116 | CN(C)CCOC(c1ccccc1)c1ccccc1,0
117 | O=C1c2c(O)cccc2Cc2cccc(O)c21,0
118 | CN(C)C(=O)COC(=O)Cc1ccc(OC(=O)c2ccc(N=C(N)N)cc2)cc1,0
119 | NC(=O)c1nc(F)c[nH]c1=O,0
120 | COC1C(OC(C)=O)CC(=O)OC(C)CC=CC=CC(OC2CCC(N(C)C)C(C)O2)C(C)CC(CC=O)C1OC1OC(C)C(OC2CC(C)(O)C(O)C(C)O2)C(N(C)C)C1O,0
121 | CC12CC(=O)C3C(CCC4CC(O)CCC43C)C1CCC2C(=O)CO,0
122 | COC(c1ccccc1)(c1ccccc1)C(Oc1nc(C)cc(C)n1)C(=O)O,0
123 | CC1CCC2C(C)C(O)OC3OC4(C)CCC1C32OO4,0
124 | CCCCOc1cc(C(=O)OCCN(CC)CC)ccc1N,0
125 | c1ccc(CNCCNCc2ccccc2)cc1,0
126 | CC(=C(CCOC(=O)c1ccccc1)SSC(CCOC(=O)c1ccccc1)=C(C)N(C=O)Cc1cnc(C)nc1N)N(C=O)Cc1cnc(C)nc1N,0
127 | CC(C)=CC(C)=NNc1nncc2ccccc12,0
128 | CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1,0
129 | COc1ccc(CC2c3cc(OC)c(OC)cc3CC[N+]2(C)CCC(=O)OCCCCCOC(=O)CC[N+]2(C)CCc3cc(OC)c(OC)cc3C2Cc2ccc(OC)c(OC)c2)cc1OC,0
130 | CC(=O)Oc1ccc(C(=C2CCCCC2)c2ccc(OC(C)=O)cc2)cc1,0
131 | CNCC(O)c1ccc(OC(=O)C(C)(C)C)c(OC(=O)C(C)(C)C)c1,0
132 | CCc1ccc(C(=O)C(C)CN2CCCCC2)cc1,0
133 | Oc1ccc(C2CNCCc3c2cc(O)c(O)c3Cl)cc1,0
134 | COc1ccc(CC(C)NCC(O)c2ccc(O)c(NC=O)c2)cc1,0
135 | CCC(=O)OC(OP(=O)(CCCCc1ccccc1)CC(=O)N1CC(C2CCCCC2)CC1C(=O)O)C(C)C,0
136 | CC(=C(CCO)SSCC1CCCO1)N(C=O)Cc1cnc(C)nc1N,0
137 | CC(C)C(=O)c1c(C(C)C)nn2ccccc12,0
138 | CCCNC(C)(C)COC(=O)c1ccccc1,0
139 | CCC1(c2cccc(O)c2)CCCCN(C)C1,0
140 | CN1CCN2c3ncccc3Cc3ccccc3C2C1,0
141 | CCCCC(C)(O)CC=CC1C(O)CC(=O)C1CCCCCCC(=O)OC,0
142 | COc1ccc2cc(CCC(C)=O)ccc2c1,0
143 | C=C1CCC2(O)C3Cc4ccc(O)c5c4C2(CCN3CC2CC2)C1O5,0
144 | Cc1cc2c(s1)Nc1ccccc1N=C2N1CCN(C)CC1,0
145 | CCCc1nc(C(C)(C)O)c(C(=O)O)n1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1,0
146 | Cc1nccn1CC1CCc2c(c3ccccc3n2C)C1=O,0
147 | CC12CC(=CO)C(=O)CC1CCC1C2CCC2(C)C1CCC2(C)O,0
148 | O=C1Nc2ccccc2C1(c1ccc(O)cc1)c1ccc(O)cc1,0
149 | CCOC(=O)OC1(C(=O)COC(=O)CC)CCC2C3CCC4=CC(=O)C=CC4(C)C3C(O)CC21C,0
150 | CC1CN(c2c(F)c(N)c3c(=O)c(C(=O)O)cn(C4CC4)c3c2F)CC(C)N1,0
151 | Cc1ccc(O)c(C(CCN(C(C)C)C(C)C)c2ccccc2)c1,0
152 | CNCc1cc(-c2ccccc2F)n(S(=O)(=O)c2cccnc2)c1,0
153 | O=P(O)(O)C(O)(Cn1ccnc1)P(=O)(O)O,0
154 | O=C1OC2(c3cc(Br)c([O-])cc3Oc3c2cc(Br)c([O-])c3[Hg])c2ccccc21,0
155 | CC12C=CC(=O)C=C1CCC1C2CCC2(C)C1CCC2(C)O,0
156 | 


--------------------------------------------------------------------------------
/Discussion/GPC.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Tue Apr 27 22:04:01 2021
  4 | 
  5 | @author: BM109X32G-10GPU-02
  6 | """
  7 | 
  8 | # -*- coding: utf-8 -*-
  9 | """
 10 | Created on Sun Nov 15 13:46:29 2020
 11 | 
 12 | @author: de''
 13 | """
 14 | 
 15 | # -*- coding: utf-8 -*-
 16 | """
 17 | Created on Sun Nov 15 10:40:57 2020
 18 | 
 19 | @author: de''
 20 | """
 21 | 
 22 | from sklearn.datasets import make_blobs
 23 | import json
 24 | import numpy as np
 25 | import math
 26 | from tqdm import tqdm
 27 | from scipy import sparse
 28 | from sklearn.metrics import roc_auc_score,roc_curve,auc
 29 | from sklearn.metrics import confusion_matrix
 30 | from sklearn.gaussian_process.kernels import RBF
 31 | import pandas as pd
 32 | import matplotlib.pyplot as plt
 33 | from rdkit import Chem
 34 | from sklearn.gaussian_process import GaussianProcessClassifier as GPC
 35 | from sklearn.ensemble import RandomForestClassifier
 36 | from sklearn.model_selection import train_test_split
 37 | from sklearn.preprocessing import MinMaxScaler
 38 | from sklearn.neural_network import MLPClassifier
 39 | from sklearn.svm import SVC
 40 | from tensorflow.keras.models import Model, load_model
 41 | from tensorflow.keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, concatenate
 42 | from tensorflow.keras import metrics, optimizers
 43 | from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
 44 | 
 45 | def split_smiles(smiles, kekuleSmiles=True):
 46 |     try:
 47 |         mol = Chem.MolFromSmiles(smiles)
 48 |         smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles)
 49 |     except:
 50 |         pass
 51 |     splitted_smiles = []
 52 |     for j, k in enumerate(smiles):
 53 |         if len(smiles) == 1:
 54 |             return [smiles]
 55 |         if j == 0:
 56 |             if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
 57 |                 splitted_smiles.append(k + smiles[j + 1])
 58 |             else:
 59 |                 splitted_smiles.append(k)
 60 |         elif j != 0 and j < len(smiles) - 1:
 61 |             if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
 62 |                 splitted_smiles.append(k + smiles[j + 1])
 63 |             elif k.islower() and smiles[j - 1].isupper() and k != "c":
 64 |                 pass
 65 |             else:
 66 |                 splitted_smiles.append(k)
 67 | 
 68 |         elif j == len(smiles) - 1:
 69 |             if k.islower() and smiles[j - 1].isupper() and k != "c":
 70 |                 pass
 71 |             else:
 72 |                 splitted_smiles.append(k)
 73 |     return splitted_smiles
 74 | 
 75 | def get_maxlen(all_smiles, kekuleSmiles=True):
 76 |     maxlen = 0
 77 |     for smi in tqdm(all_smiles):
 78 |         spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
 79 |         if spt is None:
 80 |             continue
 81 |         maxlen = max(maxlen, len(spt))
 82 |     return maxlen
 83 | def get_dict(all_smiles, save_path, kekuleSmiles=True):
 84 |     words = [' ']
 85 |     for smi in tqdm(all_smiles):
 86 |         spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
 87 |         if spt is None:
 88 |             continue
 89 |         for w in spt:
 90 |             if w in words:
 91 |                 continue
 92 |             else:
 93 |                 words.append(w)
 94 |     with open(save_path, 'w') as js:
 95 |         json.dump(words, js)
 96 |     return words
 97 | 
 98 | def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000):
 99 |     coord_j = []
100 |     coord_k = []
101 |     spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
102 |     if spt is None:
103 |         return None
104 |     for j,w in enumerate(spt):
105 |         if j >= max_len:
106 |             break
107 |         try:
108 |             k = words.index(w)
109 |         except:
110 |             continue
111 |         coord_j.append(j)
112 |         coord_k.append(k)
113 |     data = np.repeat(1, len(coord_j))
114 |     output = sparse.csr_matrix((data, (coord_j, coord_k)), shape=(max_len, len(words)))
115 |     return output
116 | 
117 | if __name__ == "__main__":
118 |     data_train= pd.read_csv('E:/code/drug/drugnn/data_train.csv')
119 |     data_test=pd.read_csv('E:/code/drug/drugnn/worddrug.csv')
120 |     inchis = list(data_train['SMILES'])
121 |     rts = list(data_train['type'])
122 |     
123 |     smiles, targets = [], []
124 |     for i, inc in enumerate(tqdm(inchis)):
125 |         mol = Chem.MolFromSmiles(inc)
126 |         if mol is None:
127 |             continue
128 |         else:
129 |             smi = Chem.MolToSmiles(mol)
130 |             smiles.append(smi)
131 |             targets.append(rts[i])
132 |             
133 |     words = get_dict(smiles, save_path='E:\code\FingerID Reference\drug-likeness/dict.json')
134 |     
135 |     features = []
136 |     for i, smi in enumerate(tqdm(smiles)):
137 |         xi = one_hot_coding(smi, words, max_len=600)
138 |         if xi is not None:
139 |             features.append(xi.todense())
140 |     features = np.asarray(features)
141 |     targets = np.asarray(targets)
142 |     X_train=features
143 |     Y_train=targets
144 |       
145 | 
146 |    # physical_devices = tf.config.experimental.list_physical_devices('CPU') 
147 |    # assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
148 |   #  tf.config.experimental.set_memory_growth(physical_devices[0], True)
149 |     
150 |     
151 |   
152 |     inchis = list(data_test['SMILES'])
153 |     rts = list(data_test['type'])
154 |     
155 |     smiles, targets = [], []
156 |     for i, inc in enumerate(tqdm(inchis)):
157 |         mol = Chem.MolFromSmiles(inc)
158 |         if mol is None:
159 |             continue
160 |         else:
161 |             smi = Chem.MolToSmiles(mol)
162 |             smiles.append(smi)
163 |             targets.append(rts[i])
164 |             
165 |    # words = get_dict(smiles, save_path='D:/工作文件/work.Data/CNN/dict.json')
166 |     
167 |     features = []
168 |     for i, smi in enumerate(tqdm(smiles)):
169 |         xi = one_hot_coding(smi, words, max_len=600)
170 |         if xi is not None:
171 |             features.append(xi.todense())
172 |     features = np.asarray(features)
173 |     targets = np.asarray(targets)
174 |     X_test=features
175 |     Y_test=targets
176 |    
177 |   #  kernel = 1.0 * RBF(0.8)
178 |     #model = RandomForestClassifier(n_estimators=10,max_features='auto', max_depth=None,min_samples_split=2, bootstrap=True)
179 |     model = GPC(  random_state=111)
180 |    
181 |     # earlyStopping = EarlyStopping(monitor='val_loss', patience=0.05, verbose=0, mode='min')
182 |     #mcp_save = ModelCheckpoint('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5', save_best_only=True, monitor='accuracy', mode='auto')
183 |   #  reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, epsilon=1e-4, mode='min')
184 |     from tensorflow.keras import backend as K
185 |     X_train = K.cast_to_floatx(X_train).reshape((np.size(X_train,0),np.size(X_train,1)*np.size(X_train,2)))
186 | 
187 |     Y_train = K.cast_to_floatx(Y_train)
188 |     
189 | #    X_train,Y_train = make_blobs(n_samples=300, n_features=n_features, centers=6)
190 |     model.fit(X_train, Y_train)
191 | 
192 | 
193 |  #   model = load_model('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5')
194 |     Y_predict = model.predict(K.cast_to_floatx(X_test).reshape((np.size(X_test,0),np.size(X_test,1)*np.size(X_test,2))))
195 |      #Y_predict = model.predict(X_test)#训练数据
196 |     x = list(Y_test)
197 |     y = list(Y_predict)
198 |     from pandas.core.frame import DataFrame   
199 |     x=DataFrame(x)
200 |     y=DataFrame(y)
201 |   #  X= pd.concat([x,y], axis=1)
202 |     #X.to_csv('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/molecularGNN_smiles-master/0825/single-CNN-seed444.csv')
203 |     Y_predict = [1 if i >0.4 else 0 for i in Y_predict]
204 | 
205 |     cnf_matrix=confusion_matrix(Y_test, Y_predict)
206 |     cnf_matrix
207 |     
208 |     tn = cnf_matrix[0,0]
209 |     tp = cnf_matrix[1,1]
210 |     fn = cnf_matrix[1,0]
211 |     fp = cnf_matrix[0,1]
212 |     
213 |     bacc = ((tp/(tp+fn))+(tn/(tn+fp)))/2#balance accurance
214 |     pre = tp/(tp+fp)#precision/q+
215 |     rec = tp/(tp+fn)#recall/se
216 |     sp=tn/(tn+fp)
217 |     q_=tn/(tn+fn)
218 |     f1 = 2*pre*rec/(pre+rec)#f1score
219 |     mcc = ((tp*tn) - (fp*fn))/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))#Matthews correlation coefficient
220 |     acc=(tp+tn)/(tp+fp+fn+tn)#accurancy
221 |     fpr, tpr, thresholds =roc_curve(Y_test, Y_predict)
222 |     AUC = auc(fpr, tpr)
223 |     print('bacc:',bacc)
224 |     print('pre:',pre)
225 |     print('rec:',rec)
226 |     print('f1:',f1)
227 |     print('mcc:',mcc)
228 |     print('sp:',sp)
229 |     print('q_:',q_)
230 |     print('acc:',acc)
231 |     print('auc:',AUC)
232 | 


--------------------------------------------------------------------------------
/Discussion/svc.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Sun Nov 15 13:46:29 2020
  4 | 
  5 | @author: de''
  6 | """
  7 | 
  8 | # -*- coding: utf-8 -*-
  9 | """
 10 | Created on Sun Nov 15 10:40:57 2020
 11 | 
 12 | @author: de''
 13 | """
 14 | 
 15 | from sklearn.datasets import make_blobs
 16 | import json
 17 | import numpy as np
 18 | import math
 19 | from tqdm import tqdm
 20 | from scipy import sparse
 21 | from sklearn.metrics import roc_auc_score,roc_curve,auc
 22 | from sklearn.metrics import confusion_matrix
 23 | 
 24 | import pandas as pd
 25 | import matplotlib.pyplot as plt
 26 | from rdkit import Chem
 27 | 
 28 | from sklearn.ensemble import RandomForestClassifier
 29 | from sklearn.model_selection import train_test_split
 30 | from sklearn.preprocessing import MinMaxScaler
 31 | from sklearn.neural_network import MLPClassifier
 32 | from sklearn.svm import SVC
 33 | from tensorflow.keras.models import Model, load_model
 34 | from tensorflow.keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, concatenate
 35 | from tensorflow.keras import metrics, optimizers
 36 | from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
 37 | 
 38 | def split_smiles(smiles, kekuleSmiles=True):
 39 |     try:
 40 |         mol = Chem.MolFromSmiles(smiles)
 41 |         smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles)
 42 |     except:
 43 |         pass
 44 |     splitted_smiles = []
 45 |     for j, k in enumerate(smiles):
 46 |         if len(smiles) == 1:
 47 |             return [smiles]
 48 |         if j == 0:
 49 |             if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
 50 |                 splitted_smiles.append(k + smiles[j + 1])
 51 |             else:
 52 |                 splitted_smiles.append(k)
 53 |         elif j != 0 and j < len(smiles) - 1:
 54 |             if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
 55 |                 splitted_smiles.append(k + smiles[j + 1])
 56 |             elif k.islower() and smiles[j - 1].isupper() and k != "c":
 57 |                 pass
 58 |             else:
 59 |                 splitted_smiles.append(k)
 60 | 
 61 |         elif j == len(smiles) - 1:
 62 |             if k.islower() and smiles[j - 1].isupper() and k != "c":
 63 |                 pass
 64 |             else:
 65 |                 splitted_smiles.append(k)
 66 |     return splitted_smiles
 67 | 
 68 | def get_maxlen(all_smiles, kekuleSmiles=True):
 69 |     maxlen = 0
 70 |     for smi in tqdm(all_smiles):
 71 |         spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
 72 |         if spt is None:
 73 |             continue
 74 |         maxlen = max(maxlen, len(spt))
 75 |     return maxlen
 76 | def get_dict(all_smiles, save_path, kekuleSmiles=True):
 77 |     words = [' ']
 78 |     for smi in tqdm(all_smiles):
 79 |         spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
 80 |         if spt is None:
 81 |             continue
 82 |         for w in spt:
 83 |             if w in words:
 84 |                 continue
 85 |             else:
 86 |                 words.append(w)
 87 |     with open(save_path, 'w') as js:
 88 |         json.dump(words, js)
 89 |     return words
 90 | 
 91 | def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000):
 92 |     coord_j = []
 93 |     coord_k = []
 94 |     spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
 95 |     if spt is None:
 96 |         return None
 97 |     for j,w in enumerate(spt):
 98 |         if j >= max_len:
 99 |             break
100 |         try:
101 |             k = words.index(w)
102 |         except:
103 |             continue
104 |         coord_j.append(j)
105 |         coord_k.append(k)
106 |     data = np.repeat(1, len(coord_j))
107 |     output = sparse.csr_matrix((data, (coord_j, coord_k)), shape=(max_len, len(words)))
108 |     return output
109 | 
110 | if __name__ == "__main__":
111 | 
112 | 
113 |     data_train= pd.read_csv('E:/code/drug/drugnn/data_train.csv')
114 |     data_test=pd.read_csv('E:/code/drug/drugnn/bro5.csv')
115 |     inchis = list(data_train['SMILES'])
116 |     rts = list(data_train['type'])
117 |     
118 |     smiles, targets = [], []
119 |     for i, inc in enumerate(tqdm(inchis)):
120 |         mol = Chem.MolFromSmiles(inc)
121 |         if mol is None:
122 |             continue
123 |         else:
124 |             smi = Chem.MolToSmiles(mol)
125 |             smiles.append(smi)
126 |             targets.append(rts[i])
127 |             
128 |     words = get_dict(smiles, save_path='E:\code\FingerID Reference\drug-likeness/dict.json')
129 |     
130 |     features = []
131 |     for i, smi in enumerate(tqdm(smiles)):
132 |         xi = one_hot_coding(smi, words, max_len=2000)
133 |         if xi is not None:
134 |             features.append(xi.todense())
135 |     features = np.asarray(features)
136 |     targets = np.asarray(targets)
137 |     X_train=features
138 |     Y_train=targets
139 |       
140 | 
141 |    # physical_devices = tf.config.experimental.list_physical_devices('CPU') 
142 |    # assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
143 |   #  tf.config.experimental.set_memory_growth(physical_devices[0], True)
144 |     
145 |     
146 |   
147 |     inchis = list(data_test['SMILES'])
148 |     rts = list(data_test['type'])
149 |     
150 |     smiles, targets = [], []
151 |     for i, inc in enumerate(tqdm(inchis)):
152 |         mol = Chem.MolFromSmiles(inc)
153 |         if mol is None:
154 |             continue
155 |         else:
156 |             smi = Chem.MolToSmiles(mol)
157 |             smiles.append(smi)
158 |             targets.append(rts[i])
159 |             
160 |    # words = get_dict(smiles, save_path='D:/工作文件/work.Data/CNN/dict.json')
161 |     
162 |     features = []
163 |     for i, smi in enumerate(tqdm(smiles)):
164 |         xi = one_hot_coding(smi, words, max_len=2000)
165 |         if xi is not None:
166 |             features.append(xi.todense())
167 |     features = np.asarray(features)
168 |     targets = np.asarray(targets)
169 |     X_test=features
170 |     Y_test=targets
171 |     
172 |     
173 |     #model = RandomForestClassifier(n_estimators=10,max_features='auto', max_depth=None,min_samples_split=2, bootstrap=True)
174 |     #model = MLPClassifier(rangdom_state=1,max_iter=300)
175 |     model = SVC(C=500, kernel='rbf', gamma='auto',
176 |                 coef0=0.0, shrinking=True,probability=False, tol=0.0001, cache_size=200, class_weight=None, verbose=False, max_iter=- 1, decision_function_shape='ovr', break_ties=False, random_state=None)
177 |    
178 |     # earlyStopping = EarlyStopping(monitor='val_loss', patience=0.05, verbose=0, mode='min')
179 |     #mcp_save = ModelCheckpoint('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5', save_best_only=True, monitor='accuracy', mode='auto')
180 |   #  reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, epsilon=1e-4, mode='min')
181 |     from tensorflow.keras import backend as K
182 |     X_train = K.cast_to_floatx(X_train).reshape((np.size(X_train,0),np.size(X_train,1)*np.size(X_train,2)))
183 | 
184 |     Y_train = K.cast_to_floatx(Y_train)
185 |     
186 | #    X_train,Y_train = make_blobs(n_samples=300, n_features=n_features, centers=6)
187 |     model.fit(X_train, Y_train)
188 | 
189 | 
190 |  #   model = load_model('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5')
191 |     Y_predict = model.predict(K.cast_to_floatx(X_test).reshape((np.size(X_test,0),np.size(X_test,1)*np.size(X_test,2))))
192 |      #Y_predict = model.predict(X_test)#训练数据
193 |     x = list(Y_test)
194 |     y = list(Y_predict)
195 |     from pandas.core.frame import DataFrame   
196 |     x=DataFrame(x)
197 |     y=DataFrame(y)
198 |   #  X= pd.concat([x,y], axis=1)
199 |     #X.to_csv('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/molecularGNN_smiles-master/0825/single-CNN-seed444.csv')
200 |     Y_predict = [1 if i >0.5 else 0 for i in Y_predict]
201 | 
202 |     cnf_matrix=confusion_matrix(Y_test, Y_predict)
203 |     cnf_matrix
204 |     
205 |     tn = cnf_matrix[0,0]
206 |     tp = cnf_matrix[1,1]
207 |     fn = cnf_matrix[1,0]
208 |     fp = cnf_matrix[0,1]
209 |     
210 |     bacc = ((tp/(tp+fn))+(tn/(tn+fp)))/2#balance accurance
211 |     pre = tp/(tp+fp)#precision/q+
212 |     rec = tp/(tp+fn)#recall/se
213 |     sp=tn/(tn+fp)
214 |     q_=tn/(tn+fn)
215 |     f1 = 2*pre*rec/(pre+rec)#f1score
216 |     mcc = ((tp*tn) - (fp*fn))/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))#Matthews correlation coefficient
217 |     acc=(tp+tn)/(tp+fp+fn+tn)#accurancy
218 |     fpr, tpr, thresholds =roc_curve(Y_test, Y_predict)
219 |     AUC = auc(fpr, tpr)
220 |     print('bacc:',bacc)
221 |     print('pre:',pre)
222 |     print('rec:',rec)
223 |     print('f1:',f1)
224 |     print('mcc:',mcc)
225 |     print('sp:',sp)
226 |     print('q_:',q_)
227 |     print('acc:',acc)
228 |     print('auc:',AUC)


--------------------------------------------------------------------------------
/DGCAN/preprocess.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Wed Apr 27 20:09:31 2022
  5 | 
  6 | @author:Jinyu-Sun
  7 | """
  8 | 
  9 | from collections import defaultdict
 10 | import numpy as np
 11 | from rdkit import Chem
 12 | import torch
 13 | 
 14 | device = torch.device('cuda')
 15 |  
 16 | atom_dict = defaultdict(lambda: len(atom_dict))
 17 | bond_dict = defaultdict(lambda: len(bond_dict))
 18 | fingerprint_dict = defaultdict(lambda: len(fingerprint_dict))
 19 | edge_dict = defaultdict(lambda: len(edge_dict))
 20 | radius=1
 21 | def create_atoms(mol, atom_dict):
 22 |     """Transform the atom types in a molecule (e.g., H, C, and O)
 23 |     into the indices (e.g., H=0, C=1, and O=2).
 24 |     Note that each atom index considers the aromaticity.
 25 |     """
 26 |     atoms = [a.GetSymbol() for a in mol.GetAtoms()]
 27 |     for a in mol.GetAromaticAtoms():
 28 |         i = a.GetIdx()
 29 |         atoms[i] = (atoms[i], 'aromatic')
 30 |     atoms = [atom_dict[a] for a in atoms]
 31 |     return np.array(atoms)
 32 | 
 33 | 
 34 | def create_ijbonddict(mol, bond_dict):
 35 |     """Create a dictionary, in which each key is a node ID
 36 |     and each value is the tuples of its neighboring node
 37 |     and chemical bond (e.g., single and double) IDs.
 38 | 
 39 |     """
 40 |     i_jbond_dict = defaultdict(lambda: [])
 41 |     for b in mol.GetBonds():
 42 |         i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
 43 |         bond = bond_dict[str(b.GetBondType())]
 44 |         i_jbond_dict[i].append((j, bond))
 45 |         i_jbond_dict[j].append((i, bond))
 46 |     return i_jbond_dict
 47 | 
 48 | 
 49 | def extract_fingerprints(radius, atoms, i_jbond_dict,
 50 |                          fingerprint_dict, edge_dict):
 51 |     """Extract the fingerprints from a molecular graph
 52 |     based on Weisfeiler-Lehman algorithm.
 53 |     """
 54 | 
 55 |     if (len(atoms) == 1) or (radius == 0):
 56 |         nodes = [fingerprint_dict[a] for a in atoms]
 57 | 
 58 |     else:
 59 |         nodes = atoms
 60 |         i_jedge_dict = i_jbond_dict
 61 | 
 62 |         for _ in range(radius):
 63 | 
 64 |             """Update each node ID considering its neighboring nodes and edges.
 65 |             The updated node IDs are the fingerprint IDs.。
 66 |             """
 67 |             nodes_ = []
 68 |             for i, j_edge in i_jedge_dict.items():
 69 |                 neighbors = [(nodes[j], edge) for j, edge in j_edge]
 70 |                 fingerprint = (nodes[i], tuple(sorted(neighbors)))
 71 |                 nodes_.append(fingerprint_dict[fingerprint])
 72 | 
 73 |             """Also update each edge ID considering
 74 |             its two nodes on both sides.
 75 |             """
 76 |             i_jedge_dict_ = defaultdict(lambda: [])
 77 |             for i, j_edge in i_jedge_dict.items():
 78 |                 for j, edge in j_edge:
 79 |                     both_side = tuple(sorted((nodes[i], nodes[j])))
 80 |                     edge = edge_dict[(both_side, edge)]
 81 |                     i_jedge_dict_[i].append((j, edge))
 82 | 
 83 |             nodes = nodes_
 84 |             i_jedge_dict = i_jedge_dict_
 85 | 
 86 |     return np.array(nodes)
 87 | 
 88 | 
 89 | def split_dataset(dataset, ratio):
 90 |     """Shuffle and split a dataset."""
 91 |     np.random.seed(1234)  # fix the seed for shuffle
 92 | #    np.random.shuffle(dataset)
 93 |     n = int(ratio * len(dataset))
 94 |     return dataset[:n], dataset[n:]
 95 | def create_testdataset(filename,path,dataname,property):
 96 |     dir_dataset = path+dataname
 97 |     print(filename)
 98 |     """Load a dataset."""  
 99 |     if property== False:
100 |                 with open(dir_dataset + filename, 'r') as f:
101 |                     #smiles_property = f.readline().strip().split()
102 |                     data_original = f.read().strip().split()
103 |                 data_original = [data for data in data_original
104 |                                     if '.' not in data.split()[0]]              
105 |                 dataset = []
106 |                 for data in data_original:                
107 |                     smiles = data                
108 |                     try:
109 |                         """Create each data with the above defined functions."""
110 |                         mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
111 |                         atoms = create_atoms(mol, atom_dict)
112 |                         molecular_size = len(atoms)
113 |                         i_jbond_dict = create_ijbonddict(mol, bond_dict)
114 |                         fingerprints = extract_fingerprints(radius, atoms, i_jbond_dict,
115 |                                                             fingerprint_dict, edge_dict)          
116 |                         adjacency = Chem.GetAdjacencyMatrix(mol) 
117 |                         """Transform the above each data of numpy
118 |                         to pytorch tensor on a device (i.e., CPU or GPU).
119 |                         """
120 |                         fingerprints = torch.LongTensor(fingerprints).to(device)
121 |                         adjacency = torch.FloatTensor(adjacency).to(device)                    
122 |                         proper = torch.LongTensor([int(0)]).to(device)    
123 |                         dataset.append((smiles,fingerprints, adjacency, molecular_size,proper ))
124 |                     except:
125 |                         print(smiles)
126 |     elif property== True:
127 |         with open(dir_dataset + filename, 'r') as f:
128 |                    # smiles_property = f.readline().strip().split()
129 |                     data_original = f.read().strip().split('\n')
130 |         
131 |         data_original = [data for data in data_original
132 |                             if '.' not in data.split()[0]]
133 |         dataset = []
134 |         for data in data_original:
135 |                 smiles, proper = data.strip().split()
136 |                 try:
137 |                     """Create each data with the above defined functions."""
138 |                     mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
139 |                     atoms = create_atoms(mol, atom_dict)
140 |                     molecular_size = len(atoms)
141 |                     i_jbond_dict = create_ijbonddict(mol, bond_dict)
142 |                     fingerprints = extract_fingerprints(radius, atoms, i_jbond_dict,
143 |                                                         fingerprint_dict, edge_dict)                  
144 |                     adjacency = Chem.GetAdjacencyMatrix(mol)
145 |                     
146 |                     """Transform the above each data of numpy
147 |                     to pytorch tensor on a device (i.e., CPU or GPU).
148 |                     """
149 |                     fingerprints = torch.LongTensor(fingerprints).to(device)       
150 |                     adjacency = torch.FloatTensor(adjacency).to(device)
151 |                     proper = torch.LongTensor([int(proper)]).to(device)
152 |                     dataset.append((smiles,fingerprints, adjacency, molecular_size, proper))
153 |                 except:
154 |                     print(smiles+'is error')
155 |     return dataset
156 | 
157 | def create_dataset(filename,path,dataname):
158 |     dir_dataset = path+dataname
159 |     print(filename)
160 |     """Load a dataset."""
161 |     try:
162 |         with open(dir_dataset + filename, 'r') as f:
163 |             smiles_property = f.readline().strip().split()
164 |             data_original = f.read().strip().split('\n')
165 |     except:
166 |         with open(dir_dataset + filename, 'r') as f:
167 |             smiles_property = f.readline().strip().split()
168 |             data_original = f.read().strip().split('\n')
169 |     
170 |         """Exclude the data contains '.' in its smiles.排除含.的数据"""
171 |     data_original = [data for data in data_original
172 |                         if '.' not in data.split()[0]]
173 |     dataset = []
174 |     for data in data_original:
175 |             smiles, property = data.strip().split()
176 |             try:
177 |                 """Create each data with the above defined functions."""
178 |                 mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
179 |                 atoms = create_atoms(mol, atom_dict)
180 |                 molecular_size = len(atoms)
181 |                 i_jbond_dict = create_ijbonddict(mol, bond_dict)
182 |                 fingerprints = extract_fingerprints(radius, atoms, i_jbond_dict,
183 |                                                     fingerprint_dict, edge_dict)
184 |                 adjacency = Chem.GetAdjacencyMatrix(mol)                
185 |                 """
186 |                 Transform the above each data of numpy
187 |                 to pytorch tensor on a device (i.e., CPU or GPU).
188 |                 """
189 |                 fingerprints = torch.LongTensor(fingerprints).to(device)                          
190 |                 adjacency = torch.FloatTensor(adjacency).to(device)
191 |                 property = torch.LongTensor([int(property)]).to(device)            
192 |                 dataset.append((smiles,fingerprints, adjacency, molecular_size, property))
193 |             except:
194 |                 print(smiles)
195 |     return dataset
196 | 
197 | 
198 | 


--------------------------------------------------------------------------------
/Discussion/RF.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Tue Apr 27 22:08:23 2021
  4 | 
  5 | @author:Jinyusun
  6 | """
  7 | 
  8 | 
  9 | from sklearn.datasets import make_blobs
 10 | import json
 11 | import numpy as np
 12 | import math
 13 | from tqdm import tqdm
 14 | from scipy import sparse
 15 | from sklearn.metrics import roc_auc_score,roc_curve,auc
 16 | from sklearn.metrics import confusion_matrix
 17 | 
 18 | import pandas as pd
 19 | import matplotlib.pyplot as plt
 20 | from rdkit import Chem
 21 | 
 22 | from sklearn.ensemble import RandomForestClassifier
 23 | from sklearn.model_selection import train_test_split
 24 | from sklearn.preprocessing import MinMaxScaler
 25 | from sklearn.neural_network import MLPClassifier
 26 | from sklearn.svm import SVC
 27 | from tensorflow.keras.models import Model, load_model
 28 | from tensorflow.keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, concatenate
 29 | from tensorflow.keras import metrics, optimizers
 30 | from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
 31 | 
 32 | def split_smiles(smiles, kekuleSmiles=True):
 33 |     try:
 34 |         mol = Chem.MolFromSmiles(smiles)
 35 |         smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles)
 36 |     except:
 37 |         pass
 38 |     splitted_smiles = []
 39 |     for j, k in enumerate(smiles):
 40 |         if len(smiles) == 1:
 41 |             return [smiles]
 42 |         if j == 0:
 43 |             if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
 44 |                 splitted_smiles.append(k + smiles[j + 1])
 45 |             else:
 46 |                 splitted_smiles.append(k)
 47 |         elif j != 0 and j < len(smiles) - 1:
 48 |             if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
 49 |                 splitted_smiles.append(k + smiles[j + 1])
 50 |             elif k.islower() and smiles[j - 1].isupper() and k != "c":
 51 |                 pass
 52 |             else:
 53 |                 splitted_smiles.append(k)
 54 | 
 55 |         elif j == len(smiles) - 1:
 56 |             if k.islower() and smiles[j - 1].isupper() and k != "c":
 57 |                 pass
 58 |             else:
 59 |                 splitted_smiles.append(k)
 60 |     return splitted_smiles
 61 | 
 62 | def get_maxlen(all_smiles, kekuleSmiles=True):
 63 |     maxlen = 0
 64 |     for smi in tqdm(all_smiles):
 65 |         spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
 66 |         if spt is None:
 67 |             continue
 68 |         maxlen = max(maxlen, len(spt))
 69 |     return maxlen
 70 | def get_dict(all_smiles, save_path, kekuleSmiles=True):
 71 |     words = [' ']
 72 |     for smi in tqdm(all_smiles):
 73 |         spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
 74 |         if spt is None:
 75 |             continue
 76 |         for w in spt:
 77 |             if w in words:
 78 |                 continue
 79 |             else:
 80 |                 words.append(w)
 81 |     with open(save_path, 'w') as js:
 82 |         json.dump(words, js)
 83 |     return words
 84 | 
 85 | def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000):
 86 |     coord_j = []
 87 |     coord_k = []
 88 |     spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
 89 |     if spt is None:
 90 |         return None
 91 |     for j,w in enumerate(spt):
 92 |         if j >= max_len:
 93 |             break
 94 |         try:
 95 |             k = words.index(w)
 96 |         except:
 97 |             continue
 98 |         coord_j.append(j)
 99 |         coord_k.append(k)
100 |     data = np.repeat(1, len(coord_j))
101 |     output = sparse.csr_matrix((data, (coord_j, coord_k)), shape=(max_len, len(words)))
102 |     return output
103 | def split_dataset(dataset, ratio):
104 |     """Shuffle and split a dataset."""
105 |    # np.random.seed(111)  # fix the seed for shuffle.
106 |     #np.random.shuffle(dataset)
107 |     n = int(ratio * len(dataset))
108 |     return dataset[:n], dataset[n:]
109 | def edit_dataset(drug,non_drug,task):
110 |   #  np.random.seed(111)  # fix the seed for shuffle.
111 | 
112 | #    np.random.shuffle(non_drug)
113 |     non_drug=non_drug[0:len(drug)]
114 |        
115 | 
116 |       #  np.random.shuffle(non_drug)
117 |    # np.random.shuffle(drug)
118 |     dataset_train_drug, dataset_test_drug = split_dataset(drug, 0.9)
119 |    # dataset_train_drug,dataset_dev_drug =  split_dataset(dataset_train_drug, 0.9)
120 |     dataset_train_no, dataset_test_no = split_dataset(non_drug, 0.9)
121 |    # dataset_train_no,dataset_dev_no =  split_dataset(dataset_train_no, 0.9)
122 |     dataset_train =  pd.concat([dataset_train_drug,dataset_train_no], axis=0)
123 |     dataset_test=pd.concat([ dataset_test_drug,dataset_test_no], axis=0)
124 |   #  dataset_dev = dataset_dev_drug+dataset_dev_no
125 |     return dataset_train, dataset_test
126 | if __name__ == "__main__":
127 |     data_train= pd.read_csv('E:/code/drug/drugnn/data_train.csv')
128 |     data_test=pd.read_csv('E:/code/drug/drugnn/bro5.csv')
129 |     inchis = list(data_train['SMILES'])
130 |     rts = list(data_train['type'])
131 |     
132 |     smiles, targets = [], []
133 |     for i, inc in enumerate(tqdm(inchis)):
134 |         mol = Chem.MolFromSmiles(inc)
135 |         if mol is None:
136 |             continue
137 |         else:
138 |             smi = Chem.MolToSmiles(mol)
139 |             smiles.append(smi)
140 |             targets.append(rts[i])
141 |             
142 |     words = get_dict(smiles, save_path='E:\code\FingerID Reference\drug-likeness/dict.json')
143 |     
144 |     features = []
145 |     for i, smi in enumerate(tqdm(smiles)):
146 |         xi = one_hot_coding(smi, words, max_len=600)
147 |         if xi is not None:
148 |             features.append(xi.todense())
149 |     features = np.asarray(features)
150 |     targets = np.asarray(targets)
151 |     X_train=features
152 |     Y_train=targets
153 |       
154 | 
155 |    # physical_devices = tf.config.experimental.list_physical_devices('CPU') 
156 |    # assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
157 |   #  tf.config.experimental.set_memory_growth(physical_devices[0], True)
158 |     
159 |     
160 |   
161 |     inchis = list(data_test['SMILES'])
162 |     rts = list(data_test['type'])
163 |     
164 |     smiles, targets = [], []
165 |     for i, inc in enumerate(tqdm(inchis)):
166 |         mol = Chem.MolFromSmiles(inc)
167 |         if mol is None:
168 |             continue
169 |         else:
170 |             smi = Chem.MolToSmiles(mol)
171 |             smiles.append(smi)
172 |             targets.append(rts[i])
173 |             
174 |    # words = get_dict(smiles, save_path='D:/工作文件/work.Data/CNN/dict.json')
175 |     
176 |     features = []
177 |     for i, smi in enumerate(tqdm(smiles)):
178 |         xi = one_hot_coding(smi, words, max_len=600)
179 |         if xi is not None:
180 |             features.append(xi.todense())
181 |     features = np.asarray(features)
182 |     targets = np.asarray(targets)
183 |     X_test=features
184 |     Y_test=targets
185 |     n_features=10
186 |     
187 |     model = RandomForestClassifier(n_estimators=5,max_features='auto', max_depth=None,min_samples_split=5, bootstrap=True)
188 |     #model = MLPClassifier(rangdom_state=1,max_iter=300)
189 |     #model = SVC()
190 |    
191 |     # earlyStopping = EarlyStopping(monitor='val_loss', patience=0.05, verbose=0, mode='min')
192 |     #mcp_save = ModelCheckpoint('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5', save_best_only=True, monitor='accuracy', mode='auto')
193 |   #  reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, epsilon=1e-4, mode='min')
194 |     from tensorflow.keras import backend as K
195 |     X_train = K.cast_to_floatx(X_train).reshape((np.size(X_train,0),np.size(X_train,1)*np.size(X_train,2)))
196 | 
197 |     Y_train = K.cast_to_floatx(Y_train)
198 |     
199 | #    X_train,Y_train = make_blobs(n_samples=300, n_features=n_features, centers=6)
200 |     model.fit(X_train, Y_train)
201 | 
202 | 
203 |  #   model = load_model('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5')
204 |     Y_predict = model.predict(K.cast_to_floatx(X_test).reshape((np.size(X_test,0),np.size(X_test,1)*np.size(X_test,2))))
205 |      #Y_predict = model.predict(X_test)#训练数据
206 |     x = list(Y_test)
207 |     y = list(Y_predict)
208 |     from pandas.core.frame import DataFrame   
209 |     x=DataFrame(x)
210 |     y=DataFrame(y)
211 |   #  X= pd.concat([x,y], axis=1)
212 |     #X.to_csv('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/molecularGNN_smiles-master/0825/single-CNN-seed444.csv')
213 |     Y_predict = [1 if i >0.4 else 0 for i in Y_predict]
214 | 
215 |     cnf_matrix=confusion_matrix(Y_test, Y_predict)
216 |     cnf_matrix
217 |     
218 |     tn = cnf_matrix[0,0]
219 |     tp = cnf_matrix[1,1]
220 |     fn = cnf_matrix[1,0]
221 |     fp = cnf_matrix[0,1]
222 |     
223 |     bacc = ((tp/(tp+fn))+(tn/(tn+fp)))/2#balance accurance
224 |     pre = tp/(tp+fp)#precision/q+
225 |     rec = tp/(tp+fn)#recall/se
226 |     sp=tn/(tn+fp)
227 |     q_=tn/(tn+fn)
228 |     f1 = 2*pre*rec/(pre+rec)#f1score
229 |     mcc = ((tp*tn) - (fp*fn))/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))#Matthews correlation coefficient
230 |     acc=(tp+tn)/(tp+fp+fn+tn)#accurancy
231 |     fpr, tpr, thresholds =roc_curve(Y_test, Y_predict)
232 |     AUC = auc(fpr, tpr)
233 |     print('bacc:',bacc)
234 |     print('pre:',pre)
235 |     print('rec:',rec)
236 |     print('f1:',f1)
237 |     print('mcc:',mcc)
238 |     print('sp:',sp)
239 |     print('q_:',q_)
240 |     print('acc:',acc)
241 |     print('auc:',AUC)
242 | 


--------------------------------------------------------------------------------
/Discussion/CNN.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Sun Nov 15 10:40:57 2020
  4 | 
  5 | @author: de''
  6 | """
  7 | 
  8 | import json
  9 | import numpy as np
 10 | import math
 11 | from tqdm import tqdm
 12 | from scipy import sparse
 13 | from sklearn.metrics import roc_auc_score,roc_curve,auc
 14 | from sklearn.metrics import confusion_matrix
 15 | 
 16 | import pandas as pd
 17 | import matplotlib.pyplot as plt
 18 | from rdkit import Chem
 19 | 
 20 | from sklearn.model_selection import train_test_split
 21 | from sklearn.preprocessing import MinMaxScaler
 22 | from sklearn.metrics import mean_absolute_error, r2_score,median_absolute_error
 23 | from tensorflow.keras.models import Model, load_model
 24 | from tensorflow.keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, concatenate
 25 | from tensorflow.keras import metrics, optimizers
 26 | from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
 27 | 
 28 | def split_smiles(smiles, kekuleSmiles=True):
 29 |     try:
 30 |         mol = Chem.MolFromSmiles(smiles)
 31 |         smiles = Chem.MolToSmiles(mol, kekuleSmiles=kekuleSmiles)
 32 |     except:
 33 |         pass
 34 |     splitted_smiles = []
 35 |     for j, k in enumerate(smiles):
 36 |         if len(smiles) == 1:
 37 |             return [smiles]
 38 |         if j == 0:
 39 |             if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
 40 |                 splitted_smiles.append(k + smiles[j + 1])
 41 |             else:
 42 |                 splitted_smiles.append(k)
 43 |         elif j != 0 and j < len(smiles) - 1:
 44 |             if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
 45 |                 splitted_smiles.append(k + smiles[j + 1])
 46 |             elif k.islower() and smiles[j - 1].isupper() and k != "c":
 47 |                 pass
 48 |             else:
 49 |                 splitted_smiles.append(k)
 50 | 
 51 |         elif j == len(smiles) - 1:
 52 |             if k.islower() and smiles[j - 1].isupper() and k != "c":
 53 |                 pass
 54 |             else:
 55 |                 splitted_smiles.append(k)
 56 |     return splitted_smiles
 57 | 
 58 | def get_maxlen(all_smiles, kekuleSmiles=True):
 59 |     maxlen = 0
 60 |     for smi in tqdm(all_smiles):
 61 |         spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
 62 |         if spt is None:
 63 |             continue
 64 |         maxlen = max(maxlen, len(spt))
 65 |     return maxlen
 66 | def get_dict(all_smiles, save_path, kekuleSmiles=True):
 67 |     words = [' ']
 68 |     for smi in tqdm(all_smiles):
 69 |         spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
 70 |         if spt is None:
 71 |             continue
 72 |         for w in spt:
 73 |             if w in words:
 74 |                 continue
 75 |             else:
 76 |                 words.append(w)
 77 |     with open(save_path, 'w') as js:
 78 |         json.dump(words, js)
 79 |     return words
 80 | 
 81 | def one_hot_coding(smi, words, kekuleSmiles=True, max_len=1000):
 82 |     coord_j = []
 83 |     coord_k = []
 84 |     spt = split_smiles(smi, kekuleSmiles=kekuleSmiles)
 85 |     if spt is None:
 86 |         return None
 87 |     for j,w in enumerate(spt):
 88 |         if j >= max_len:
 89 |             break
 90 |         try:
 91 |             k = words.index(w)
 92 |         except:
 93 |             continue
 94 |         coord_j.append(j)
 95 |         coord_k.append(k)
 96 |     data = np.repeat(1, len(coord_j))
 97 |     output = sparse.csr_matrix((data, (coord_j, coord_k)), shape=(max_len, len(words)))
 98 |     return output
 99 | def split_dataset(dataset, ratio):
100 |     """Shuffle and split a dataset."""
101 |    # np.random.seed(111)  # fix the seed for shuffle.
102 |     #np.random.shuffle(dataset)
103 |     n = int(ratio * len(dataset))
104 |     return dataset[:n], dataset[n:]
105 | def edit_dataset(drug,non_drug,task):
106 |   #  np.random.seed(111)  # fix the seed for shuffle.
107 | 
108 |    # np.random.shuffle(non_drug)
109 |     non_drug=non_drug[0:len(drug)]
110 |        
111 | 
112 |       #  np.random.shuffle(non_drug)
113 |    # np.random.shuffle(drug)
114 |     dataset_train_drug, dataset_test_drug = split_dataset(drug, 0.9)
115 |    # dataset_train_drug,dataset_dev_drug =  split_dataset(dataset_train_drug, 0.9)
116 |     dataset_train_no, dataset_test_no = split_dataset(non_drug, 0.9)
117 |    # dataset_train_no,dataset_dev_no =  split_dataset(dataset_train_no, 0.9)
118 |     dataset_train =  pd.concat([dataset_train_drug,dataset_train_no], axis=0)
119 |     dataset_test=pd.concat([ dataset_test_drug,dataset_test_no], axis=0)
120 |   #  dataset_dev = dataset_dev_drug+dataset_dev_no
121 |     return dataset_train, dataset_test
122 | if __name__ == "__main__":
123 |     data_train= pd.read_csv('E:/code/drug/drugnn/data_train.csv')
124 |     data_test=pd.read_csv('E:/code/drug/drugnn/bro5.csv')
125 |     inchis = list(data_train['SMILES'])
126 |     rts = list(data_train['type'])
127 |     
128 |     smiles, targets = [], []
129 |     for i, inc in enumerate(tqdm(inchis)):
130 |         mol = Chem.MolFromSmiles(inc)
131 |         if mol is None:
132 |             continue
133 |         else:
134 |             smi = Chem.MolToSmiles(mol)
135 |             smiles.append(smi)
136 |             targets.append(rts[i])
137 |             
138 |     words = get_dict(smiles, save_path='E:\code\FingerID Reference\drug-likeness/dict.json')
139 |     
140 |     features = []
141 |     for i, smi in enumerate(tqdm(smiles)):
142 |         xi = one_hot_coding(smi, words, max_len=600)
143 |         if xi is not None:
144 |             features.append(xi.todense())
145 |     features = np.asarray(features)
146 |     targets = np.asarray(targets)
147 |     X_train=features
148 |     Y_train=targets
149 |       
150 |     import tensorflow as tf
151 |    # physical_devices = tf.config.experimental.list_physical_devices('CPU') 
152 |    # assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
153 |   #  tf.config.experimental.set_memory_growth(physical_devices[0], True)
154 |     
155 |     
156 |   
157 |     inchis = list(data_test['SMILES'])
158 |     rts = list(data_test['type'])
159 |     
160 |     smiles, targets = [], []
161 |     for i, inc in enumerate(tqdm(inchis)):
162 |         mol = Chem.MolFromSmiles(inc)
163 |         if mol is None:
164 |             continue
165 |         else:
166 |             smi = Chem.MolToSmiles(mol)
167 |             smiles.append(smi)
168 |             targets.append(rts[i])
169 |             
170 |    # words = get_dict(smiles, save_path='D:/工作文件/work.Data/CNN/dict.json')
171 |     
172 |     features = []
173 |     for i, smi in enumerate(tqdm(smiles)):
174 |         xi = one_hot_coding(smi, words, max_len=600)
175 |         if xi is not None:
176 |             features.append(xi.todense())
177 |     features = np.asarray(features)
178 |     targets = np.asarray(targets)
179 |     X_test=features
180 |     Y_test=targets
181 |     layer_in = Input(shape=(X_train.shape[1:3]), name="smile")
182 |     layer_conv = layer_in
183 |     for i in range(6):
184 |             layer_conv = Conv1D(128, kernel_size=4, activation='relu', kernel_initializer='normal')(layer_conv)
185 |             layer_conv = MaxPooling1D(pool_size=2)(layer_conv)
186 |     layer_dense = Flatten()(layer_conv)
187 |             
188 |     for i in range(1):
189 |         layer_dense = Dense(32, activation="relu", kernel_initializer='normal')(layer_dense)
190 |     layer_output = Dense(1, activation="sigmoid", name="output")(layer_dense)
191 |     
192 |    # earlyStopping = EarlyStopping(monitor='val_loss', patience=0.05, verbose=0, mode='min')
193 |     #mcp_save = ModelCheckpoint('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5', save_best_only=True, monitor='accuracy', mode='auto')
194 |   #  reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, epsilon=1e-4, mode='min')
195 |     
196 |     model = Model(layer_in, outputs = layer_output) 
197 |     opt = optimizers.Adam(lr=0.0005)
198 |     model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
199 |     from tensorflow.keras import backend as K #转换为张量
200 |     X_train = K.cast_to_floatx(X_train)
201 |     Y_train = K.cast_to_floatx(Y_train)
202 |     history = model.fit(X_train, Y_train, epochs=12)
203 | 
204 |     # plot loss
205 |     plt.plot(history.history['loss'])
206 |     plt.plot(history.history['accuracy'])
207 |   #  plt.plot(history.history['val_loss'])
208 |  #   plt.plot(history.history['val_accuracy'])
209 |     plt.ylabel('values')
210 |     plt.xlabel('epoch')
211 |  #   plt.legend(['loss', 'mae', 'val_loss', 'val_mae'], loc='upper left')
212 |     plt.show()
213 |  #   model = load_model('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/single_model.h5')
214 |     Y_predict = model.predict(K.cast_to_floatx(X_test))
215 |      #Y_predict = model.predict(X_test)#训练数据
216 |     x = list(Y_test)
217 |     y = list(Y_predict)
218 |     from pandas.core.frame import DataFrame   
219 |     x=DataFrame(x)
220 |     y=DataFrame(y)
221 |   #  X= pd.concat([x,y], axis=1)
222 |     #X.to_csv('C:/Users/sunjinyu/Desktop/FingerID Reference/drug-likeness/CNN/molecularGNN_smiles-master/0825/single-CNN-seed444.csv')
223 |     Y_predict = [1 if i >0.4 else 0 for i in Y_predict]
224 | 
225 |     cnf_matrix=confusion_matrix(Y_test, Y_predict)
226 |     cnf_matrix
227 |     
228 |     tn = cnf_matrix[0,0]
229 |     tp = cnf_matrix[1,1]
230 |     fn = cnf_matrix[1,0]
231 |     fp = cnf_matrix[0,1]
232 |     
233 |     bacc = ((tp/(tp+fn))+(tn/(tn+fp)))/2#balance accurance
234 |     pre = tp/(tp+fp)#precision/q+
235 |     rec = tp/(tp+fn)#recall/se
236 |     sp=tn/(tn+fp)
237 |     q_=tn/(tn+fn)
238 |     f1 = 2*pre*rec/(pre+rec)#f1score
239 |     mcc = ((tp*tn) - (fp*fn))/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))#Matthews correlation coefficient
240 |     acc=(tp+tn)/(tp+fp+fn+tn)#accurancy
241 |     fpr, tpr, thresholds =roc_curve(Y_test, Y_predict)
242 |     AUC = auc(fpr, tpr)
243 |     print('bacc:',bacc)
244 |     print('pre:',pre)
245 |     print('rec:',rec)
246 |     print('f1:',f1)
247 |     print('mcc:',mcc)
248 |     print('sp:',sp)
249 |     print('q_:',q_)
250 |     print('acc:',acc)
251 |     print('auc:',AUC)


--------------------------------------------------------------------------------
/DGCAN/DGCAN.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Wed Apr 27 18:04:58 2022
  4 | 
  5 | @author:Jinyu-Sun
  6 | """
  7 | 
  8 | #coding=utf-8 
  9 | import timeit
 10 | import sys
 11 | import numpy as np
 12 | import math
 13 | import torch
 14 | import torch.nn as nn
 15 | import torch.nn.functional as F
 16 | import torch.optim as optim
 17 | import pickle
 18 | from sklearn.metrics import roc_auc_score, roc_curve
 19 | from sklearn.metrics import confusion_matrix
 20 | import preprocess as pp
 21 | import pandas as pd
 22 | import matplotlib.pyplot as plt
 23 | 
 24 | if torch.cuda.is_available():
 25 |     device = torch.device('cuda')
 26 |    
 27 | else:
 28 |     device = torch.device('cpu')
 29 |     
 30 | torch.cuda.empty_cache()
 31 | class GraphAttentionLayer(nn.Module):
 32 |     def __init__(self, in_features, out_features, dropout, alpha, concat=True):
 33 |         super(GraphAttentionLayer, self).__init__()
 34 |         self.dropout = dropout 
 35 |         self.concat = concat
 36 |         self.in_features = in_features   #dim of input feature
 37 |         self.out_features = out_features #dim of output feature
 38 |         self.alpha = alpha               # negative_slope leakyrelu
 39 |         self.W = nn.Parameter(torch.zeros(size=(in_features, out_features))) 
 40 |         
 41 |         self.a = nn.Parameter(torch.zeros(size=(2 * out_features, 1))) 
 42 |         torch.nn.init.xavier_uniform_(self.W , gain=2.0)
 43 |         #torch.nn.init.kaiming_uniform_(self.a, a=0, mode='fan_in', nonlinearity='leaky_relu')  
 44 |         torch.nn.init.xavier_uniform_(self.W , gain=1.9)
 45 |         self.leakyrelu = nn.LeakyReLU(self.alpha)
 46 | 
 47 |     def forward(self, input, adj):   
 48 |         """
 49 |         input: input_feature [N, in_features] in_features indicates the number of elements of the input feature vector of the node
 50 |         adj: adjacency matrix of the graph dimension [N, N] non-zero is one, data structure basics
 51 |         """
 52 |         h = torch.mm(input, self.W)    # [N, out_features]
 53 |         N = h.size()[0]     #Number of nodes of the graph
 54 |         a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * self.out_features)     # [N, N, 2*out_features]
 55 |         e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))
 56 |         zero_vec =-9e10 *torch.ones_like(e)
 57 |         attention = torch.where(adj > 0, e, zero_vec)    
 58 |         # indicates that if the adjacency matrix element is greater than 0, then the two nodes are connected and the attention factor at that position is retained.
 59 |         # Otherwise it is necessary to mask and set to a very small value, the reason is that this minimum value will be disregarded during softmax.
 60 |         attention = F.softmax(attention, dim=1)
 61 |         attention = F.dropout(attention, self.dropout, training=self.training)
 62 |         h_prime = torch.matmul(attention, h)
 63 |         if self.concat:
 64 |             return F.elu(h_prime)  
 65 |         else:
 66 |             return h_prime
 67 | 
 68 | 
 69 | class GAT(nn.Module):
 70 |     def __init__(self, nfeat, nhid, dropout, alpha, nheads):
 71 |         super(GAT, self).__init__()
 72 |         """
 73 |         n_heads indicates that there are several GAL layers, which are finally stitched together, similar to self-attention
 74 |         to extract features from different subspaces.
 75 |         """
 76 |         self.dropout = dropout
 77 |         self.attentions = [GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True) for _ in
 78 |                            range(nheads)]
 79 |         for i, attention in enumerate(self.attentions):
 80 |             self.add_module('attention_{}'.format(i), attention)
 81 | 
 82 |         self.out_att = GraphAttentionLayer(nhid,56, dropout=dropout, alpha=alpha, concat=False)
 83 |         self.nheads=nheads
 84 | 
 85 |     def forward(self, x, adj):
 86 |         x = F.dropout(x, self.dropout, training=self.training)
 87 |         #x = torch.cat([att(x, adj) for att in self.attentions], dim=1)
 88 |         
 89 |         z=torch.zeros_like(self.attentions[1](x, adj))
 90 |         for att in self.attentions:
 91 |             z=torch.add(z, att(x, adj))
 92 |         x = z/self.nheads 
 93 |         x = F.dropout(x, self.dropout, training=self.training)
 94 |         x = F.elu(self.out_att(x, adj))
 95 |         return F.softmax(x, dim=1)
 96 | 
 97 | class MolecularGraphNeuralNetwork(nn.Module):
 98 |     def __init__(self, N_fingerprints, dim, layer_hidden, layer_output, dropout):
 99 |         super(MolecularGraphNeuralNetwork, self).__init__()
100 |         self.layer_hidden=layer_hidden
101 |         self.layer_output=layer_output
102 |         self.embed_fingerprint = nn.Embedding(N_fingerprints, dim)
103 |         self.W_fingerprint = nn.ModuleList([nn.Linear(dim, dim) for _ in range(layer_hidden)])
104 | 
105 |         self.W_output = nn.ModuleList([nn.Linear(56,56) for _ in range(layer_output)])
106 |         self.W_property = nn.Linear(56, 2)
107 |        
108 |         self.dropout = dropout
109 |         self.alpha = 0.25
110 |         self.nheads = 2
111 |         self.attentions = GAT(dim, dim, dropout, alpha=self.alpha, nheads=self.nheads).to(device)
112 | 
113 |     def pad(self, matrices, pad_value):
114 |         """Pad the list of matrices
115 |         with a pad_value (e.g., 0) for batch processing.
116 |         For example, given a list of matrices [A, B, C],
117 |         we obtain a new matrix [A00, 0B0, 00C],
118 |         where 0 is the zero (i.e., pad value) matrix.
119 |         """
120 |         shapes = [m.shape for m in matrices]
121 |         M, N = sum([s[0] for s in shapes]), sum([s[1] for s in shapes])
122 |         zeros = torch.FloatTensor(np.zeros((M, N))).to(device)
123 |         pad_matrices = pad_value + zeros
124 |         i, j = 0, 0
125 |         for k, matrix in enumerate(matrices):
126 |             m, n = shapes[k]
127 |             pad_matrices[i:i + m, j:j + n] = matrix
128 |             i += m
129 |             j += n
130 |         return pad_matrices
131 | 
132 |     def update(self, matrix, vectors, layer):
133 |         hidden_vectors = torch.relu(self.W_fingerprint[layer](vectors))
134 | 
135 |         return hidden_vectors + torch.matmul(matrix, hidden_vectors)
136 | 
137 |     def sum(self, vectors, axis):
138 |         sum_vectors = [torch.sum(v, 0) for v in torch.split(vectors, axis)]
139 |         return torch.stack(sum_vectors)
140 | 
141 |     def gnn(self, inputs):
142 |         """Cat or pad each input data for batch processing."""
143 |         Smiles, fingerprints, adjacencies, molecular_sizes = inputs
144 |         fingerprints = torch.cat(fingerprints)
145 |         adj = self.pad(adjacencies, 0)
146 |         """GNN layer (update the fingerprint vectors)."""
147 |         fingerprint_vectors = self.embed_fingerprint(fingerprints)
148 | 
149 |         for l in range(self.layer_hidden):
150 |             hs = self.update(adj, fingerprint_vectors, l)
151 |             fingerprint_vectors = F.normalize(hs, 2, 1)
152 |         """Attention layer"""
153 |         molecular_vectors = self.attentions(fingerprint_vectors, adj)
154 |         """Molecular vector by sum or mean of the fingerprint vectors."""
155 |         molecular_vectors = self.sum(molecular_vectors, molecular_sizes)
156 |         return Smiles, molecular_vectors
157 | 
158 |     def mlp(self, vectors):
159 |         """Regressor based on multilayer perceptron."""
160 |         for l in range(self.layer_output):
161 |         
162 |             vectors = torch.relu(self.W_output[l](vectors))
163 |         outputs = torch.sigmoid(self.W_property(vectors))
164 |         return outputs
165 | 
166 |     def forward_classifier(self, data_batch, train):
167 | 
168 |         inputs = data_batch[:-1]
169 |         correct_labels = torch.cat(data_batch[-1])
170 | 
171 |         if train:
172 |             Smiles, molecular_vectors = self.gnn(inputs)
173 |             predicted_scores = self.mlp(molecular_vectors)
174 |             '''loss function'''
175 |             loss = F.cross_entropy(predicted_scores, correct_labels)
176 |             predicted_scores = predicted_scores.to('cpu').data.numpy()
177 |             predicted_scores = [s[1] for s in predicted_scores]
178 |             correct_labels = correct_labels.to('cpu').data.numpy()
179 |             return Smiles,loss, predicted_scores, correct_labels
180 |         else:
181 |             with torch.no_grad():
182 |                 Smiles, molecular_vectors = self.gnn(inputs)
183 |                 predicted_scores = self.mlp(molecular_vectors)
184 |                 loss = F.cross_entropy(predicted_scores, correct_labels)
185 |             predicted_scores = predicted_scores.to('cpu').data.numpy()
186 |             predicted_scores = [s[1] for s in predicted_scores]
187 |             correct_labels = correct_labels.to('cpu').data.numpy()
188 | 
189 |             return Smiles, loss, predicted_scores, correct_labels
190 | 
191 | 
192 | class Trainer(object):
193 |     def __init__(self, model,lr,batch_train):
194 |         self.model = model
195 |         self.batch_train=batch_train
196 |         self.lr=lr
197 |         self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
198 | 
199 |     def train(self, dataset):
200 |         np.random.shuffle(dataset)
201 |         N = len(dataset)
202 |         loss_total = 0
203 |         SMILES,P, C = '',[], []
204 |         for i in range(0, N, self.batch_train):
205 |             data_batch = list(zip(*dataset[i:i + self.batch_train]))
206 |             Smiles,loss, predicted_scores, correct_labels = self.model.forward_classifier(data_batch, train=True)
207 |             SMILES += ' '.join(Smiles) + ' '
208 |             P.append(predicted_scores)
209 |             C.append(correct_labels)
210 |             self.optimizer.zero_grad()
211 |             loss.backward()
212 |             self.optimizer.step()
213 |             loss_total += loss.item()
214 |         tru = np.concatenate(C)
215 |         pre = np.concatenate(P)
216 |         AUC = roc_auc_score(tru, pre)
217 |         SMILES = SMILES.strip().split()
218 |         pred = [1 if i > 0.15 else 0 for i in pre]
219 |         predictions = np.stack((tru, pred, pre))
220 |         return AUC, loss_total, predictions
221 | 
222 | 
223 | class Tester(object):
224 |     def __init__(self, model,batch_test):
225 |         self.model = model
226 |         self.batch_test=batch_test
227 |     def test_classifier(self, dataset):
228 |         N = len(dataset)
229 |         loss_total = 0
230 |         SMILES, P, C = '', [], []
231 |         for i in range(0, N, self.batch_test):
232 |             data_batch = list(zip(*dataset[i:i + self.batch_test]))
233 |             (Smiles, loss, predicted_scores, correct_labels) = self.model.forward_classifier(
234 |                 data_batch, train=False)
235 |             SMILES += ' '.join(Smiles) + ' '
236 |             loss_total += loss.item()
237 |             P.append(predicted_scores)
238 |             C.append(correct_labels)
239 |         SMILES = SMILES.strip().split()
240 |         tru = np.concatenate(C)
241 |         pre = np.concatenate(P)
242 |         pred = [1 if i >0.15 else 0 for i in pre]
243 |         #AUC = roc_auc_score(tru, pre)
244 |         cnf_matrix=confusion_matrix(tru,pred)
245 |         tn = cnf_matrix[0, 0]
246 |         tp = cnf_matrix[1, 1]
247 |         fn = cnf_matrix[1, 0]
248 |         fp = cnf_matrix[0, 1]
249 |         acc = (tp + tn) / (tp + fp + fn + tn)
250 |         #  Tru=map(str,np.concatenate(C))
251 |         #  Pre=map(str,np.concatenate(P))
252 |         #  predictions = '\n'.join(['\t'.join(x) for x in zip(SMILES, Tru, Pre)])
253 |         predictions = np.stack((tru, pred, pre))
254 |         return acc, loss_total, predictions
255 | 
256 |     def save_result(self, result, filename):
257 |         with open(filename, 'a') as f:
258 |             f.write(result + '\n')
259 | 
260 |     def save_predictions(self, predictions, filename):
261 |         with open(filename, 'w') as f:
262 |             f.write('Smiles\tCorrect\tPredict\n')
263 |             f.write(predictions + '\n')
264 | 
265 |     def save_model(self, model, filename):
266 |         torch.save(model.state_dict(), filename)
267 | 
268 | def dump_dictionary(dictionary, filename):
269 |     with open('../DGCAN/model'+filename, 'wb') as f:
270 |         pickle.dump(dict(dictionary), f)
271 | 
272 | 
273 | 


--------------------------------------------------------------------------------
/DGCAN/results/AUC.txt:
--------------------------------------------------------------------------------
  1 | Epoch	Time(sec)	Loss_train	Loss_test	AUC_train	AUC_test
  2 | 1	7.2395853999999815	318.02376973629	33.23613902926445	0.6330387783115992	0.5
  3 | 2	13.431995	275.59704649448395	28.36697283387184	0.7726390395642837	0.7757009345794392
  4 | 3	19.53742679999999	258.0953543186188	26.692712754011154	0.8227923594659818	0.7827102803738317
  5 | 4	25.411535499999985	244.29262351989746	25.99587020277977	0.8555440089003034	0.8247663551401869
  6 | 5	31.270511900000002	235.61571648716927	26.206634640693665	0.8711811167445258	0.8271028037383178
  7 | 6	37.1334444	235.10905063152313	24.74921104311943	0.8782201330617486	0.8504672897196262
  8 | 7	43.15194819999999	230.60763642191887	24.482909947633743	0.8858224754798858	0.8644859813084113
  9 | 8	49.21481349999999	225.9473716020584	25.168260991573334	0.894445889698231	0.8738317757009346
 10 | 9	55.39354119999999	220.88472372293472	23.143073588609695	0.9038094737308211	0.8808411214953271
 11 | 10	61.388609	220.29008296132088	23.10640263557434	0.9080117951159034	0.8925233644859814
 12 | 11	67.3816218	220.04156962037086	23.67304638028145	0.905873895764607	0.8714953271028038
 13 | 12	73.3649585	214.85031658411026	23.34747040271759	0.9159177330794608	0.8551401869158879
 14 | 13	79.32801410000002	212.33444252610207	23.14932319521904	0.9178444716275156	0.8714953271028038
 15 | 14	85.31854019999997	211.54040449857712	22.778073489665985	0.9219235005645715	0.8901869158878505
 16 | 15	91.33987619999999	208.26400744915009	22.901916056871414	0.9267551530985012	0.8995327102803738
 17 | 16	98.36570660000001	209.3945328295231	23.913705557584763	0.9246417461863752	0.8878504672897196
 18 | 17	104.39726789999997	206.03158766031265	23.282782286405563	0.930114906901056	0.8901869158878505
 19 | 18	110.4006412	207.53857171535492	22.225304275751114	0.9226543992295261	0.9018691588785047
 20 | 19	116.81378369999999	204.79183167219162	23.475462794303894	0.926265719441185	0.8785046728971962
 21 | 20	122.6911399	205.36031165719032	22.78501933813095	0.9291473863661524	0.8878504672897196
 22 | 21	128.57552929999997	202.3321330845356	23.385528802871704	0.9302150906635376	0.8855140186915887
 23 | 22	134.4078144	202.55410113930702	23.08760157227516	0.9293607611751943	0.8925233644859814
 24 | 23	140.28498439999998	198.95897144079208	22.36356022953987	0.9377836694932141	0.8948598130841121
 25 | 24	146.0898661	197.13710144162178	23.3654263317585	0.9351247869019417	0.8785046728971962
 26 | 25	151.9243611	256.36723348498344	22.31619429588318	0.7880461675559591	0.8901869158878505
 27 | 26	157.7956577	199.1333883702755	22.19395723938942	0.9381437221865521	0.9018691588785047
 28 | 27	163.5824619	195.61116680502892	21.885735362768173	0.9357744592290832	0.8995327102803738
 29 | 28	169.4125838	196.21020331978798	21.808892458677292	0.937718217946731	0.9065420560747663
 30 | 29	175.33045549999997	196.93134278059006	22.267054110765457	0.9385652135408595	0.897196261682243
 31 | 30	181.20051949999998	195.89555063843727	22.040870487689972	0.9386104622844113	0.8995327102803738
 32 | 31	187.0336099	194.0237057507038	22.781775504350662	0.9417760754533178	0.8714953271028038
 33 | 32	192.90557769999998	193.68072113394737	22.449314266443253	0.9423293001527663	0.8948598130841121
 34 | 33	198.73298769999997	192.5338954925537	22.377066612243652	0.9452480516748955	0.8785046728971962
 35 | 34	204.81813569999997	192.58278796076775	23.285291463136673	0.9402174153696283	0.8808411214953271
 36 | 35	210.6708656	196.01435166597366	24.061037868261337	0.9359749651294087	0.8691588785046729
 37 | 36	216.5086068	193.9636361002922	22.313345968723297	0.936061864857086	0.8901869158878505
 38 | 37	222.3077349	192.51033294200897	22.285043627023697	0.9435566896185268	0.9042056074766355
 39 | 38	228.07614589999997	188.01407945156097	22.830698162317276	0.94882975955897	0.8995327102803738
 40 | 39	233.88910299999998	193.91294729709625	22.711496233940125	0.9402640478668054	0.8808411214953271
 41 | 40	239.76910729999997	192.2110168337822	21.79123494029045	0.9453036785706378	0.9042056074766355
 42 | 41	245.54325319999998	189.3926584124565	23.183754086494446	0.9464263178869528	0.8925233644859814
 43 | 42	251.29193049999998	197.67854461073875	24.210958123207092	0.9356472922709057	0.8644859813084113
 44 | 43	257.07465279999997	195.8016073703766	22.462971657514572	0.9351026468439347	0.897196261682243
 45 | 44	262.8480727	191.97943636775017	23.224840223789215	0.940738813735692	0.8878504672897196
 46 | 45	268.5777391	190.848837941885	22.82283341884613	0.9461044567936768	0.897196261682243
 47 | 46	274.3567221	190.04618108272552	22.433310955762863	0.9440021199105542	0.897196261682243
 48 | 47	280.1616897	190.5216095149517	22.426137387752533	0.9457633615250072	0.9042056074766355
 49 | 48	285.9260854	185.92078268527985	22.22221177816391	0.9480603925432284	0.8995327102803738
 50 | 49	291.66768609999997	187.782156676054	22.887968957424164	0.94423528239644	0.8901869158878505
 51 | 50	297.4925078	187.28414443135262	21.483285009860992	0.9458896982310093	0.9088785046728972
 52 | 51	303.2552989	185.18417713046074	21.38184556365013	0.9481489527752565	0.9205607476635514
 53 | 52	309.0365314	182.16105404496193	24.673764526844025	0.9509743009276684	0.8714953271028038
 54 | 53	314.8158657	188.7527618408203	23.513393253087997	0.9457229559191445	0.8878504672897196
 55 | 54	320.5944295	185.709531635046	21.631520986557007	0.9463949066796555	0.9135514018691588
 56 | 55	326.40547119999997	185.12931755185127	22.429152816534042	0.944120984346979	0.9018691588785047
 57 | 56	332.1655878	182.88407680392265	21.58257967233658	0.9498509697345405	0.9135514018691588
 58 | 57	337.97326309999994	182.04424741864204	21.475889027118683	0.9525705991099698	0.9182242990654206
 59 | 58	343.78313679999997	182.934487760067	21.883195608854294	0.949658766355968	0.9088785046728972
 60 | 59	349.55780849999996	184.17358297109604	21.290808767080307	0.9471534804171187	0.9182242990654206
 61 | 60	355.38770680000005	181.42354640364647	21.597694754600525	0.949361674452587	0.9135514018691588
 62 | 61	361.28093179999996	187.25566163659096	21.6785786151886	0.9452393340270551	0.9065420560747663
 63 | 62	367.0356904	181.59250125288963	21.666670441627502	0.9521657127991676	0.9112149532710281
 64 | 63	372.82640919999994	179.8839019536972	22.01644539833069	0.9538880709367459	0.9065420560747663
 65 | 64	378.58320630000003	182.93770709633827	22.33838379383087	0.9484886642903003	0.8995327102803738
 66 | 65	384.42532470000003	181.58496183156967	22.23741576075554	0.950153319901698	0.8995327102803738
 67 | 66	390.227352	182.49673774838448	21.934344708919525	0.9507491642128103	0.9042056074766355
 68 | 67	396.04737980000004	180.1727076768875	22.335491836071014	0.9534794484911551	0.9042056074766355
 69 | 68	401.8080486	182.3468733727932	21.559545934200287	0.953210031660283	0.9158878504672897
 70 | 69	407.56943720000004	177.6970148384571	21.813909739255905	0.9542144984169858	0.9158878504672897
 71 | 70	413.3951455	179.4230616092682	21.47458705306053	0.9503143888236987	0.9182242990654206
 72 | 71	419.26894319999997	187.4236896932125	21.997405976057053	0.9434714503952	0.9112149532710281
 73 | 72	425.0120637	185.64457353949547	21.80859535932541	0.9493346912568912	0.9112149532710281
 74 | 73	430.8028591	183.28448390960693	22.7678345143795	0.942662231275046	0.8995327102803738
 75 | 74	436.5995785	181.35295176506042	21.99883532524109	0.9524445391546925	0.9088785046728972
 76 | 75	442.4056693	180.42559936642647	22.20555028319359	0.9545189242145815	0.9065420560747663
 77 | 76	448.17074230000003	177.4391260445118	21.684407979249954	0.9532330019704651	0.9088785046728972
 78 | 77	453.95868340000004	187.34195244312286	22.080274641513824	0.9482340536232203	0.9042056074766355
 79 | 78	459.8026899	184.663908213377	22.014076620340347	0.9470819403546837	0.9088785046728972
 80 | 79	465.5365296	178.98830798268318	21.47413921356201	0.9541025527486882	0.9158878504672897
 81 | 80	471.3356651	178.3373854458332	21.658688694238663	0.9525628500896672	0.9088785046728972
 82 | 81	477.18991429999994	176.8597036600113	22.024581998586655	0.9535130737042532	0.9065420560747663
 83 | 82	483.0233918	177.2030012011528	21.766023725271225	0.957143212965218	0.9135514018691588
 84 | 83	488.7872165	176.38141465187073	21.79708757996559	0.9572966712422787	0.9112149532710281
 85 | 84	494.52968880000003	174.46399101614952	21.843281388282776	0.956292066110213	0.9112149532710281
 86 | 85	500.3899477	175.65917918086052	21.409487038850784	0.9559244027719352	0.9182242990654206
 87 | 86	506.2031879	176.77976202964783	21.49503728747368	0.9538743717758542	0.9228971962616822
 88 | 87	512.0002034	179.85141596198082	21.427332252264023	0.9514658102154229	0.9158878504672897
 89 | 88	517.7809181	178.52282038331032	21.403560250997543	0.9539066132353267	0.9158878504672897
 90 | 89	523.6232049	177.05544209480286	21.411171078681946	0.9535158412115041	0.9135514018691588
 91 | 90	529.4495936	176.31908676028252	21.366370409727097	0.955243734363584	0.9228971962616822
 92 | 91	535.3322216	176.20382365584373	21.893729746341705	0.9558952055704386	0.9088785046728972
 93 | 92	541.1628488	175.1233125925064	22.482848435640335	0.9574447328802002	0.9042056074766355
 94 | 93	547.003017	176.93210792541504	21.549375027418137	0.9527253027652932	0.9135514018691588
 95 | 94	552.8402735	173.00296890735626	21.5932075381279	0.9579943598202227	0.9205607476635514
 96 | 95	558.6800595	179.9282302260399	23.50808882713318	0.9537645017379945	0.8878504672897196
 97 | 96	564.5009782	174.56020081043243	22.648311734199524	0.9556187315960767	0.8995327102803738
 98 | 97	570.3044254	176.7171704173088	21.891200184822083	0.9524900646489693	0.9088785046728972
 99 | 98	576.1318849	178.38612964749336	22.245244562625885	0.9540746009254544	0.9088785046728972
100 | 99	581.9812099	177.73075929284096	24.06598174571991	0.9527554685943277	0.8551401869158879
101 | 100	587.8207973	177.71725061535835	23.09346652030945	0.9563492151349435	0.8925233644859814
102 | 101	593.7006995	173.766254901886	22.383565932512283	0.9588301470099851	0.8995327102803738
103 | 102	599.5459518	173.67894527316093	21.84225881099701	0.9581018774769188	0.9112149532710281
104 | 103	605.2735841	175.92625331878662	22.260210156440735	0.956732791639914	0.9065420560747663
105 | 104	611.1396088	174.88757956027985	21.68474268913269	0.95813674806828	0.9158878504672897
106 | 105	616.9215246	176.1177335381508	22.008816480636597	0.9573001306263422	0.9065420560747663
107 | 106	622.727265	174.49301874637604	21.83292892575264	0.9588928310492174	0.9088785046728972
108 | 107	628.5301017	172.7093889117241	21.64242872595787	0.9596258053446101	0.9135514018691588
109 | 108	634.310622	177.16424638032913	21.55477637052536	0.9525133117098767	0.9158878504672897
110 | 109	640.0827119	178.4102607667446	21.33096119761467	0.9554701164567051	0.9158878504672897
111 | 110	645.8604367	174.0306807756424	21.91256058216095	0.9572584796422166	0.9135514018691588
112 | 111	651.6639703	174.55561447143555	21.62583690881729	0.9593071268846725	0.9158878504672897
113 | 112	657.5478062	172.49658674001694	22.11896824836731	0.9582462029800518	0.9112149532710281
114 | 113	663.3088726999999	173.48215851187706	21.262643307447433	0.9584646976775077	0.9205607476635514
115 | 114	669.1280201	174.29942700266838	21.171885669231415	0.9568379569154472	0.9205607476635514
116 | 115	674.9632922	171.870591878891	21.214154481887817	0.9590825436712644	0.9205607476635514
117 | 116	680.7173928	176.74994710087776	22.71759131550789	0.9544714614652291	0.8901869158878505
118 | 117	686.4742894999999	184.73798117041588	21.845041394233704	0.9432161478513075	0.9112149532710281
119 | 118	692.2357881	177.2356958091259	21.59891825914383	0.9571596796333606	0.9158878504672897
120 | 119	698.004266	176.63044354319572	21.46969723701477	0.9547083600859034	0.9158878504672897
121 | 120	703.704268	172.92364439368248	21.97585704922676	0.9586079161777404	0.9065420560747663
122 | 121	709.4959497	180.26964315772057	21.951832473278046	0.948673395399296	0.9088785046728972
123 | 122	715.3568411	173.81644931435585	21.459405571222305	0.9592527453671928	0.9205607476635514
124 | 123	721.1472172	176.33700492978096	21.565876573324203	0.9538397779352181	0.9205607476635514
125 | 124	726.9662815	172.69166892766953	21.18627032637596	0.9598193924768083	0.9205607476635514
126 | 125	732.7549476	171.38375091552734	21.528594940900803	0.9606258440897115	0.9158878504672897
127 | 126	738.5203703	171.18208953738213	22.116858184337616	0.9604675426749618	0.9088785046728972
128 | 127	744.3382166	171.14211875200272	22.178175538778305	0.9611412923151859	0.9112149532710281
129 | 128	750.1037539	169.92953234910965	21.887178242206573	0.9611555449775278	0.9112149532710281
130 | 129	755.8869365	174.96560329198837	22.6447791159153	0.9555907797728429	0.8995327102803738
131 | 130	761.6342917	178.70918104052544	22.300085812807083	0.9529475335975381	0.9042056074766355
132 | 131	767.4047745	174.0145247578621	22.113102048635483	0.9579599043549494	0.9088785046728972
133 | 132	773.1641292	172.59970355033875	21.501632899045944	0.9596043571634159	0.9182242990654206
134 | 133	779.0603237	172.19679167866707	22.107417851686478	0.9594648747979719	0.9088785046728972
135 | 134	784.8799782	173.3250037431717	23.72211918234825	0.9588380344056502	0.8761682242990654
136 | 135	790.6637747999999	173.15374860167503	22.544156223535538	0.9600561527221201	0.897196261682243
137 | 136	796.4695435	171.82129180431366	22.224039256572723	0.9606888048796689	0.9065420560747663
138 | 137	802.2469289	171.67953670024872	22.0196373462677	0.9615298503332077	0.9042056074766355
139 | 138	808.016359	172.37993958592415	22.44371086359024	0.9589339285318929	0.8995327102803738
140 | 139	813.7643964	174.99391075968742	21.463502824306488	0.9590761784045874	0.9158878504672897
141 | 140	819.5146855	174.17848363518715	21.192844033241272	0.959691256891093	0.9228971962616822
142 | 


--------------------------------------------------------------------------------
/DGCAN/predict.py:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # -*- coding: utf-8 -*-
  4 | """
  5 | Created on Wed Apr 27 20:09:31 2022
  6 | 
  7 | @author:Jinyu-Sun
  8 | """
  9 | 
 10 | import timeit
 11 | import sys
 12 | import numpy as np
 13 | import math
 14 | import torch
 15 | import torch.nn as nn
 16 | import torch.nn.functional as F
 17 | import torch.optim as optim
 18 | import pickle
 19 | from sklearn.metrics import roc_auc_score, roc_curve
 20 | from sklearn.metrics import confusion_matrix
 21 | import preprocess as pp
 22 | import pandas as pd
 23 | import matplotlib.pyplot as plt
 24 | torch.cuda.empty_cache()
 25 | if torch.cuda.is_available():
 26 |     device = torch.device('cuda')
 27 |    
 28 | else:
 29 |     device = torch.device('cpu')
 30 |     
 31 | class GraphAttentionLayer(nn.Module):
 32 |     def __init__(self, in_features, out_features, dropout, alpha, concat=True):
 33 |         super(GraphAttentionLayer, self).__init__()
 34 |         self.dropout = dropout 
 35 |         self.concat = concat
 36 |         self.in_features = in_features   #dim of input feature
 37 |         self.out_features = out_features #dim of output feature
 38 |         self.alpha = alpha               # negative_slope leakyrelu
 39 |         self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))       
 40 |         self.a = nn.Parameter(torch.zeros(size=(2 * out_features, 1)))  
 41 |         self.leakyrelu = nn.LeakyReLU(self.alpha)
 42 | 
 43 |     def forward(self, input, adj):   
 44 |         """
 45 |         input: input_feature [N, in_features] in_features indicates the number of elements of the input feature vector of the node
 46 |         adj: adjacency matrix of the graph dimension [N, N] non-zero is one, data structure basics
 47 |         """
 48 |         h = torch.mm(input, self.W)    # [N, out_features]
 49 |         N = h.size()[0]     #Number of nodes of the graph
 50 |         a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * self.out_features)     # [N, N, 2*out_features]
 51 |         e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))
 52 |         zero_vec =-9e10 *torch.ones_like(e)
 53 |         attention = torch.where(adj > 0, e, zero_vec)    
 54 |         # indicates that if the adjacency matrix element is greater than 0, then the two nodes are connected and the attention factor at that position is retained.
 55 |         # Otherwise it is necessary to mask and set to a very small value, the reason is that this minimum value will be disregarded during softmax.
 56 |         attention = F.softmax(attention, dim=1)
 57 |         attention = F.dropout(attention, self.dropout, training=self.training)
 58 |         h_prime = torch.matmul(attention, h)
 59 |         if self.concat:
 60 |             return F.elu(h_prime)  
 61 |         else:
 62 |             return h_prime
 63 | 
 64 | 
 65 | class GAT(nn.Module):
 66 |     def __init__(self, nfeat, nhid, dropout, alpha, nheads):
 67 |         super(GAT, self).__init__()
 68 |         """
 69 |         n_heads indicates that there are several GAL layers, which are finally stitched together, similar to self-attention
 70 |         to extract features from different subspaces.
 71 |         """
 72 |         self.dropout = dropout
 73 |         self.attentions = [GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True) for _ in
 74 |                            range(nheads)]
 75 |         for i, attention in enumerate(self.attentions):
 76 |             self.add_module('attention_{}'.format(i), attention)
 77 | 
 78 |         self.out_att = GraphAttentionLayer(nhid,56, dropout=dropout, alpha=alpha, concat=False)
 79 |         self.nheads=nheads
 80 | 
 81 |     def forward(self, x, adj):
 82 |         x = F.dropout(x, self.dropout, training=self.training)
 83 |         #x = torch.cat([att(x, adj) for att in self.attentions], dim=1)
 84 |         
 85 |         z=torch.zeros_like(self.attentions[1](x, adj))
 86 |         for att in self.attentions:
 87 |             z=torch.add(z, att(x, adj))
 88 |         x = z/self.nheads 
 89 |         x = F.dropout(x, self.dropout, training=self.training)
 90 |         x = F.elu(self.out_att(x, adj))
 91 |         return F.softmax(x, dim=1)
 92 | 
 93 | class MolecularGraphNeuralNetwork(nn.Module):
 94 |     def __init__(self, N_fingerprints, dim, layer_hidden, layer_output, dropout):
 95 |         super(MolecularGraphNeuralNetwork, self).__init__()
 96 |         self.layer_hidden=layer_hidden
 97 |         self.layer_output=layer_output
 98 |         self.embed_fingerprint = nn.Embedding(N_fingerprints, dim)
 99 |         self.W_fingerprint = nn.ModuleList([nn.Linear(dim, dim) for _ in range(layer_hidden)])
100 | 
101 |         self.W_output = nn.ModuleList([nn.Linear(56,56) for _ in range(layer_output)])
102 |         self.W_property = nn.Linear(56, 2)
103 |        
104 |         self.dropout = dropout
105 |         self.alpha = 0.25
106 |         self.nheads = 2
107 |         self.attentions = GAT(dim, dim, dropout, alpha=self.alpha, nheads=self.nheads).to(device)
108 | 
109 |     def pad(self, matrices, pad_value):
110 |         """Pad the list of matrices
111 |         with a pad_value (e.g., 0) for batch processing.
112 |         For example, given a list of matrices [A, B, C],
113 |         we obtain a new matrix [A00, 0B0, 00C],
114 |         where 0 is the zero (i.e., pad value) matrix.
115 |         """
116 |         shapes = [m.shape for m in matrices]
117 |         M, N = sum([s[0] for s in shapes]), sum([s[1] for s in shapes])
118 |         zeros = torch.FloatTensor(np.zeros((M, N))).to(device)
119 |         pad_matrices = pad_value + zeros
120 |         i, j = 0, 0
121 |         for k, matrix in enumerate(matrices):
122 |             m, n = shapes[k]
123 |             pad_matrices[i:i + m, j:j + n] = matrix
124 |             i += m
125 |             j += n
126 |         return pad_matrices
127 | 
128 |     def update(self, matrix, vectors, layer):
129 |         hidden_vectors = torch.relu(self.W_fingerprint[layer](vectors))
130 | 
131 |         return hidden_vectors + torch.matmul(matrix, hidden_vectors)
132 | 
133 |     def sum(self, vectors, axis):
134 |         sum_vectors = [torch.sum(v, 0) for v in torch.split(vectors, axis)]
135 |         return torch.stack(sum_vectors)
136 | 
137 |     def gnn(self, inputs):
138 |         """Cat or pad each input data for batch processing."""
139 |         Smiles, fingerprints, adjacencies, molecular_sizes = inputs
140 |         fingerprints = torch.cat(fingerprints)
141 |         adj = self.pad(adjacencies, 0)
142 |         """GNN layer (update the fingerprint vectors)."""
143 |         fingerprint_vectors = self.embed_fingerprint(fingerprints)
144 | 
145 |         for l in range(self.layer_hidden):
146 |             hs = self.update(adj, fingerprint_vectors, l)
147 |             fingerprint_vectors = F.normalize(hs, 2, 1)
148 |         """Attention layer"""
149 |         molecular_vectors = self.attentions(fingerprint_vectors, adj)
150 |         """Molecular vector by sum or mean of the fingerprint vectors."""
151 |         molecular_vectors = self.sum(molecular_vectors, molecular_sizes)
152 |         return Smiles, molecular_vectors
153 | 
154 |     def mlp(self, vectors):
155 |         """Regressor based on multilayer perceptron."""
156 |         for l in range(self.layer_output):
157 |         
158 |             vectors = torch.relu(self.W_output[l](vectors))
159 |         outputs = torch.sigmoid(self.W_property(vectors))
160 |         return outputs
161 | 
162 |     def forward_classifier(self, data_batch):
163 | 
164 |         inputs = data_batch[:-1]
165 |         correct_labels = torch.cat(data_batch[-1])
166 | 
167 | 
168 |         with torch.no_grad():
169 |             Smiles, molecular_vectors = self.gnn(inputs)
170 |             predicted_scores = self.mlp(molecular_vectors)
171 |           
172 |         predicted_scores = predicted_scores.to('cpu').data.numpy()
173 |         predicted_scores = [s[1] for s in predicted_scores]
174 |         correct_labels = correct_labels.to('cpu').data.numpy()
175 | 
176 |         return Smiles,predicted_scores, correct_labels
177 | 
178 | 
179 | class Tester(object):
180 |     def __init__(self, model,batch_test):
181 |         self.model = model
182 |         self.batch_test=batch_test
183 |     def test_classifier(self, dataset):
184 |         N = len(dataset)
185 |         SMILES, P, C = '', [], []
186 |         for i in range(0, N, self.batch_test):
187 |             data_batch = list(zip(*dataset[i:i + self.batch_test]))
188 |             Smiles, predicted_scores, correct_labels = self.model.forward_classifier( data_batch)
189 |             SMILES += ' '.join(Smiles) + ' '
190 |            
191 |             P.append(predicted_scores)
192 |             C.append(correct_labels)
193 |         SMILES = SMILES.strip().split()
194 |         tru = np.concatenate(C)
195 |         pre = np.concatenate(P)
196 |         pred = [1 if i >0.15 else 0 for i in pre]
197 |         #AUC = roc_auc_score(tru, pre)
198 |         cnf_matrix=confusion_matrix(tru,pred)
199 |         tn = cnf_matrix[0, 0]
200 |         tp = cnf_matrix[1, 1]
201 |         fn = cnf_matrix[1, 0]
202 |         fp = cnf_matrix[0, 1]
203 |         acc = (tp + tn) / (tp + fp + fn + tn)
204 |         #  Tru=map(str,np.concatenate(C))
205 |         #  Pre=map(str,np.concatenate(P))
206 |         #  predictions = '\n'.join(['\t'.join(x) for x in zip(SMILES, Tru, Pre)])
207 |         predictions = np.stack((tru, pred, pre))
208 |         return acc,  predictions
209 | 
210 |     def save_result(self, result, filename):
211 |         with open(filename, 'a') as f:
212 |             f.write(result + '\n')
213 | 
214 |     def save_predictions(self, predictions, filename):
215 |         with open(filename, 'w') as f:
216 |             f.write('Smiles\tCorrect\tPredict\n')
217 |             f.write(predictions + '\n')
218 | 
219 | def dump_dictionary(dictionary, filename):
220 |     with open('../DGCAN/model'+filename, 'wb') as f:
221 |         pickle.dump(dict(dictionary), f)
222 | def metrics(cnd_matrix):
223 |     '''Evaluation Metrics'''
224 |     
225 |     tn = cnd_matrix[0, 0]
226 |     tp = cnd_matrix[1, 1]
227 |     fn = cnd_matrix[1, 0]
228 |     fp = cnd_matrix[0, 1]
229 | 
230 |     bacc = ((tp / (tp + fn)) + (tn / (tn + fp))) / 2  # balance accurance
231 |     pre = tp / (tp + fp)  # precision/q+
232 |     rec = tp / (tp + fn)  # recall/se
233 |     sp = tn / (tn + fp)
234 |     q_ = tn / (tn + fn)
235 |     f1 = 2 * pre * rec / (pre + rec)  # f1score
236 |     mcc = ((tp * tn) - (fp * fn)) / math.sqrt(
237 |         (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))  # Matthews correlation coefficient
238 |     acc = (tp + tn) / (tp + fp + fn + tn)  # accurancy
239 |     
240 |     print('bacc:', bacc)
241 |     print('pre:', pre)
242 |     print('rec:', rec)
243 |     print('f1:', f1)
244 |     print('mcc:', mcc)
245 |     print('sp:', sp)
246 |     print('q_:', q_)
247 |     print('acc:', acc)
248 |     
249 |     
250 | def predict (test_name, property, radius, dim, layer_hidden, layer_output, dropout, batch_train,
251 |     batch_test, lr, lr_decay, decay_interval, iteration, N):
252 |     '''
253 |     
254 |     Parameters
255 |     ----------
256 |     data_test = '../dataset/data_test.txt', #test set   
257 |     radius = 1,        #hops of radius subgraph: 1, 2 
258 |     dim = 52,          #dimension of graph convolution layers
259 |     layer_hidden = 4,  #Number of graph convolution layers
260 |     layer_output = 10, #Number of dense layers
261 |     dropout = 0.45,    #drop out rate :0-1
262 |     batch_train = 8,   # batch of training set
263 |     batch_test = 8,    #batch of test set
264 |     lr =3e-4,          #learning rate: 1e-5,1e-4,3e-4, 5e-4, 1e-3, 3e-3,5e-3
265 |     lr_decay = 0.85,   #Learning rate decay:0.5, 0.75, 0.85, 0.9
266 |     decay_interval = 25,#Number of iterations for learning rate decay:10,25,30,50
267 |     iteration = 140,    #Number of iterations 
268 |     N = 5000,           #length of embedding: 2000,3000,5000,7000 
269 |     dataset_train = ('../dataset/data_train.txt') #training set
270 | 
271 |     Returns
272 |     -------
273 |     res_dev 
274 |     Predicting results
275 | 
276 |     '''
277 |     (radius, dim, layer_hidden, layer_output,
278 |      batch_train, batch_test, decay_interval,
279 |      iteration, dropout) = map(int, [radius, dim, layer_hidden, layer_output,
280 |                                      batch_train, batch_test,
281 |                                      decay_interval, iteration, dropout])
282 |                                      
283 |     lr, lr_decay = map(float, [lr, lr_decay])
284 |     if torch.cuda.is_available():
285 |         device = torch.device('cuda')
286 |         print('The code uses a GPU!')
287 |     else:
288 |         device = torch.device('cpu')
289 |         print('The code uses a CPU...')
290 | 
291 |     lr, lr_decay = map(float, [lr, lr_decay])
292 |     path = ''
293 |     dataname = ''
294 |     torch.manual_seed(0)
295 |     model = MolecularGraphNeuralNetwork(
296 |         N, dim, layer_hidden, layer_output, dropout).to(device)
297 |     model.load_state_dict(torch.load(r'model/model.pth'))
298 |     model.eval()
299 |     tester = Tester(model,batch_test)
300 |     dataset_dev=pp.create_testdataset(test_name, path, dataname,property)
301 |     np.random.seed(0)
302 |     #np.random.shuffle(dataset_dev)  
303 |     prediction_dev,  dev_res =  tester.test_classifier(dataset_dev)
304 |     if property == True:    
305 |         res_dev  = dev_res.T
306 |         cnd_matrix=confusion_matrix(res_dev[:,0], res_dev[:,1])
307 |         cnd_matrix 
308 |         metrics(cnd_matrix)
309 |     elif property == False:
310 |         res_dev =  dev_res.T[:,1]
311 | 
312 |     return res_dev
313 | 


--------------------------------------------------------------------------------
/dataset/bRo5.txt:
--------------------------------------------------------------------------------
  1 | CC1=NC=C(N=C1)C(=O)NCCC1=CC=C(C=C1)S(=O)(=O)NC(=O)NC1CCCCC1	1
  2 | CN1CCN(CC1)C(=O)O[C@@H]1N(C(=O)C2=NC=CN=C12)C1=NC=C(Cl)C=C1	1
  3 | [H][C@@]12CCO[C@]1([H])OC[C@@H]2OC(=O)N[C@@H](CC1=CC=CC=C1)[C@H](O)CN(CC(C)C)S(=O)(=O)C1=CC=C(N)C=C1	1
  4 | COC1=C(OC)C=C2[C@@H](CN(C)CCCN3CCC4=CC(OC)=C(OC)C=C4CC3=O)CC2=C1	1
  5 | [H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)[C@H](C(O)=O)C1=CSC=C1)C(O)=O	1
  6 | CC[N+](C)(CC)CCOC(=O)C1C2=CC=CC=C2OC2=CC=CC=C12	1
  7 | CC(C)O	1
  8 | COC1=CC2=C(NC(=N2)[S@@](=O)CC2=NC=C(C)C(OC)=C2C)C=C1	1
  9 | COC1=C(OC)C=C2C(N)=NC(=NC2=C1)N1CCN(CC1)C(=O)C1CCCO1	1
 10 | CCCCC(=O)N(CC1=CC=C(C=C1)C1=CC=CC=C1C1=NNN=N1)[C@@H](C(C)C)C(O)=O	1
 11 | CN(CCCCCCCCCCN(C)C(=O)OC1=CC=CC(=C1)[N+](C)(C)C)C(=O)OC1=CC=CC(=C1)[N+](C)(C)C	1
 12 | NC[C@H](O)C1=CC(O)=C(O)C=C1	1
 13 | C[N+]1(C)CCC(C1)OC(=O)C(O)(C1CCCC1)C1=CC=CC=C1	1
 14 | CCC1(CCC(=O)NC1=O)C1=CC=CC=C1	1
 15 | OCC(O)COC1=CC=C(Cl)C=C1	1
 16 | O=[N+]([O-])c1cc([N+](=O)[O-])c(O)c([N+](=O)[O-])c1	1
 17 | NC1=NC(=O)C2=C(N1)N(CCC(CO)CO)C=N2	1
 18 | CC1=CC(CN2CCC(CC2)=C2C3=CC=C(Cl)C=C3CCC3=C2N=CC=C3)=CN=C1	1
 19 | CSCCNC1=C2N=CN([C@@H]3O[C@H](COP(O)(=O)OP(O)(=O)C(Cl)(Cl)P(O)(O)=O)[C@@H](O)[C@H]3O)C2=NC(SCCC(F)(F)F)=N1	1
 20 | CCC(C1=CC=CC=C1)C1=C(O)C2=C(OC1=O)C=CC=C2	1
 21 | CS(=O)(=O)OCCCCOS(C)(=O)=O	1
 22 | CN1[C@@H](CNC2=CC=C(C=C2)C(=O)N[C@@H](CCC(O)=O)C(O)=O)CNC2=C1C(=O)N=C(N)N2	1
 23 | CC(C)(C)C(=O)OCOP(=O)(COCCN1C=NC2=C(N)N=CN=C12)OCOC(=O)C(C)(C)C	1
 24 | CNC[C@H](O)C1=CC(O)=C(O)C=C1	1
 25 | CC(C)(C)NCC(O)C1=CC(Cl)=C(N)C(Cl)=C1	1
 26 | CCCCOC1=CC=C(OCCCN2CCOCC2)C=C1	1
 27 | CC1(C)C[C@@H]1C(=O)N\C(=C/CCCCSC[C@H](N)C(O)=O)C(O)=O	1
 28 | CC(C)[C@@H](C)\C=C\[C@@H](C)[C@@]1([H])CC[C@@]2([H])\C(CCC[C@]12C)=C\C=C1\C[C@@H](O)CCC1=C	1
 29 | CN1C[C@@H](C2=CC(=CC=C2)S(=O)(=O)NCCOCCOCCNC(=O)NCCCCNC(=O)NCCOCCOCCNS(=O)(=O)C2=CC(=CC=C2)[C@@H]2CN(C)CC3=C2C=C(Cl)C=C3Cl)C2=C(C1)C(Cl)=CC(Cl)=C2	1
 30 | CN1CCOC(C2=CC=CC=C2)C2=CC=CC=C2C1	1
 31 | CC(C(O)=O)C1=CC(OC2=CC=CC=C2)=CC=C1	1
 32 | [H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)[C@H](NC(=O)N1CCN(CC)C(=O)C1=O)C1=CC=CC=C1)C(O)=O	1
 33 | CC(C)NCC(O)C1=CC(O)=C(O)C=C1	1
 34 | [H][C@@]12CCCN1C(=O)[C@H](CC(C)C)N1C(=O)[C@](NC(=O)[C@H]3CN(C)[C@]4([H])CC5=CNC6=CC=CC(=C56)[C@@]4([H])C3)(O[C@@]21O)C(C)C	1
 35 | [H][C@]12CC[C@]([H])(C[C@@H](C1)OC(=O)C(O)C1=CC=CC=C1)N2C	1
 36 | NC(N)=N	1
 37 | OC1=C(CC2=C(O)C(Cl)=CC(Cl)=C2Cl)C(Cl)=C(Cl)C=C1Cl	1
 38 | C\C(N(CC1=CN=C(C)NC1=N)C=O)=C(\CCO)SSCC1CCCO1	1
 39 | ONC(=O)\C=C\C1=CC=CC(=C1)S(=O)(=O)NC1=CC=CC=C1	1
 40 | CN1C(C(=O)NC2=CC=CC=N2)=C(O)C2=C(C=C(Cl)S2)S1(=O)=O	1
 41 | CCOC1=NC2=C(N1CC1=CC=C(C=C1)C1=CC=CC=C1C1=NOC(=O)N1)C(=CC=C2)C(=O)OCC1=C(C)OC(=O)O1	1
 42 | C[N+]1=C(\C=N\O)C=CC=C1	1
 43 | NC(=N)NC(N)=N	1
 44 | CC(C)=C[C@@H]1[C@@H](C(=O)OC2CC(=O)C(CC=C)=C2C)C1(C)C	1
 45 | CCCCN1CCCCC1C(=O)NC1=C(C)C=CC=C1C	1
 46 | FC(F)(F)CNC(=O)C1(CCCCN2CCC(CC2)NC(=O)C2=C(C=CC=C2)C2=CC=C(C=C2)C(F)(F)F)C2=CC=CC=C2C2=CC=CC=C12	1
 47 | NC1=CC=C(C=C1)C(O)=O	1
 48 | [H][C@@]12C[C@@H](C)[C@](OC(=O)CC)(C(=O)SCF)[C@@]1(C)C[C@H](O)[C@@]1(F)[C@@]2([H])C[C@H](F)C2=CC(=O)C=C[C@]12C	1
 49 | CCCCCOC1=CC=C(C=C1)C1=CC(=NO1)C1=CC=C(C=C1)C(=O)N[C@H]1C[C@@H](O)[C@@H](O)NC(=O)[C@@H]2[C@@H](O)[C@@H](C)CN2C(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](NC1=O)[C@@H](C)O)[C@H](O)[C@@H](O)C1=CC(OS(O)(=O)=O)=C(O)C=C1)[C@H](O)CC(N)=O	1
 50 | C=CC[C@@H](CCC)C(=O)O	1
 51 | [H][C@@]12CC[C@](OC(=O)CCC)(C(=O)CO)[C@@]1(C)C[C@H](O)[C@@]1([H])[C@@]2([H])CCC2=CC(=O)CC[C@]12C	1
 52 | OC(C(O)=O)C1=CC=CC=C1	1
 53 | [H][C@@]12OC3=C(O)C=CC4=C3[C@@]11CCN(C)[C@]([H])(C4)[C@]1([H])C=C[C@@H]2O	1
 54 | COC1=CC=CC(=C1)N1CCN(CC1)C1=NC2=C(C=CC=C2F)[C@H](CC(O)=O)N1C1=CC(=CC=C1OC)C(F)(F)F	1
 55 | [H][C@]1(O)CO[C@]2([H])[C@]([H])(O)CO[C@]12[H]	1
 56 | COc1cc2c(cc1OC)[C@@H]1C[C@H](O)[C@@H](CC(C)C)CN1CC2	1
 57 | CC[C@@H](c1ccccc1)[C@H](c1ccc(OCCNC)cc1)c1ccc(O[C@@H]2O[C@H](C(=O)O)[C@@H](O)[C@@H](O)[C@@H]2O)cc1	1
 58 | C[C@@H]1CNc2c(cccc2S(=O)(=O)N[C@@H](CCCNC(=N)N)C(=O)N2CC[C@@H](C)C[C@@H]2C(=O)O)C1	1
 59 | O=C[C@H](O)[C@@H](O)[C@H](O)CO	1
 60 | O=C1c2cccc(O)c2C(=O)c2c(O)cccc21	1
 61 | C[C@]12CC[C@@H]3c4ccc(O)c(O)c4CC[C@H]3[C@@H]1CC[C@@H]2O	1
 62 | CCNCC#CCOC(=O)[C@](O)(c1ccccc1)C1CCCCC1	1
 63 | CCN(CC)CC(=O)Nc1c(C)ccc(O)c1C	1
 64 | NC(=O)c1c[nH]c(=O)cn1	1
 65 | C[C@H]1O[C@@H](n2cc(F)c(=O)[nH]c2=O)[C@H](O)[C@@H]1O	1
 66 | C=C1CC[C@@]2(O)[C@H]3Cc4ccc(O)c5c4[C@@]2(CCN3CC2CC2)[C@H]1O5	1
 67 | CCN(CC)C(=O)N1CC[N+](C)([O-])CC1	1
 68 | COc1ccccc1O	1
 69 | CCCCCCCCCCCC[N+](C)(C)CCOc1ccccc1	1
 70 | C[N+]1([O-])CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1	1
 71 | CCCCCc1cc(O)c2c(c1)OC(C)(C)[C@H]1CC[C@]3(C)O[C@@H]3[C@@H]21	1
 72 | NC(=O)N1c2ccccc2[C@H](O)[C@@H](O)c2ccccc21	1
 73 | CC(C)(O)CNc1nc(Nc2ccnc(C(F)(F)F)c2)nc(-c2cccc(C(F)(F)F)n2)n1	1
 74 | CCCCCCCCCCC(=O)O[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(C)[C@H]3CC[C@]12C	1
 75 | O[Si](O)(O)O	1
 76 | CN1C(=O)NC(=O)[C@@](C)(C2=C[C@@H](O)CCC2)C1=O	1
 77 | C[C@@H]([C@H](O)c1ccccc1)N(C)C	1
 78 | O=C1N[C@H](O)[C@@H](c2ccccc2)CO1	1
 79 | Nc1nc2c(c(=O)[nH]1)N[C@@H](CNc1ccc(C(=O)N[C@H](CCC(=O)O)C(=O)O)cc1)CN2	1
 80 | O=[N+]([O-])OCC(CO[N+](=O)[O-])(CO[N+](=O)[O-])CO[N+](=O)[O-]	1
 81 | C[C@H](NC(=O)[C@@H](Cc1ccc2ccccc2c1)NC(=O)[C@@H](C)N)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N[C@H](Cc1ccccc1)C(=O)N[C@@H](CCCCN)C(N)=O	1
 82 | CC1=C(/C=C/C(C)=C/C=C/C(C)=C/C(=O)O)C(C)(C)CCC1=O	1
 83 | OCCN1CCOCC1	1
 84 | COC(=O)c1ccc(C)cc1O	1
 85 | COc1cc(C[C@@](C)(O)C(=O)O)ccc1O	1
 86 | CN1CCCN=C1/C=C/c1cccs1	1
 87 | CCc1nc(C2CC2)c(C(N)=O)n1Cc1ccc2oc(-c3ccccc3NS(=O)(=O)C(F)(F)F)c(Br)c2c1	1
 88 | NC[C@H](O)COc1ccccc1C(=O)CCc1ccccc1	1
 89 | O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(OC(F)(F)[C@@H](F)C(F)(F)F)cc1Cl	1
 90 | CC(C)C(=O)O[C@H](C)OC(=O)NCC1(CC(=O)O)CCCCC1	1
 91 | CN(C(=O)CO)c1c(I)c(C(=O)NC[C@H](O)CO)c(I)c(C(=O)NC[C@H](O)CO)c1I	1
 92 | C[C@]12C=CC(=O)C=C1CC[C@@H]1[C@@H]2CC[C@]2(C)C(=O)CC[C@@H]12	1
 93 | CC(C)(C)[C@@H]1NC(=O)O[C@@H]2CCC[C@H]2OC/C=C/C(F)(F)c2nc3ccccc3nc2O[C@@H]2C[C@@H](C(=O)N[C@]3(C(=O)NS(=O)(=O)C4(C)CC4)C[C@H]3C(F)F)N(C2)C1=O	1
 94 | CCN(CC)CCc1nc(-c2ccccc2)no1	1
 95 | CN1C(=O)CC[C@H]1c1ccc[n+]([O-])c1	1
 96 | CNCc1ccc(-c2[nH]c3cc(F)cc4c3c2CCNC4=O)cc1	1
 97 | O=C(O[C@@H]1CN2CCC1CC2)c1ccccc1	1
 98 | CC(C)[C@@H](C(=O)O)N(Cc1ccc(-c2ccccc2-c2nnn[nH]2)cc1)C(=O)CC[C@@H](C)O	1
 99 | O=C1CN2Cc3c(ccc(Cl)c3Cl)NC2=N1	1
100 | CC#CC[C@@H](C)[C@H](O)/C=C/[C@@H]1[C@@H](O)C[C@@H]2C/C(=C/CCCC(=O)OCC(=O)c3ccccc3)C[C@H]21	1
101 | C[C@@H](c1cc2ccccc2s1)N(O[C@@H]1O[C@H](C(=O)O)[C@@H](O)[C@H](O)[C@H]1O)C(N)=O	1
102 | CCN(CC)CCOCCOC(=O)C(CC)(CC)c1ccccc1	1
103 | O=C(CSc1ccncc1)N[C@H]1C(=O)N2C(C(=O)O)=C(CO)CS[C@H]12	1
104 | COc1cc2c(cc1OC)[C@@H]1C[C@@H](O)[C@@H](CC(C)C)CN1CC2	1
105 | COc1ccc2[nH]c(C)c(CC(=O)O)c2c1	1
106 | CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](C(=O)Oc3ccc4c(c3)CCC4)c3ccccc3)C(=O)N2[C@H]1C(=O)O	1
107 | [O-][n+]1ccccc1SSc1cccc[n+]1[O-]	1
108 | Cc1cc(OS(=O)(=O)O)c2ccccc2c1OS(=O)(=O)O	1
109 | CCCCCCCCCCCC(=O)OCN1C(=O)CCc2ccc(OCCCCN3CCN(c4cccc(Cl)c4Cl)CC3)cc21	1
110 | CCCCCCc1ccc(O)cc1O	1
111 | C[C@H](I)[C@H]1OC[C@@H](CO)O1	1
112 | CN1CCC[C@@H]1c1ccc[n+](C)c1	1
113 | C[C@H](CCC(=O)O)[C@H]1CC[C@H]2[C@@H]3C(=O)C[C@@H]4CC(=O)CC[C@]4(C)[C@H]3CC(=O)[C@@]21C	1
114 | CCC[C@@H](Cc1ccccc1)N1CCCC1	1
115 | CCCNC(=O)NS(=O)(=O)c1ccc(Cl)c(O)c1	1
116 | O=C(O)COc1ccc(C(=O)c2cccs2)c(Cl)c1Cl	1
117 | CNCCN1C(=O)[C@H](OC(C)=O)[C@@H](c2ccc(OC)cc2)Sc2ccccc21	1
118 | CCCCCCCCCC[N+](C)(C)CCCCCCCCCC	1
119 | O=C(O)CCCCCCNC1c2ccccc2CCc2ccccc21	1
120 | NC(=O)N1c2ccccc2C=Cc2cc(O)ccc21	1
121 | C/C(=C(/CCO)SSC[C@H]1CCCO1)N(C=O)Cc1cnc(C)nc1N	1
122 | O=C1NC(=O)[C@@H](N2C(=O)c3ccccc3C2=O)C[C@@H]1O	1
123 | CC(C)(C)NC[C@H](O)COc1cccc2c1SCCC2	1
124 | CN1C(=O)NC(=O)[C@](C)([C@]23CCCC[C@@H]2O3)C1=O	1
125 | N[C@H](CO)C(=O)NNCc1ccc(O)c(O)c1O	1
126 | OC[C@H]1O[C@](O)(CO[C@]2(CO)O[C@H](CO)[C@@H](O)[C@@H]2O)[C@@H](O)[C@@H]1O	1
127 | Nc1nccs1	1
128 | CCn1cc(C(=O)O)c(=O)c2cnc(N3CCCC3)nc21	1
129 | CN(Cc1nc2c(N)[nH]c(=N)[nH]c-2nc1=O)c1ccc(C(=O)N[C@@H](CCC(=O)O)C(=O)O)cc1	1
130 | C[C@]12C[C@H]3C[C@@](N)(C1)C[C@@](CO)(C3)C2	1
131 | CNCCC[C@](C#N)(c1ccc(OC)c(OC)c1)C(C)C	1
132 | CN1C(=O)[C@H](O[C@@H]2O[C@H](C(=O)O)[C@@H](O)[C@H](O)[C@H]2O)C[C@H]1c1cccnc1	1
133 | CCOc1c(N)c2c(c(OCC)c1OCC)[C@H]([C@@H]1c3c(cc4c(c3OC)OCO4)CCN1C)OC2=O	1
134 | O=C(c1ccc(O)cc1)c1ccc2n1CC[C@@H]2C(=O)O	1
135 | CN(c1nccc(=O)[nH]1)C1CCN(c2nc3ccccc3n2Cc2ccc(F)cc2)CC1	1
136 | CCCc1nnc(NC(=O)[C@@]2(C)Sc3ccccc3N(C)C2=O)s1	0
137 | COc1ccccc1OC(=O)Cn1nc(C)c([N+](=O)[O-])c1C	0
138 | Cc1ccc2c(c1)C(=O)NC[C@]1(CC[C@H](N(C)C(=O)c3ccno3)CC1)O2	0
139 | CC(=O)N[C@@H](C)C(=O)Nc1ccccc1N1CCCCC1	0
140 | CCn1/c(=N/C(=O)c2ccc([N+](=O)[O-])s2)sc2cc([N+](=O)[O-])ccc21	0
141 | O=C(Nc1cccc(N2CCCC2)c1)c1ccccc1	0
142 | O=C([C@H]1CCCO1)N1CCN(Cc2cn3cc(Cl)ccc3n2)CC1	0
143 | CN(C(=O)C[N@H+]1CC[C@H](C(=O)NC2CCCCC2)CC1)c1ccccc1	0
144 | O=C(CN1CCN(Cc2ccc(Cl)s2)CC1)Nc1cccc(S(=O)(=O)N2CCCCC2)c1	0
145 | COc1ccc2sc(N(Cc3ccccc3)C(=O)C34C[C@H]5C[C@@H](C3)C[C@@H](C4)C5)nc2c1	0
146 | COc1ccc(OC)c2sc(N(CCCN(C)C)C(=O)[C@H]3CC(=O)N(c4ccc(F)cc4)C3)nc12	0
147 | Cc1nc(-c2ccc(F)cc2)n(CC(=O)N(C)C2CCCCC2)c(=O)c1CCO	0
148 | Cn1c(C[N@@H+]2CC[C@H](CNc3ncccc3C#N)C2)nc2ccccc21	0
149 | CCCNC(=O)[C@H]1CCCN1C(=O)/C=C/c1ccc(SC(F)(F)F)cc1	0
150 | COc1cc(C(=O)OCC(=O)NC2CC2)ccc1OCCC(C)C	0
151 | NC(=O)[C@H]1CC(C(=O)Nc2ncn[n-]2)=NN1c1ccc(F)cc1	0
152 | Cn1c(=O)c2ccccc2n(CC(=O)NCc2ccccc2Cl)c1=O	0
153 | C[C@](O)(CNC(=O)c1cc2ccccc2o1)C1CC1	0
154 | COc1ccccc1C(=O)OCC(=O)N[C@@H](C)c1ccccc1	0
155 | COc1cccc(-c2nn(-c3ccc(C)cc3)cc2C(=O)N2CCC(N(C)C)CC2)c1	0
156 | CC(=O)Nc1ccc(/C(C)=N/O[C@@H](C)C(=O)Nc2ccc(F)c(F)c2)cc1	0
157 | COC(=O)C[C@@H]1CN(Cc2c(Cl)nc3sccn23)CCO1	0
158 | Cc1ccc(N2C[C@H](C(=O)N3CCC[C@H]([C@@H](C)O)C3)CCC2=O)cc1C	0
159 | CC[C@H](C)c1ccc(NC(=O)CSc2nnc(-c3cccnc3)n2N)cc1	0
160 | CC(=O)Nc1ccc(-c2nnc3n(Cc4ccc(C)cc4)c(=O)c4ccccc4n23)cc1	0
161 | CC(C)CNC(=O)C[C@H]1CSc2nc3c(cnn3-c3ccc(Cl)cc3)c(=O)n21	0
162 | O[C@]1(c2ccccc2)CC[N@H+](CCOc2cccc(Cl)c2)CC1	0
163 | CCc1nc2n(n1)CCC[C@@H]2NC(=O)N[C@@H]1CCC(C)(C)c2ccccc21	0
164 | C/C(=N\NC(=O)COc1ccccc1[N+](=O)[O-])c1ccc(-c2ccccc2)cc1	0
165 | O=C(CSc1ccc(F)cc1)N1CCN(c2nc3ccc(Br)cc3s2)CC1	0
166 | COCc1ccc(C[N@H+]2CCCC[C@@H]2CNC(=O)c2ccccc2)cc1	0
167 | Cc1cccc(Cl)c1OCc1cc(=O)n(C)c(=O)n1C	0
168 | CCOC(=O)CSc1nnc(-c2ccccc2[N+](=O)[O-])o1	0
169 | O=C(CC1=CCCCC1)NC[C@@H]1Cc2cc(F)cc(-c3cncnc3)c2O1	0
170 | CN(C)CCN(C(=O)c1ccc([N+](=O)[O-])cc1)c1nc2ccc(OC(F)(F)F)cc2s1	0
171 | CC(C)N(C(=O)/C=C/c1cncc(F)c1)C1CCC1	0
172 | Cc1cc(C)n2nc(C(=O)N/N=C/c3ccc(Cl)c(Cl)c3)nc2n1	0
173 | O=C(Nc1ccccc1F)c1nc(SCc2ccc(Cl)cc2)ncc1Br	0
174 | CCOC(=O)C1=C(c2ccccc2)N=c2s/c(=C\c3cc4c(cc3C)OCO4)c(=O)n2[C@H]1c1ccc(Cl)cc1	0
175 | COc1ccc(Br)c(C(=O)NCCCN2CCc3ccccc3C2)c1	0
176 | COc1ccc(S[C@@H](C)C(=O)Nc2ccc(CN3CCCC3)cc2)cc1	0
177 | CC(C)C[C@@H](N[C@@H](C)C(=O)Nc1ccc(S(N)(=O)=O)cc1)c1ccccc1	0
178 | C[C@H](NC(=O)NC[C@H](O)c1cccc(F)c1)c1ccc(F)cc1	0
179 | COc1ccc([C@@H](C)NCc2cccc(OC)c2OC)c(F)c1	0
180 | Cn1c(SCC(=O)N(c2ccccc2)[C@H]2CCS(=O)(=O)C2)nnc1C1CC1	0
181 | Cn1c(CSc2ncccn2)nnc1SCC(=O)O	0
182 | CC1=C(C#N)C(=O)N(C2CCCC2)C(=O)/C1=C/Nc1ccc([N+](=O)[O-])cc1C	0
183 | COc1cc([C@H]2Nc3ccc(OCCC(C)C)cc3[C@@H]3C=CC[C@@H]23)ccc1OC(C)=O	0
184 | CC(=O)NCC(=O)N1CCN(CC(=O)NC(C)C)CC1	0
185 | CCCc1ccc(OCC(=O)c2ccc3c(c2)N(CC(=O)NCc2cccnc2)C(=O)CO3)cc1	0
186 | COc1ccc(S(=O)(=O)N(C)c2ccc(C(=O)Nc3ccccc3SC)cc2)cc1OC	0
187 | Cc1cc([C@H]2CCC[N@@H+]2Cc2ccccc2Br)on1	0
188 | O=[N+]([O-])c1ccccc1-c1ccc(CN2CCC(n3cncn3)CC2)s1	0
189 | COc1cc([N+](=O)[O-])ccc1NC(=O)[C@@H]1CCCC[C@H]1C(=O)O	0
190 | Cc1ccc(OCc2cccc(C(=O)N(C3CCCC3)C3CC3)c2)cn1	0
191 | COc1ccc(CNC(=O)NCc2ccc(Cn3cnc4ccccc43)cc2)cc1	0
192 | CC1(C)CCc2cc(S(=O)(=O)NCc3csc(-c4cccs4)n3)ccc2O1	0
193 | CCn1c(SCC(=O)Nc2ccc(Br)cc2C(=O)O)nnc1C1CC1	0
194 | CC(=O)c1cccc(NC(=O)[C@H](N[C@H](C)c2ccc(F)cc2)c2ccccc2)c1	0
195 | C[C@@H](NC(=O)COc1ccc2c3c(c(=O)oc2c1)CCCC3)C(=O)NCC(=O)NCC(=O)O	0
196 | CC(C)(C)c1n[nH]c([C@H]2CN(C(=O)CCn3cncn3)CCO2)n1	0
197 | Cc1nc(C(C)(C)NC(=O)NCCOC[C@H]2CCCO2)sc1C	0
198 | O=C(O)c1ccc2c(c1)=N[C@@H](c1ccccc1Cl)N=2	0
199 | CN(CC(=O)N(C)c1cccc2ncccc12)C(=O)OC(C)(C)C	0
200 | C[C@@H]1COCC[N@H+]1Cc1ccc(-c2c(F)cccc2F)o1	0
201 | COc1ccc(NC(=O)c2nc(-c3ccco3)n(-c3cccc(C)c3)n2)cc1	0
202 | COc1ccc(N2CCN(Cc3cc(OC)c(O)c([N+](=O)[O-])c3)CC2)cc1	0
203 | COCCNC(=O)[C@@H]1CCCN([C@H]2CC[N@H+](Cc3cc(C)cc(C)c3)CC2)C1	0
204 | N#C[C@@H]1C(=O)Nc2nc(NCc3ccc4c(c3)OCCO4)[nH]c(=O)c2[C@@H]1/C=C/c1ccccc1	0
205 | Cc1ccc(C(C)C)cc1OCC(=O)Nc1ccc2c(c1)OCC(=O)N2	0
206 | CCCCOc1c(Cl)cc(C(=O)Nc2ccc(S(=O)(=O)N3C[C@H](C)O[C@@H](C)C3)cc2)cc1Cl	0
207 | Cc1nccc(CN2CCC[C@H](c3cc(=O)[nH]c(-c4ccncc4)n3)C2)n1	0
208 | COCc1ccc(CN[C@H]2CCCN(c3ccccc3F)C2=O)cc1	0
209 | Cc1ccc([C@H](NC(=O)CS(=O)(=O)Cc2cccc(Br)c2)C2CC2)cc1	0
210 | COc1cccc([C@@H]2[C@@H]3CCCC=C3[C@H](C#N)C(=N)C2(C#N)C#N)c1	0
211 | Cc1n[nH]c(C)c1[C@H]1CCCN1C(=O)COc1ccccc1	0
212 | O=C(NCc1cccs1)c1cccc(NC(=O)c2cc3sccn3c2)c1	0
213 | CC(C)[C@@H]1CCc2ccccc2N1C(=O)c1ccc2c(c1)C(=O)N(C)C2=O	0
214 | Cc1nc([N+](=O)[O-])cn1CC(=O)NCc1cccnc1	0
215 | CCN(CC(=O)Nc1nc(-c2ccc(OC)cc2)cn1-c1ccc(C(C)C)cc1)C(=O)c1ccc(C)cc1	0
216 | COc1ccc([C@@H]2c3c(n[nH]c3-c3ccc(Cl)cc3)C(=O)N2c2ccc(C)cc2)c(OC)c1	0
217 | O=C(Cn1nc2nc(Nc3ccccc3)ccn2c1=O)NCc1ccc(F)cc1F	0
218 | Cc1cccc(OCCCCNc2ccc(C)cc2C(=O)O)c1	0
219 | CC(C)n1cc(S(=O)(=O)N2CCC(n3cccc3)CC2)cn1	0
220 | O=C1CCc2ccc(C(=O)NCc3ccc(CC(=O)NC4CCCCC4)cc3)nc2N1	0
221 | CCOc1cccc(C(=O)NCC23CC4(CNC(=O)c5cccc(OCC)c5)C[C@H](C2)C[C@H](C3)C4)c1	0
222 | C[C@H](Oc1cccc(C=O)c1)C(=O)Nc1ccc(N2CCCCC2)cc1	0
223 | Cc1ccc(-c2[nH]ncc2C[N@H+](C)Cc2cc(C)on2)cc1	0
224 | Cc1cccc(Cl)c1NC(=O)[C@@H](C)c1ccc(Cl)s1	0
225 | Cc1nnc(NC(=O)CSc2nnc(C)c(O)n2)s1	0
226 | O=C(OCCN1CCCC1=O)[C@H]1C[C@H]1c1ccc(OC(F)(F)F)cc1	0
227 | Cc1ccccc1NC(=O)Cc1cc(-c2ccccc2)on1	0
228 | Cc1cccc(NCC(=O)N/N=C/c2ccc3c(c2)OCO3)c1	0
229 | Cc1ccc(C)n1N1C(=O)/C(=C/c2c(F)cccc2Cl)SC1=S	0
230 | COc1ccc(/N=C2\NC(=O)/C(=C\C=C\c3ccccc3)S2)cc1	0
231 | COc1cccc(NC2=NN=C(c3cc(C)n(-c4ncn[nH]4)c3C)CS2)c1	0
232 | Cc1nn(-c2ccccc2)c(C)c1CN(C)C(=O)Cc1ccc2c(c1)CCCC2	0
233 | Cc1cc(C)c([N+](=O)[O-])cc1NC(=O)N1CCC(c2ccn[nH]2)CC1	0
234 | COC(=O)c1cc(C2OCCO2)ccn1	0
235 | Cc1cccc(OCCCCn2c([C@@H](C)NC(=O)c3cccc(Br)c3)nc3ccccc32)c1	0
236 | CCc1cccc(CC)c1NC(=O)CN(c1ccc(F)cc1)S(C)(=O)=O	0
237 | O=C(NCc1ccc2c(c1)OCO2)c1ccc2c(=O)n(C[C@H]3CCCO3)c(=S)[nH]c2c1	0
238 | O=C(O)CNC(=O)CCCCCN1C(=O)/C(=C/C=C/c2ccccc2)SC1=S	0
239 | COc1cc(/C=C(\C#N)C(=O)Nc2cc(Cl)ccc2Cl)cc2c1OCCO2	0
240 | NC(=O)c1ccc(C[N@H+](CC(=O)N2CCC(C(=O)Nc3ccccc3)CC2)C2CC2)cc1	0
241 | COCCOc1ccc(C#N)cc1NC(=O)c1ccsc1	0
242 | Cn1c(=O)c2c(nc(CN3CCOCC3)n2Cc2ccccc2F)n(C)c1=O	0
243 | CC(=O)[C@H]1C(=O)C(=O)N(CCC(=O)O)[C@H]1c1ccc(C(C)C)cc1	0
244 | CCN1CCN(c2ccc(NC(=O)[C@@H](NC(=O)c3ccc(Cl)cc3)C(C)C)cc2C)CC1	0
245 | C#CCNC(=O)N1CCN(S(=O)(=O)c2ccc3c(c2)OCCO3)CC1	0
246 | O=C(c1ccc(Cl)cc1Cl)N1CCCN(c2ccc(C(F)(F)F)cn2)CC1	0
247 | CC(C)[C@H](CNC(=O)CCS(C)(=O)=O)N1CCc2ccccc2C1	0
248 | O=C(NCCNC(=O)c1cccnc1OCC(F)F)c1cccnc1	0
249 | CCc1cc(C(F)(F)F)n2nc([C@H]3CCCN3C(=O)[C@H]3CCS(=O)(=O)C3)cc2n1	0
250 | CCCN(CCC)CCCNC(=O)c1oc(=O)c2ccccc2c1-c1ccccc1	0
251 | CN(C(=O)c1cccc(-c2ccoc2)c1)c1ccc(F)c(F)c1	0
252 | Cc1cc(C)cc(NC(=S)N(CCCn2ccnc2)C[C@H]2CC=CCC2)c1	0
253 | CCc1ccc([N+](=O)[O-])cc1S(=O)(=O)NC(C)(C)CC	0
254 | Nc1cc(N2CCC[C@@H](c3[nH+]ccn3Cc3cscn3)C2)ncn1	0
255 | O=c1[nH]cc(-c2cc(Cl)cc(Cl)c2)c(=O)[nH]1	0
256 | O=C(O)c1nc(O)n(-c2cccc(Cl)c2)n1	0
257 | CC/N=C(/NCc1cc(C)on1)N(C)Cc1ccc(OC)cc1	0
258 | Fc1ccc(NCl)c(S)c1	0
259 | CN(CCCO)C(=O)OC(C)(C)C	0
260 | O=C1C[C@H](C(=O)Nc2ccccc2C(F)(F)F)c2c(nc(Nc3cc(Cl)cc(Cl)c3)[nH]c2=O)N1	0


--------------------------------------------------------------------------------
/Discussion/GNN.py:
--------------------------------------------------------------------------------
  1 | import timeit
  2 | 
  3 | import numpy as np
  4 | import math
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as F
  8 | import torch.optim as optim
  9 | import pickle
 10 | from sklearn.metrics import roc_auc_score,roc_curve
 11 | from sklearn.metrics import confusion_matrix
 12 | import preprocess as pp
 13 | import pandas as pd
 14 | import matplotlib.pyplot as plt
 15 |     
 16 |     
 17 | class MolecularGraphNeuralNetwork(nn.Module):
 18 |     def __init__(self, N_fingerprints, dim, layer_hidden, layer_output):
 19 |         super(MolecularGraphNeuralNetwork, self).__init__()
 20 |         self.embed_fingerprint = nn.Embedding(N_fingerprints, dim)
 21 |         self.W_fingerprint =   nn.ModuleList([nn.Linear(dim, dim)
 22 |                                             for _ in range(layer_hidden)])
 23 | 
 24 |         self.W_output = nn.ModuleList([nn.Linear(dim, dim)
 25 |                                        for _ in range(layer_output)])
 26 |         self.W_property = nn.Linear(dim, 2)
 27 | 
 28 | 
 29 |     def pad(self, matrices, pad_value):
 30 |         """Pad the list of matrices
 31 |         with a pad_value (e.g., 0) for batch processing.
 32 |         For example, given a list of matrices [A, B, C],
 33 |         we obtain a new matrix [A00, 0B0, 00C],
 34 |         where 0 is the zero (i.e., pad value) matrix.
 35 |         """
 36 |         shapes = [m.shape for m in matrices]
 37 |         M, N = sum([s[0] for s in shapes]), sum([s[1] for s in shapes])
 38 |         zeros = torch.FloatTensor(np.zeros((M, N))).to(device)
 39 |         pad_matrices = pad_value + zeros
 40 |         i, j = 0, 0
 41 |         for k, matrix in enumerate(matrices):
 42 |             m, n = shapes[k]
 43 |             pad_matrices[i:i+m, j:j+n] = matrix
 44 |             i += m
 45 |             j += n
 46 |         return pad_matrices
 47 | 
 48 |     def update(self, matrix, vectors, layer):
 49 |         hidden_vectors = torch.relu(self.W_fingerprint[layer](vectors))
 50 | 
 51 |         return hidden_vectors + torch.matmul(matrix, hidden_vectors)
 52 | 
 53 |     def sum(self, vectors, axis):
 54 |         sum_vectors = [torch.sum(v, 0) for v in torch.split(vectors, axis)]
 55 |         return torch.stack(sum_vectors)
 56 |     def gnn(self, inputs):
 57 | 
 58 |         """Cat or pad each input data for batch processing."""
 59 |         Smiles,fingerprints, adjacencies, molecular_sizes = inputs
 60 |         fingerprints = torch.cat(fingerprints)
 61 |         adjacencies = self.pad(adjacencies, 0)
 62 | 
 63 |         """GNN layer (update the fingerprint vectors)."""
 64 |         fingerprint_vectors = self.embed_fingerprint(fingerprints)
 65 |         for l in range(layer_hidden):
 66 |             hs = self.update(adjacencies, fingerprint_vectors, l)
 67 |             fingerprint_vectors = F.normalize(hs, 2, 1)  # normalize.
 68 | 
 69 |         """Molecular vector by sum or mean of the fingerprint vectors."""
 70 |         molecular_vectors = self.sum(fingerprint_vectors, molecular_sizes)
 71 |           
 72 |         return Smiles,molecular_vectors
 73 | 
 74 |     def mlp(self, vectors):
 75 |         """Classifier  based on multilayer perceptron给予多层感知器的分类器."""
 76 |         for l in range(layer_output):
 77 |             vectors = torch.relu(self.W_output[l](vectors))
 78 |         outputs = torch.sigmoid(self.W_property(vectors))
 79 |         return outputs
 80 | 
 81 | 
 82 |     def forward_classifier(self, data_batch, train):
 83 | 
 84 |         inputs = data_batch[:-1]
 85 |         correct_labels = torch.cat(data_batch[-1])
 86 | 
 87 |         if train:
 88 |             Smiles,molecular_vectors = self.gnn(inputs)
 89 |           
 90 |             predicted_scores = self.mlp(molecular_vectors)
 91 |             
 92 |             loss = F.cross_entropy(predicted_scores, correct_labels.long())
 93 |             predicted_scores = predicted_scores.to('cpu').data.numpy()
 94 |             predicted_scores = [s[1] for s in predicted_scores]
 95 |       
 96 |             
 97 |             correct_labels = correct_labels.to('cpu').data.numpy()
 98 |             return loss,predicted_scores, correct_labels
 99 |         else:
100 |             with torch.no_grad():
101 |                 Smiles,molecular_vectors = self.gnn(inputs)
102 |                 predicted_scores = self.mlp(molecular_vectors)
103 |                 loss = F.cross_entropy(predicted_scores, correct_labels.long())
104 |             predicted_scores = predicted_scores.to('cpu').data.numpy()
105 |             predicted_scores = [s[1] for s in predicted_scores]
106 |             correct_labels = correct_labels.to('cpu').data.numpy()
107 |          
108 |             return Smiles,loss,predicted_scores, correct_labels
109 | 
110 | class Trainer(object):
111 |     def __init__(self, model):
112 |         self.model = model
113 |         self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
114 | 
115 |     def train(self, dataset):
116 |         np.random.shuffle(dataset)
117 |         N = len(dataset)
118 |         loss_total = 0
119 |         P, C = [], []
120 |         for i in range(0, N, batch_train):
121 |             data_batch = list(zip(*dataset[i:i+batch_train]))
122 |             loss,predicted_scores, correct_labels= self.model.forward_classifier(data_batch, train=True)
123 |          
124 |             P.append(predicted_scores)
125 |             C.append(correct_labels)
126 |             self.optimizer.zero_grad()
127 |             loss.backward()
128 |             self.optimizer.step()
129 |             loss_total += loss.item()
130 |         tru=np.concatenate(C)
131 |         pre=np.concatenate(P)
132 |         AUC = roc_auc_score(tru, pre)
133 |         pred = [1 if i >0.4 else 0 for i in pre]
134 |         predictions =np.stack((tru,pred,pre))
135 |         return AUC, loss_total,predictions
136 | 
137 | 
138 | class Tester(object):
139 |     def __init__(self, model):
140 |         self.model = model
141 | 
142 |     def test_classifier(self, dataset):
143 |         N = len(dataset)
144 |         loss_total = 0
145 |         SMILES,P, C ='', [], []
146 |         for i in range(0, N, batch_test):
147 |             data_batch = list(zip(*dataset[i:i+batch_test]))
148 |             (Smiles,loss,predicted_scores,correct_labels) = self.model.forward_classifier(
149 |                                                data_batch, train=False)
150 |             
151 |             SMILES += ' '.join(Smiles) + ' '
152 |    
153 |             loss_total += loss.item()
154 |             P.append(predicted_scores)
155 |             C.append(correct_labels)
156 |         SMILES = SMILES.strip().split()
157 |         tru=np.concatenate(C)
158 |     
159 |         pre=np.concatenate(P)
160 |         AUC = roc_auc_score(tru, pre)
161 |         pred = [1 if i >0.4 else 0 for i in pre]
162 |       #  Tru=map(str,np.concatenate(C))
163 |       #  Pre=map(str,np.concatenate(P))
164 |       #  predictions = '\n'.join(['\t'.join(x) for x in zip(SMILES, Tru, Pre)])
165 |         predictions =np.stack((tru,pred,pre))
166 |         return AUC, loss_total,predictions
167 |     def save_result(self, result, filename):
168 |         with open(filename, 'a') as f:
169 |             f.write(result + '\n')
170 |     def save_predictions(self, predictions, filename):
171 |         with open(filename, 'w') as f:
172 |             f.write('Smiles\tCorrect\tPredict\n')
173 |             f.write(predictions + '\n')
174 |     def save_model(self, model, filename):
175 |         torch.save(model.state_dict(), filename)
176 | def split_dataset(dataset, ratio):
177 |     """Shuffle and split a dataset."""
178 |     np.random.seed(111)  # fix the seed for shuffle.
179 |     np.random.shuffle(dataset)
180 |     n = int(ratio * len(dataset))
181 |     return dataset[:n], dataset[n:]
182 | def edit_dataset(drug,non_drug,task):
183 |     np.random.seed(111)  # fix the seed for shuffle.
184 |     
185 |     if task =='balance':
186 |         #np.random.shuffle(non_drug)
187 |         non_drug=non_drug[0:len(drug)]
188 |        
189 |     else:
190 |         np.random.shuffle(non_drug)
191 |     np.random.shuffle(drug)
192 |     dataset_train_drug, dataset_test_drug = split_dataset(drug, 0.9)
193 |    # dataset_train_drug,dataset_dev_drug =  split_dataset(dataset_train_drug, 0.9)
194 |     dataset_train_no, dataset_test_no = split_dataset(non_drug, 0.9)
195 |    # dataset_train_no,dataset_dev_no =  split_dataset(dataset_train_no, 0.9)
196 |     dataset_train =  dataset_train_drug+dataset_train_no
197 |     dataset_test= dataset_test_drug+dataset_test_no
198 |   #  dataset_dev = dataset_dev_drug+dataset_dev_no
199 |     return dataset_train, dataset_test
200 | 
201 | def dump_dictionary(dictionary, filename):
202 |         with open(filename, 'wb') as f:
203 |             pickle.dump(dict(dictionary), f)
204 | if __name__ == "__main__":
205 |        
206 |     radius=1
207 |     dim=65
208 |     layer_hidden=0
209 |     layer_output=5
210 | 
211 |     batch_train=48
212 |     batch_test=48
213 |     lr=3e-4
214 |     lr_decay=0.85
215 |     decay_interval=10#下降间隔
216 |     iteration=140
217 |     N=5000
218 |     (radius, dim, layer_hidden, layer_output,
219 |      batch_train, batch_test, decay_interval,
220 |      iteration) = map(int, [radius, dim, layer_hidden, layer_output,
221 |                             batch_train, batch_test,
222 |                             decay_interval, iteration])
223 |     lr, lr_decay = map(float, [lr, lr_decay])
224 |     if torch.cuda.is_available():
225 |         device = torch.device('cuda')
226 |         print('The code uses a GPU!')
227 |     else:
228 |         device = torch.device('cpu')
229 |         print('The code uses a CPU...')
230 |     print('-'*100)
231 | 
232 | #    print('Preprocessing the', dataset, 'dataset.')
233 |     print('Just a moment......')
234 |     print('-'*100)
235 |     path='E:/code/drug/drugnn/'
236 |     dataname=''
237 |     
238 |     dataset_train = pp.create_dataset('data_train.txt',path,dataname)
239 |     dataset_test = pp.create_dataset('data_test.txt',path,dataname)
240 |     
241 |     #dataset_train, dataset_test = edit_dataset(dataset_drug, dataset_nondrug,'balance')   
242 |     #dataset_train, dataset_dev = split_dataset(dataset_train, 0.9)   
243 |     print('The preprocess has finished!')
244 |     print('# of training data samples:', len(dataset_train))
245 |     #print('# of development data samples:', len(dataset_dev))
246 |     print('# of test data samples:', len(dataset_test))
247 |     print('-'*100)
248 | 
249 |     print('Creating a model.')
250 |     torch.manual_seed(111)
251 |     model = MolecularGraphNeuralNetwork(
252 |             N, dim, layer_hidden, layer_output).to(device)
253 |     trainer = Trainer(model)
254 |     tester = Tester(model)
255 |     print('# of model parameters:',
256 |           sum([np.prod(p.size()) for p in model.parameters()]))
257 |     print('-'*100)
258 |     file_result = path+'AUC'+'.txt'
259 | #    file_result = '../output/result--' + setting + '.txt'
260 |     result = 'Epoch\tTime(sec)\tLoss_train\tLoss_test\tAUC_train\tAUC_test'
261 |     file_test_result  =  path+ 'test_prediction'+ '.txt'
262 |     file_predictions =  path+'train_prediction' +'.txt'
263 |     file_model =   path+'model'+'.h5'
264 |     with open(file_result, 'w') as f:
265 |         f.write(result + '\n')
266 | 
267 |     print('Start training.')
268 |     print('The result is saved in the output directory every epoch!')
269 | 
270 |     np.random.seed(111)
271 | 
272 |     start = timeit.default_timer()
273 | 
274 |     for epoch in range(iteration):
275 | 
276 |         epoch += 1
277 |         if epoch % decay_interval == 0:
278 |             trainer.optimizer.param_groups[0]['lr'] *= lr_decay
279 | #[‘amsgrad’, ‘params’, ‘lr’, ‘betas’, ‘weight_decay’, ‘eps’]
280 |         prediction_train,loss_train,train_res= trainer.train(dataset_train)
281 |        
282 | 
283 |         #prediction_dev,dev_res = tester.test_classifier(dataset_dev)
284 |         prediction_test,loss_test,test_res = tester.test_classifier(dataset_test)
285 | 
286 | 
287 |         time = timeit.default_timer() - start
288 | 
289 |         if epoch == 1:
290 |             minutes = time * iteration / 60
291 |             hours = int(minutes / 60)
292 |             minutes = int(minutes - 60 * hours)
293 |             print('The training will finish in about',
294 |                   hours, 'hours', minutes, 'minutes.')
295 |             print('-'*100)
296 |             print(result)
297 | 
298 |         result = '\t'.join(map(str, [epoch, time, loss_train, loss_test,prediction_train,prediction_test]))
299 |         tester.save_result(result, file_result)
300 | 
301 |         print(result)
302 |         
303 | 
304 |     
305 |     loss = pd.read_table(file_result)
306 |     plt.plot(loss['Loss_train'], color='r',label='Loss of train set')
307 |     plt.plot(loss['Loss_test'], color='y',label='Loss of train set')
308 |     plt.plot(loss['AUC_train'], color='y',label='AUC of train set')
309 |     plt.plot(loss['AUC_test'], color='b',label='AUC of test set')
310 |    # plt.plot(loss['AUC_test'], color='y',label='AUC of test set')
311 |     plt.ylabel('AUC')
312 |     plt.xlabel('Epoch')
313 |     plt.legend()
314 |     plt.savefig(path+'loss.tif',dpi=300)
315 |     plt.show()
316 |     colors = ['#00CED1','#DC143C' ]
317 | 
318 |     target_names=np.array(['druglike','not-drug'])
319 |     lw=2
320 |     res_test  = test_res.T
321 | 
322 |     for color,i,target_name in zip(colors,[1,0],target_names):
323 |       
324 |         plt.scatter((res_test[res_test[:,0]==i,0]),(res_test[res_test[:,0]==i,2]),color = color,alpha=.8,lw=lw,label=target_name)
325 |     plt.legend(loc='best',shadow=False,scatterpoints=1)
326 |     plt.title('the results of gnn classification')
327 |     res_train  = train_res.T
328 |     cn_matrix=confusion_matrix(res_train[:,0], res_train[:,1])
329 |     cn_matrix
330 |     
331 |     tn1 = cn_matrix[0,0]
332 |     tp1 = cn_matrix[1,1]
333 |     fn1 = cn_matrix[1,0]
334 |     fp1 = cn_matrix[0,1]
335 | 
336 |    
337 |     bacc_train = ((tp1/(tp1+fn1))+(tn1/(tn1+fp1)))/2#balance accurance
338 |     pre_train = tp1/(tp1+fp1)#precision/q+
339 |     rec_train = tp1/(tp1+fn1)#recall/se
340 |     sp_train=tn1/(tn1+fp1)
341 |     q__train=tn1/(tn1+fn1)
342 |     f1_train = 2*pre_train*rec_train/(pre_train+rec_train)#f1score
343 |     mcc_train = ((tp1*tn1) - (fp1*fn1))/math.sqrt((tp1+fp1)*(tp1+fn1)*(tn1+fp1)*(tn1+fn1))#Matthews correlation coefficient
344 |     acc_train=(tp1+tn1)/(tp1+fp1+fn1+tn1)#accurancy
345 |     fpr_train, tpr_train, thresholds_train =roc_curve(res_train[:,0],res_train[:,1])
346 |     print('bacc_train:',bacc_train)
347 |     print('pre_train:',pre_train)
348 |     print('rec_train:',rec_train)
349 |     print('f1_train:',f1_train)
350 |     print('mcc_train:',mcc_train)
351 |     print('sp_train:',sp_train)
352 |     print('q__train:',q__train)
353 |     print('acc_train:',acc_train)
354 |     
355 |  
356 |     '''    
357 |     res_dev  = dev_res.T
358 |     cn_matrix=confusion_matrix(res_dev[:,0], res_dev[:,1])
359 |     cn_matrix
360 |     
361 |     tn2 = cn_matrix[0,0]
362 |     tp2 = cn_matrix[1,1]
363 |     fn2 = cn_matrix[1,0]
364 |     fp2 = cn_matrix[0,1]
365 | 
366 |    
367 |     bacc_dev = ((tp2/(tp2+fn2))+(tn2/(tn2+fp2)))/2#balance accurance
368 |     pre_dev= tp2/(tp2+fp2)#precision/q+
369 |     rec_dev = tp2/(tp2+fn2)#recall/se
370 |     sp_dev=tn2/(tn2+fp2)
371 |     q__dev=tn2/(tn2+fn2)
372 |     f1_dev = 2*pre_dev*rec_dev/(pre_dev+rec_dev)#f1score
373 |     mcc_dev = ((tp2*tn2) - (fp2*fn2))/math.sqrt((tp2+fp2)*(tp2+fn2)*(tn2+fp2)*(tn2+fn2))#Matthews correlation coefficient
374 |     acc_dev=(tp2+tn2)/(tp2+fp2+fn2+tn2)#accurancy
375 |     fpr_dev, tpr_dev, thresholds_dev =roc_curve(res_dev[:,0],res_dev[:,1])
376 |     print('bacc_dev:',bacc_dev)
377 |     print('pre_dev:',pre_dev)
378 |     print('rec_dev:',rec_dev)
379 |     print('f1_dev:',f1_dev)
380 |     print('mcc_dev:',mcc_dev)
381 |     print('sp_dev:',sp_dev)
382 |     print('q__dev:',q__dev)
383 |     print('acc_dev:',acc_dev)
384 |   
385 |     '''  
386 |    
387 |     cnf_matrix=confusion_matrix(res_test[:,0], res_test[:,1])
388 |     cnf_matrix
389 |     
390 |     tn = cnf_matrix[0,0]
391 |     tp = cnf_matrix[1,1]
392 |     fn = cnf_matrix[1,0]
393 |     fp = cnf_matrix[0,1]
394 |     
395 |     bacc = ((tp/(tp+fn))+(tn/(tn+fp)))/2#balance accurance
396 |     pre = tp/(tp+fp)#precision/q+
397 |     rec = tp/(tp+fn)#recall/se
398 |     sp=tn/(tn+fp)
399 |     q_=tn/(tn+fn)
400 |     f1 = 2*pre*rec/(pre+rec)#f1score
401 |     mcc = ((tp*tn) - (fp*fn))/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))#Matthews correlation coefficient
402 |     acc=(tp+tn)/(tp+fp+fn+tn)#accurancy
403 |     fpr, tpr, thresholds =roc_curve(res_test[:,0], res_test[:,1])
404 |     print('bacc:',bacc)
405 |     print('pre:',pre)
406 |     print('rec:',rec)
407 |     print('f1:',f1)
408 |     print('mcc:',mcc)
409 |     print('sp:',sp)
410 |     print('q_:',q_)
411 |     print('acc:',acc)
412 |     print('auc:',prediction_test)
413 |   
414 | 


--------------------------------------------------------------------------------
/dataset/withdrawn.txt:
--------------------------------------------------------------------------------
  1 | O=C(Nc1ccc(Cl)c(Cl)c1)c1cc(Cl)cc(Cl)c1O	0
  2 | CC(=O)NC1=CC=CC=C1	0
  3 | CC(=O)NC1=C(C=CC(=C1)[As](=O)(O)O)O	0
  4 | CC(=O)NC1=NC=C(N=N1)C=CC2=CC=C(O2)[N+](=O)[O-]	0
  5 | CC(=O)OC1=CC=CC=C1C(=O)O	0
  6 | CC1=CC(=C(C(=C1C=CC(=CC=CC(=CC(=O)O)C)C)C)C)OC	0
  7 | C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O)O)O)N	0
  8 | CC(C(=O)NC(C)C(=O)NC1C2C1CN(C2)C3=C(C=C4C(=O)C(=CN(C4=N3)C5=C(C=C(C=C5)F)F)C(=O)O)F)N	0
  9 | C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl	0
 10 | CC(C)C(CC1=CC(=C(C=C1)OC)OCCCOC)CC(C(CC(C(C)C)C(=O)NCC(C)(C)C(=O)N)O)N	0
 11 | CC1=C(N=CN1)CN2CCC3=C(C2=O)C4=CC=CC=C4N3C	0
 12 | CCC(C(CC(C)N(C)C)(C1=CC=CC=C1)C2=CC=CC=C2)OC(=O)C	0
 13 | CCCN(CCC)C(=O)CC1=C(N=C2N1C=C(C=C2)Cl)C3=CC=C(C=C3)Cl	0
 14 | CCN(CC)C(C)C(=O)C1=CC=CC=C1	0
 15 | C1CC2=CC=CC=C2C(C3=CC=CC=C31)NCCCCCCC(=O)O	0
 16 | CCC1(CCC(=O)NC1=O)C2=CC=C(C=C2)N	0
 17 | Cc1c(N(C)C)c(=O)n(-c2ccccc2)n1C	0
 18 | CCC1(C(=O)NC(=O)NC1=O)CCC(C)C	0
 19 | CC(C)CCOCC(CN1CCOCC1)OC(=O)C2=CC(=C(C(=C2)OC)OC)OC	0
 20 | CC(CC1=CC=CC=C1)N	0
 21 | C1=CC=C(C=C1)C(C#N)OC2C(C(C(C(O2)COC3C(C(C(C(O3)CO)O)O)O)O)O)O	0
 22 | CC1CC2C(CCC3(C2CCC3(C(=O)C)OC(=O)C)C)C4(C1=CCCC4)C	0
 23 | C1CN(CCN1CCOC(=O)C2=CC=CC=C2NC3=C4C=CC(=CC4=NC=C3)C(F)(F)F)C5=CC=CC(=C5)C(F)(F)F	0
 24 | CC(C)C1(C(=O)NC(=O)NC1=O)CC=C	0
 25 | COC1=CC=CC2=C3C(=C(C=C21)[N+](=O)[O-])C(=CC4=C3OCO4)C(=O)O	0
 26 | COC1=CC=C(C=C1)CCN2CCC(CC2)NC3=NC4=CC=CC=C4N3CC5=CC=C(C=C5)F	0
 27 | CC(=O)OCC1C(C(C(O1)N2C(=O)NC(=O)C=N2)OC(=O)C)OC(=O)C	0
 28 | CCC1(C(=O)NC(=O)NC1=O)CC	0
 29 | CCC(C)(C(=O)OCC)OC1=CC=C(C=C1)CC2=CC=C(C=C2)Cl	0
 30 | C1=CC=C(C=C1)CN2C3=CC=CC=C3C(=N2)OCC(=O)O	0
 31 | CC(CC1=CC(=CC=C1)C(F)(F)F)NCCOC(=O)C2=CC=CC=C2	0
 32 | CC(C1=CC2=C(C=C1)OC(=N2)C3=CC=C(C=C3)Cl)C(=O)O	0
 33 | CCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC=C(C=C3)O	0
 34 | CCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC(=C(C(=C3)Br)O)Br	0
 35 | CCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC(=C(C(=C3)I)O)I	0
 36 | CN(C)CCCOC1=NN(C2=CC=CC=C21)CC3=CC=CC=C3	0
 37 | C1=CC=C(C=C1)CO	0
 38 | CC(C)COCC(CN(CC1=CC=CC=C1)C2=CC=CC=C2)N3CCCC3	0
 39 | CCC(=O)N1C2=CC=CC=C2N(C1=O)C3CCN(CC3)CCC(C#N)(C4=CC=CC=C4)C5=CC=CC=C5	0
 40 | CC(CS(=O)(=O)C1=CC=C(C=C1)F)(C(=O)NC2=CC(=C(C=C2)C#N)C(F)(F)F)O	0
 41 | C1=C(C=C(C(=C1SC2=C(C(=CC(=C2)Cl)Cl)O)O)Cl)Cl	0
 42 | B(O)(O)O	0
 43 | C1=CC(=C(C(=C1)C(=O)C2=CC=C(C=C2)Br)N)CC(=O)O	0
 44 | CC(C)C(C(=O)NC(=O)N)Br	0
 45 | CC1=NN=C2N1C3=C(C=C(S3)Br)C(=NC2)C4=CC=CC=C4Cl	0
 46 | C1=CC2=C(C(=C(C=C2Br)Br)O)N=C1	0
 47 | CCOC1=CC=C(C=C1)NC(=O)CC(C)O	0
 48 | CC(C)(C)N1CCC(CC1)(C2=CC=CC=C2)C3=CC=CC=C3	0
 49 | CCCCOC1=CC=C(C=C1)CC(=O)NO	0
 50 | COC1=CC(=C(C(=C1)OC)C(=O)CCCN2CCCC2)OC	0
 51 | CCCCN=C(N)N=C(N)N	0
 52 | CCCCC(C(=O)N(C1=CC=CC=C1)NC2=CC=CC=C2)C(=O)O	0
 53 | CCCC(=O)NC1=C(C=C(C(=C1I)C=C(CC)C(=O)O)I)I	0
 54 | CC(C)(C)C(C)(C1CC23CCC1(C4C25CCN(C3CC6=C5C(=C(C=C6)O)O4)CC7CC7)OC)O	0
 55 | CC(C(=O)C1=CC(=CC=C1)Cl)NC(C)(C)C	0
 56 | CCCCOC(=O)C1=CC=C(C=C1)N	0
 57 | CCN(CC(C)O)C1=NN=C(C=C1)NNC(=O)OCC	0
 58 | CN1C2=C(C=C(C=C2)Cl)C(=NC(C1=O)OC(=O)N(C)C)C3=CC=CC=C3	0
 59 | CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn[nH]2)cc1	0
 60 | CC12CCC(=O)C=C1C=CC3C2CCC4(C3CCC45CCC(=O)O5)C	0
 61 | CC1=C(C(CCC1=O)(C)C)C=CC(=CC=CC(=CC=CC=C(C)C=CC=C(C)C=CC2=C(C(=O)CCC2(C)C)C)C)C	0
 62 | CCN(CC)CCOCCOC(=O)C1(c2ccccc2)CCCC1	0
 63 | CN(C)CCOC(C1=CC=C(C=C1)Cl)C2=CC=CC=N2	0
 64 | CCCC(C)(COC(=O)N)COC(=O)NC(C)C	0
 65 | CCCCCCNC(=O)N1C=C(C(=O)NC1=O)F	0
 66 | CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F	0
 67 | C1C(=C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CS3)C(=O)[O-])C[N+]4=CC=CC=C4	0
 68 | CC(C)C1=C(C(=C(C(=N1)C(C)C)COC)C2=CC=C(C=C2)F)C=CC(CC(CC(=O)O)O)O	0
 69 | CC(CCC(=O)O)C1CCC2C1(CCC3C2C(CC4C3(CCC(C4)O)C)O)C	0
 70 | C(C(Cl)(Cl)Cl)(O)O	0
 71 | C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)[N+](=O)[O-]	0
 72 | N=C(NCCCCCCNC(=N)NC(=N)Nc1ccc(Cl)cc1)NC(=N)Nc1ccc(Cl)cc1	0
 73 | CC(=O)C1(CCC2C1(CCC3C2C=C(C4=CC(=O)CCC34C)Cl)C)O	0
 74 | CC(=O)C1(CCC2C1(CCC3C2C=C(C4=CC(=O)CCC34C)Cl)C)OC(=O)C	0
 75 | CN1C(S(=O)(=O)CCC1=O)C2=CC=C(C=C2)Cl	0
 76 | C1=CC=C2C=C(C=CC2=C1)N(CCCl)CCCl	0
 77 | C(Cl)(Cl)Cl	0
 78 | CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl	0
 79 | CC(C)(CC1=CC=C(C=C1)Cl)N	0
 80 | C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)O	0
 81 | C1=CC=C(C=C1)C2=NC3=CC=CC=C3C(=C2)C(=O)O	0
 82 | COC1=CC(=CC(=C1OC)OC)C=CC(=O)N2CCN(CC2)CC(=O)N3CCCC3	0
 83 | CC(C)(C(=O)O)OC1=CC=C(C=C1)C2CC2(Cl)Cl	0
 84 | COC1CN(CCC1NC(=O)C2=CC(=C(C=C2OC)N)Cl)CCCOC3=CC=C(C=C3)F	0
 85 | C1=CC2=C(C(=C(C=C2Cl)I)O)N=C1	0
 86 | CC(CC1=CC=CC=C1)NCC2=CC=CC=C2Cl	0
 87 | CC(CN(C)C)C(C)(CC1=CC=C(C=C1)Cl)O	0
 88 | C1=CC(=CC=C1C(C2=CC=C(C=C2)Cl)C(Cl)(Cl)Cl)Cl	0
 89 | CCOC(=O)C(C)(C)OC1=CC=C(C=C1)Cl	0
 90 | CCOC(=O)NC(C)(C)CC1=CC=C(C=C1)Cl	0
 91 | CN(C)CCCC1C2=CC=CC=C2NC3=C1C=C(C=C3)Cl	0
 92 | CC1=C(C2=C(N1CC(=O)O)C=C(C=C2)OC)C(=O)C3=CC=C(C=C3)Cl	0
 93 | C1=CC=C(C=C1)C2=NC(C(=O)NC3=C2C=C(C=C3)Cl)C(=O)O	0
 94 | CN1CCN(CC1)C2=NC3=C(C=CC(=C3)Cl)NC4=CC=CC=C42	0
 95 | [Co]	0
 96 | CN1CCC23C4C1CC5=C2C(=C(C=C5)OC)OC3C(C=C4)O	0
 97 | CCC(C1=CC=C(C=C1)OCCN(CC)CC)C(CC)C2=CC=C(C=C2)OCCN(CC)CC	0
 98 | C1=CC=C2C(=C1)C=CC(=O)O2	0
 99 | CC1CC(CC(C1)(C)C)OC(=O)C(C2=CC=CC=C2)O	0
100 | CCC1(C(=O)NC(=O)NC1=O)C2=CCCCC2	0
101 | CC(=O)OC1=CC=C(C=C1)C(=C2CCCCC2)C3=CC=C(C=C3)OC(=O)C	0
102 | CN1CCC(=C2C3=CC=CC=C3C=CC4=CC=CC=C42)CC1	0
103 | C1=CC2=C(C(=C1)O)C(=O)C3=C(C2=O)C=CC=C3O	0
104 | CC(CC1=CC=CC=C1)N	0
105 | CCNC(C)CC1=CC(=CC=C1)C(F)(F)F	0
106 | CCN(CC)CCOC1=CC2=C(C=C1)N=C(S2)N(C)C	0
107 | C1=CC(=CC=C1NC(=O)C2=C(C=CC(=C2)Br)O)Br	0
108 | CCN(CC)CCOC(=O)C1(CCCCC1)C2CCCCC2	0
109 | CC=C(C1=CC=C(C=C1)O)C(=CC)C2=CC=C(C=C2)O	0
110 | CCC(=C(CC)C1=CC=C(C=C1)O)C2=CC=C(C=C2)O	0
111 | CC(C)(COC(=O)C(C1=CC=CC=C1)(C2=CC=CC=C2)O)N(C)C	0
112 | C1CN(CCC1(C2=CC=CC=C2)C(=O)O)CCC(C#N)(C3=CC=CC=C3)C4=CC=CC=C4	0
113 | CC(C)[C@@]1(NC(=O)[C@@H]2C[C@@H]3c4cccc5[nH]cc(c45)C[C@H]3N(C)C2)O[C@@]2(O)[C@@H]3CCCN3C(=O)[C@H](Cc3ccccc3)N2C1=O	0
114 | CC(C)C[C@H]1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=O)[C@@H]3C[C@@H]4c5cccc6[nH]cc(c56)C[C@H]4N(C)C3)(C(C)C)C(=O)N12	0
115 | CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4C(=O)[C@H](Cc4ccccc4)N3C2=O)C[C@@H]2c3cccc4[nH]cc(c34)C[C@H]21	0
116 | CN[C@@H]1[C@H](O[C@H]2[C@H](O[C@@H]3[C@@H](NC(=N)N)[C@H](O)[C@@H](NC(=N)N)[C@H](O)[C@H]3O)O[C@@H](C)[C@]2(O)CO)O[C@@H](CO)[C@H](O)[C@H]1O	0
117 | C1=C(OC(=C1)[N+](=O)[O-])C=CC2=CN=C(N=N2)N(CO)CO	0
118 | I[Sn](CC)(CC)I	0
119 | CC(CCC1=CC=CC=C1)NCC(C2=CC(=C(C=C2)O)C(=O)N)O	0
120 | CCC(C)CC(C)N	0
121 | C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])O	0
122 | CCCCCC(C=CC1C(CC(=O)C1CC=CCCCC(=O)O)O)O	0
123 | CC(CC1=CC=CC=C1)N2CCN(CC2)C(C)CC3=CC=CC=C3	0
124 | CN(C)CCOC(C1=CC=CC=C1)C2=CC=CC=C2	0
125 | CCOC(=O)C1(CCN(CC1)CCC(C#N)(C2=CC=CC=C2)C3=CC=CC=C3)C4=CC=CC=C4	0
126 | Cc1c(N(C)CS(=O)(=O)O)c(=O)n(-c2ccccc2)n1C	0
127 | CCN1C(=CC=CC=Cc2sc3ccccc3[n+]2CC)Sc2ccccc21	0
128 | CN(CCC1=CC=C(C=C1)NS(=O)(=O)C)CCOC2=CC=C(C=C2)NS(=O)(=O)C	0
129 | O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c1c[nH]c2ccccc12	0
130 | C5=C(C(OC3CC1N2C(CC(C1)C(C2)=O)C3)=O)C4=C(C=CC=C4)[NH]5	0
131 | C1CN(CCC1N2C3=C(C=C(C=C3)Cl)NC2=O)CCCN4C5=CC=CC=C5NC4=O	0
132 | CC1C2C(C3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O	0
133 | C2=C(C(C1=NC=CC=C1)(OCCN(C)C)C)C=CC=C2	0
134 | C1CN(CC=C1N2C3=CC=CC=C3NC2=O)CCCC(=O)C4=CC=C(C=C4)F	0
135 | CN1C2=C(C3=CC=CC=C3S1(=O)=O)OC(=O)N(C2=O)C4=CC=CC=N4	0
136 | C1=CC(=CC=C1S(=O)(=O)NC=NCCSCC2=CSC(=N2)N=C(N)N)Br	0
137 | CCC1CN2CCC3=CC(=C(C=C3C2CC1CC4C5=CC(=C(C=C5CCN4)OC)OC)OC)OC	0
138 | CN1CCCCC1CCC2=CC=CC=C2NC(=O)C3=CC=C(C=C3)OC	0
139 | CN[C@@H](C)[C@H](O)c1:c:c:c:c:c:1	0
140 | CNCC(C1=CC(=C(C=C1)O)O)O	0
141 | C=C1CC[C@H](O)C/C1=C/C=C1\CCC[C@@]2(C)[C@H]1CC[C@@H]2[C@H](C)/C=C/[C@H](C)C(C)C	0
142 | C(C(C(CO[N+](=O)[O-])O[N+](=O)[O-])O[N+](=O)[O-])O[N+](=O)[O-]	0
143 | CCC1C(C(C(C(=O)C(CC(C(C(C(C(C(=O)O1)C)OC2CC(C(C(O2)C)O)(C)OC)C)OC3C(C(CC(O3)C)N(C)C)O)(C)O)C)C)O)(C)O	0
144 | COC(=O)CCc1ccc(OCC(O)CNC(C)C)cc1	0
145 | CCO	0
146 | C#CC(O)(/C=C/Cl)CC	0
147 | CC12CCC3C(C1CCC2(C#C)O)CCC4=C3C=CC(=C4)O	0
148 | CCON=O	0
149 | C(CCl)Cl	0
150 | CCC1(CCC2C1(CCC3C2CCC4=CCCCC34)C)O	0
151 | CCNC1=NC2=C(C=C(C=C2)Cl)C(O1)(C)C3=CC=CC=C3	0
152 | CCNCC(C1=CC(=CC=C1)O)O	0
153 | CCOC(=O)C1=CN=CN1C(C)C2=CC=CC=C2	0
154 | CCOC(=O)C=C(C)C=CC=C(C)C=CC1=C(C(=C(C=C1C)OC)C)C	0
155 | C1=CC(=C(C(=C1C(=O)C2=CC(=C(C(=C2)O)O)O)O)O)O	0
156 | CCCCOCC(CN1C(=O)C(C(=O)NC1=O)(CC)C2=CC=CC=C2)OC(=O)N	0
157 | C1=CC=C(C=C1)C(COC(=O)N)COC(=O)N	0
158 | CCC(C1=CC=CC=C1)C(=O)OCCN2CCOC(C2C)C3=CC=CC=C3	0
159 | C1=CC=C(C(=C1)CC(=O)O)OC2=C(C=C(C=C2)Cl)Cl	0
160 | C1=CC(=CC=C1C2=NC(=CS2)CC(=O)O)Cl	0
161 | CC(CC1=CC=CC=C1)NCCN2C=NC3=C2C(=O)N(C(=O)N3C)C	0
162 | CCNC(C)CC1=CC(=CC=C1)C(F)(F)F	0
163 | CC(CC1=CC=C(C=C1)O)NCC(C2=CC(=CC(=C2)O)O)O	0
164 | CC(CC1=CC=CC=C1)NCCC#N	0
165 | O=C1NCC2(CCN(CCc3ccccc3)CC2)O1	0
166 | CC(=CCC1C(=O)N(N(C1=O)C2=CC=CC=C2)C3=CC=CC=C3)C	0
167 | C1CN(CCN1CC2=CC3=C(C=C2)OCO3)C(=O)COC4=CC=C(C=C4)Cl	0
168 | C1=CC=C(C(=C1)C(=O)OCC(CO)O)NC2=C3C=CC=C(C3=NC=C2)C(F)(F)F	0
169 | CN1C=C(C(=O)C2=C1C=C(C=C2)F)S(=O)C	0
170 | CS(=O)(=O)NC1=C(C=C2C(=C1)CCC2=O)OC3=C(C=C(C=C3)F)F	0
171 | COC(=O)NC1=NC2=C(N1)C=C(C=C2)C(=O)C3=CC=C(C=C3)F	0
172 | CC1CCC2=C3N1C=C(C(=O)C3=CC(=C2)F)C(=O)O	0
173 | CN1C(=O)CN=C(C2=C1C=CC(=C2)[N+](=O)[O-])C3=CC=CC=C3F	0
174 | CCOC(=O)Nc1ccc(NCc2ccc(F)cc2)nc1N	0
175 | CC1=CC2=C(C=C1)C(=NC(=O)N2C(C)C)C3=CC=C(C=C3)F	0
176 | C(C(F)(F)F)OCC(F)(F)F	0
177 | COCCCCC(=NOCCN)C1=CC=C(C=C1)C(F)(F)F	0
178 | CN(CC1=C(C=CC=C1Cl)NC(=O)C2=CC=CC=C2)CC(=O)N3CCOCC3	0
179 | C1COC(=O)N1N=CC2=CC=C(O2)[N+](=O)[O-]	0
180 | CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC(=C(C(=C2)OC)OC)OC	0
181 | CC1CN(CCN1)C2=C(C=C3C(=C2OC)N(C=C(C3=O)C(=O)O)C4CC4)F	0
182 | CC1=CC(=C(C=C1)C)OCCCC(C)(C)C(=O)O	0
183 | CC(C(CN1C=NC=N1)(C2=C(C=C(C=C2)F)F)O)S(=O)(=O)C	0
184 | CC(C1CCC(C(O1)OC2C(CC(C(C2O)OC3C(C(C(CO3)(C)O)NC)O)N)N)N)NC	0
185 | C1=CC=C(C(=C1)C(=O)OCC(CO)O)NC2=C3C=CC(=CC3=NC=C2)Cl	0
186 | CCC1(CCC(=O)NC1=O)C2=CC=CC=C2	0
187 | CC1CN(CCN1)C2=C(C(=C3C(=C2)N(C=C(C3=O)C(=O)O)C4CC4)C)F	0
188 | C1CCCN(CCC1)CCN=C(N)N	0
189 | CCC1(C(=O)NC(=O)NC1=O)C2=CCCCCC2	0
190 | C1=C(C(=C(C(=C1Cl)Cl)CC2=C(C(=CC(=C2Cl)Cl)Cl)O)O)Cl	0
191 | CCC(C1=CC=C(C=C1)O)C(CC)C2=CC=C(C=C2)O	0
192 | CC1(C(=O)NC(=O)N(C1=O)C)C2=CCCCC2	0
193 | C1NC2=CC(=C(C=C2S(=O)(=O)N1)S(=O)(=O)N)Cl	0
194 | CN1CCC23C4C1CC5=C2C(=C(C=C5)O)OC3C(=O)CC4	0
195 | CC(C)CC1=CC=C(C=C1)CC(=O)O	0
196 | C1CNCCC1CCC2=CNC3=CC=CC=C32	0
197 | CC1=C(C2=C(N1C(=O)C3=CC=C(C=C3)Cl)C=CC(=C2)OC)CC(=O)O	0
198 | CC(C1=CC=C(C=C1)N2CC3=CC=CC=C3C2=O)C(=O)O	0
199 | C1CN(CCC1NC(=O)C2=CC=CC=C2)CCC3=CNC4=CC=CC=C43	0
200 | C/C=C(/C)C(=O)O[C@H]1C(C)=C[C@]23C(=O)[C@@H](C=C(CO)[C@@H](O)[C@]12O)[C@H]1[C@@H](C[C@H]3C)C1(C)C	0
201 | CC(=O)NCC1=C(C(=C(C(=C1I)C(=O)O)I)NC(=O)C)I	0
202 | CCOC(=O)CCCCCCCCC(C)C1=CC=CC=C1I	0
203 | CC(C)NNC(=O)C1=CC=NC=C1	0
204 | CC(C)Nc1ncccn1	0
205 | CC1=CC(=NO1)C(=O)NNCC2=CC=CC=C2	0
206 | CC(C)NCC(C1=CC(=C(C=C1)O)O)O	0
207 | CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C	0
208 | CC1=CC(=NO1)NC(=O)C2=C(C3=CC=CC=C3S(=O)(=O)N2C)O	0
209 | CC(=O)N1CCN(CC1)C2=CC=C(C=C2)OCC3COC(O3)(CN4C=CN=C4)C5=C(C=C(C=C5)Cl)Cl	0
210 | CC(C1=CC(=CC=C1)C(=O)C2=CC=CC=C2)C(=O)O	0
211 | C1CN2C(=CC=C2C(=O)C3=CC=CC=C3)C1C(=O)O	0
212 | CS(=O)(=O)C1=CC(=CC2=C1N(C3=C2CCC3CC(=O)O)CC4=CC=C(C=C4)Cl)F	0
213 | C1=CC(=CC=C1C#N)C(C2=CC=C(C=C2)C#N)N3C=NC=N3	0
214 | CCC(C(CC(C)N(C)C)(C1=CC=CC=C1)C2=CC=CC=C2)OC(=O)C	0
215 | CC(CC1=CC=CC=C1)N	0
216 | C1CSC2=NC(CN21)C3=CC=CC=C3	0
217 | C1=CC(=C(C=C1C(CN)O)O)O	0
218 | C1(C(C(C(C(C1Cl)Cl)Cl)Cl)Cl)Cl	0
219 | CN(C)C(=O)C(CCN1CCC(CC1)(C2=CC=C(C=C2)Cl)O)(C3=CC=CC=C3)C4=CC=CC=C4	0
220 | CC1=CC(=C(C=C1)NC2=C(C=CC=C2Cl)F)CC(=O)O	0
221 | CC12CCC3C(C1CCC2(C#C)O)CCC4=CCCCC34	0
222 | C1CN2C(=N1)C3=CC=CC=C3C2(C4=CC=C(C=C4)Cl)O	0
223 | CC(C1=CC=CC=C1)NN	0
224 | CC1=CC(=CC=C1)CN2CCN(CC2)C(C3=CC=CC=C3)C4=CC=C(C=C4)Cl	0
225 | CN(C)CC(OC1=CC=CC=C1)OC2=CC=CC=C2	0
226 | CC(CC1=CC=CC=C1)NCCCCl	0
227 | CC1=CC2C(CCC3(C2CCC3(C(=O)C)OC(=O)C)C)C4(C1=CC(=O)CC4)C	0
228 | CCN(CC)CCCC(C)NC1=C2C=C(C=CC2=NC3=C1C=CC(=C3)Cl)OC	0
229 | CN1CCCC(C1)CN2C3=CC=CC=C3SC4=CC=CC=C42	0
230 | CC1=CC=CC=C1OCC(CO)O	0
231 | CCCC(C)(COC(=O)N)COC(=O)N	0
232 | O=C(Nc1ccccc1)c1cc(Br)cc(Br)c1O	0
233 | CC(CC1=CC=CC=C1)NC	0
234 | [C@H]23[C@@H]([C@@]1(C(=CC(=O)C=C1)CC2)C)CC[C@]4([C@H]3CC[C@]4(C)O)C	0
235 | CN(C)CCN(CC1=CC=CS1)C2=CC=CC=N2	0
236 | CC1=CC=CC=C1N2C(=NC3=CC=CC=C3C2=O)C	0
237 | CN1CCC2=CC(=C(C=C2C1CCC3=CC=C(C=C3)Cl)OC)OC	0
238 | COC(F)(F)C(Cl)Cl	0
239 | COC(=O)C(C1CCCCN1)C2=CC=CC=C2	0
240 | CCC1(C(=O)C(CNC1=O)C)CC	0
241 | CC1=CC(=C(C(=C1OC(=O)C)C)C)OCC(CNC(C)C)O	0
242 | CC(=O)NC1=C(C(=C(C(=C1I)C(=O)NC2C(C(C(OC2O)CO)O)O)I)N(C)C(=O)C)I	0
243 | CN1CCN2C(C1)C3=CC=CC=C3CC4=CC=CC=C42	0
244 | CC(C)C1C2=C(CCC1(CCN(C)CCCC3=NC4=CC=CC=C4N3)OC(=O)COC)C=C(C=C2)F	0
245 | CCCCN1CC(C(C(C1CO)O)O)O	0
246 | CC1=CC(=NN=C1NCCN2CCOCC2)C3=CC=CC=C3	0
247 | CN(C)C1C2CC3CC4=C(C=CC(=C4C(=C3C(=O)C2(C(=C(C1=O)C(=O)N)O)O)O)O)N(C)C	0
248 | CCOC(=NC1=C[N+](=NO1)N2CCOCC2)[O-]	0
249 | CC1=CC(=C(C=C1OC(=O)C)C(C)C)OCCN(C)C	0
250 | CC(C1=CC(=C(C=C1)Cl)Cl)N2C(=O)CC(=N2)N	0
251 | CCCCCCCCCC(=O)OC1CCC2C1(CCC3C2CCC4=CC(=O)CCC34)C	0
252 | CC12CCC3C(C1CCC2OC(=O)CCC4=CC=CC=C4)CCC5=CC(=O)CCC35	0
253 | CCC1=NN(C(=O)N1CCOC2=CC=CC=C2)CCCN3CCN(CC3)C4=CC(=CC=C4)Cl	0
254 | CC1=C2C(=NC=C1)N(C3=C(C=CC=N3)C(=O)N2)C4CC4	0
255 | C1=CC=C(C=C1)CNC(=O)CCNNC(=O)C2=CC=NC=C2	0
256 | CO[C@]12C[C@@H](COC(=O)c3cncc(Br)c3)CN(C)[C@@H]1Cc1cn(C)c3cccc2c13	0
257 | CC1=C(C(C(=C(N1)C)C(=O)OC)C2=CC=CC=C2[N+](=O)[O-])C(=O)OC	0
258 | C1=CC(=CC=C1C(=O)NN=CC2=CC=C(O2)[N+](=O)[O-])O	0
259 | CCN(CC)C(=O)C1=CN=CC=C1	0
260 | CS(=O)(=O)NC1=C(C=C(C=C1)[N+](=O)[O-])OC2=CC=CC=C2	0
261 | CN(C)CC1=CC=C(O1)CSCCNC(=C[N+](=O)[O-])NCC2=CC3=C(C=C2)OCO3	0
262 | CC1=NC(=CN1C2=CC=C(C=C2)[N+](=O)[O-])[N+](=O)[O-]	0
263 | C1=C(OC(=C1)[N+](=O)[O-])C=NNC(=O)N	0
264 | C1=C(OC(=C1)[N+](=O)[O-])C=CC(=NN=C(N)N)C=CC2=CC=C(O2)[N+](=O)[O-]	0
265 | C1=CC2=C(C=CC(=C2N=C1)O)[N+](=O)[O-]	0
266 | CN1CC(C2=C(C1)C(=CC=C2)N)C3=CC=CC=C3	0
267 | CN1CCC2=CC3=C(C(=C2C1C4C5=C(C(=C(C=C5)OC)OC)C(=O)O4)OC)OCO3	0
268 | CC1=C(C=CC2=C1OC(=O)C(=C2O)NC(=O)C3=CC(=C(C=C3)O)CC=C(C)C)OC4C(C(C(C(O4)(C)C)OC)OC(=O)N)O	0
269 | CC1=CN=C(C(=C1OC)C)CS(=O)C2=NC3=C(N2)C=C(C=C3)OC	0
270 | CC(C)NCC(C1=CC(=CC(=C1)O)O)O	0
271 | CCC(CC)(C1=CC=CC=C1)C(=O)OCCOCCN(CC)CC	0
272 | CCN(CC)CCC1=NC(=NO1)C2=CC=CC=C2	0
273 | CC(CN1C2=CC=CC=C2S(=O)(=O)C3=CC=CC=C31)CN(C)C	0
274 | COc1ccc2c3c1O[C@H]1C(=O)CC[C@@]4(O)[C@@H](C2)N(C)CC[C@]314	0
275 | CCCCC1C(=O)N(N(C1=O)C2=CC=C(C=C2)O)C3=CC=CC=C3	0
276 | CC(=O)OC1=CC=C(C=C1)C2(C3=CC=CC=C3NC2=O)C4=CC=C(C=C4)OC(=O)C	0
277 | C1=CC=C2C(=C1)C(C(=O)N2)(C3=CC=C(C=C3)O)C4=CC=C(C=C4)O	0
278 | CN(C)CC(=O)OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(=O)O	0
279 | C1C(C(C(C(C1N)OC2C(C(C(C(O2)CO)O)O)N)OC3C(C(C(O3)CO)OC4C(C(C(C(O4)CN)O)O)N)O)O)N	0
280 | CCC(=O)NS(=O)(=O)C1=CC=C(C=C1)C2=C(ON=C2C3=CC=CC=C3)C	0
281 | CN(CC#C)CC1=CC=CC=C1	0
282 | C1=CC=C(C=C1)C2C(=O)N=C(O2)N	0
283 | CCCC(C)C1(C(=O)NC(=O)NC1=O)CC	0
284 | O[C@@H]1CO[C@@H](O[C@@H]2CO[C@@H](O)[C@H](OS(O)(=O)=O)[C@H]2OS(O)(=O)=O)[C@H](OS(O)(=O)=O)[C@H]1OS(O)(=O)=O	0
285 | C1CCC2=NN=NN2CC1	0
286 | CCCN1CC(CC2C1CC3=CNC4=CC=CC2=C34)CSC	0
287 | C1CCC(CC1)C(CC2CCCCN2)C3CCCCC3	0
288 | CCOC1=CC=C(C=C1)NC(=O)C	0
289 | CC1=CC(=O)N(N1C)C2=CC=CC=C2	0
290 | C1=CC=C(C=C1)N=NC2=C(N=C(C=C2)N)N	0
291 | CC1C(OCCN1C)C2=CC=CC=C2	0
292 | C1=CC=C(C=C1)CCN=C(N)N=C(N)N	0
293 | C1=CC=C(C=C1)NNC(=O)N	0
294 | CC1C(OCCN1)C2=CC=CC=C2	0
295 | CCC1(C(=O)NC(=O)NC1=O)C2=CC=CC=C2	0
296 | C1=CC=C(C=C1)O	0
297 | C1=CC=C2C(=C1)C(=O)OC2(C3=CC=C(C=C3)O)C4=CC=C(C=C4)O	0
298 | CC(COC1=CC=CC=C1)NN	0
299 | CC(C)(CC1=CC=CC=C1)N	0
300 | CCCCC1C(=O)N(N(C1=O)C2=CC=CC=C2)C3=CC=CC=C3	0
301 | CNCC(C1=CC(=CC=C1)O)O	0
302 | CC(C(C1=CC=CC=C1)O)N	0
303 | CC(C(C1=CC=CC=C1)O)N	0
304 | [H][C@@](C)(N)[C@]([H])(O)C1=CC=CC=C1	0
305 | C1=CC=C(C(=C1)C(=O)NC2=CC=C(C=C2)S(=O)(=O)NC3=NC=CS3)C(=O)O	0
306 | CC(=NO)C1=CC=C(C=C1)OCC(=O)N2CCCCC2	0
307 | CCC1=CN=C(C=C1)CCOC2=CC=C(C=C2)CC3C(=O)NC(=O)S3	0
308 | C1CN(CCC1C(=O)N)CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)Cl	0
309 | CC[N+]1(CCCC(C1)OC(=O)C(C2=CC=CC=C2)(C3=CC=CC=C3)O)C	0
310 | C1CNCCN1	0
311 | C1CCNC(C1)C(C2=CC=CC=C2)(C3=CC=CC=C3)O	0
312 | CC(C1=CC(=C(C=C1)N2CC=CC2)Cl)C(=O)O	0
313 | C1C(O1)CCl	0
314 | CCCCCCC(CC=CCCCCCCCOC(=O)OCCOCC(COCCOC(=O)OCCCCCCCC=CCC(CCCCCC)O)OCCOC(=O)OCCCCCCCC=CCC(CCCCCC)O)O	0
315 | C=CN1CCCC1=O	0
316 | CC(C)NCC(COC1=CC=C(C=C1)NC(=O)C)O	0
317 | CCCNC1CCC2=C(C1)SC(=N2)N	0
318 | CC(CC1=CC=CC=C1)NCCC(C2=CC=CC=C2)C3=CC=CC=C3	0
319 | CC(C)(C)C1=CC(=CC(=C1O)C(C)(C)C)SC(C)(C)SC2=CC(=C(C(=C2)C(C)(C)C)O)C(C)(C)C	0
320 | CCCN(CCC)C(=O)C(CCC(=O)O)NC(=O)C1=CC=CC=C1	0
321 | CC(C)NCC(C1=CC2=CC=CC=C2C=C1)O	0
322 | CCCOC(=O)CC1=CC(=C(C=C1)OCC(=O)N(CC)CC)OC	0
323 | CC(C)C1=C(C(=CC=C1)C(C)C)O	0
324 | CCC(=O)OC(CC1=CC=CC=C1)(C2=CC=CC=C2)C(C)CN(C)C	0
325 | CC1=C(C(=O)N(N1C)C2=CC=CC=C2)C(C)C	0
326 | CC(CC1(C(=O)NC(=O)NC1=O)CC=C)O	0
327 | CC(C(C1=CC=CC=C1)O)NC	0
328 | C(C1(C(C=CNC1=O)=O)CC)C	0
329 | CC1=NC=C(C(=C1O)CO)CSSCC2=CN=C(C(=C2CO)O)C	0
330 | CCCC(C(=O)C1=CC=C(C=C1)C)N2CCCC2	0
331 | C1CC2CCCN2C1	0
332 | CNC(=C[N+](=O)[O-])NCCSCC1=CC=C(O1)CN(C)C	0
333 | CCC(=O)OC1C(CC2C1(CCC3C2CCC4C3(CC(C(C4)OC(=O)C)N5CCCCC5)C)C)[N+]6(CCCCC6)CC=C	0
334 | CCN1CCCC1CNC(=O)C2=C(C=CC(=C2OC)Br)OC	0
335 | COC(=O)[C@H]1[C@H]2C[C@@H]3c4[nH]c5cc(OC)ccc5c4CCN3C[C@H]2C[C@@H](OC(=O)c2cc(OC)c(OC)c(OC)c2)[C@@H]1OC	0
336 | [C@@H]2([N]1N=C(C(N)=O)N=C1)O[C@H](CO)[C@H]([C@H]2O)O	0
337 | CC1=C(N(N=C1C(=O)NN2CCCCC2)C3=C(C=C(C=C3)Cl)Cl)C4=CC=C(C=C4)Cl	0
338 | CS(=O)(=O)C1=CC=C(C=C1)C2=C(C(=O)OC2)C3=CC=CC=C3	0
339 | CN(CCOC1=CC=C(C=C1)CC2C(=O)NC(=O)S2)C3=CC=CC=N3	0
340 | CC1C2CCC3(C=CC(=O)C(=C3C2OC1=O)C)C	0
341 | CCCC(C)C1(C(=O)NC(=O)NC1=O)CC=C	0
342 | CC(CC1=CC=CC=C1)N(C)CC#C	0
343 | C1CN(CCC1C2=CN(C3=C2C=C(C=C3)Cl)C4=CC=C(C=C4)F)CCN5CCNC5=O	0
344 | CC(C)CC(C1(CCC1)C2=CC=C(C=C2)Cl)N(C)C	0
345 | CC1=CC2=C(C=C1CC(=O)C3=C(C=CS3)S(=O)(=O)NC4=C(C(=NO4)C)Cl)OCO2	0
346 | C1=C(C(=O)NC(=O)N1C2C(C(C(O2)CO)O)O)C=CBr	0
347 | CC(C)NCC(C1=CC=C(C=C1)NS(=O)(=O)C)O	0
348 | CC1CN(CC(N1)C)C2=C(C(=C3C(=C2F)N(C=C(C3=O)C(=O)O)C4CC4)N)F	0
349 | C1CCN2CC3CC(C2C1)CN4C3CCCC4	0
350 | C1CN2CC3=CCOC4CC(=O)N5C6C4C3CC2C61C7=CC=CC=C75	0
351 | C1=CC(=CC=C1N)S(=O)(=O)NC(=O)N	0
352 | CC(=CC(=O)NS(=O)(=O)C1=CC=C(C=C1)N)C	0
353 | COC1=NC(=NC(=C1)NS(=O)(=O)C2=CC=C(C=C2)N)OC	0
354 | CC1=CC(=NC(=N1)NS(=O)(=O)C2=CC=C(C=C2)N)C	0
355 | C1=CC(=CC=C1N)S(=O)(=O)N=C(N)N	0
356 | CC1=NN=C(S1)NS(=O)(=O)C2=CC=C(C=C2)N	0
357 | COC1=NN=C(C=C1)NS(=O)(=O)C2=CC=C(C=C2)N	0
358 | COC1=CN=C(N=C1)NS(=O)(=O)C2=CC=C(C=C2)N	0
359 | C1=CC(=CC=C1N)S(=O)(=O)N	0
360 | C1=CC(=CC=C1N)S(=O)(=O)NC2=NC=CS2	0
361 | CC1=CC(=NC(=N1)C)NS(=O)(=O)C2=CC=C(C=C2)N	0
362 | CCCCCCCCNC(C)C(C1=CC=C(C=C1)SC(C)C)O	0
363 | CC(C(C1=CC=C(C=C1)O)O)NC	0
364 | CC(C1=CC=C(C=C1)C(=O)C2=CC=CS2)C(=O)O	0
365 | CCCCC1(C(=O)N(N(C1=O)C2=CC=CC=C2)C3=CC=CC=C3)COC(=O)CCC(=O)O	0
366 | CCCCCN=C(N)NN=CC1=CNC2=C1C=C(C=C2)OC	0
367 | CCC[C@H](NC(=O)[C@@H]1[C@H]2CCC[C@H]2CN1C(=O)[C@@H](NC(=O)[C@@H](NC(=O)c3cnccn3)C4CCCCC4)C(C)(C)C)C(=O)C(=O)NC5CC5	0
368 | CC1CN(CCN1)C2=C(C=C3C(=C2)N(C=C(C3=O)C(=O)O)C4=C(C=C(C=C4)F)F)F	0
369 | CN1C2=C(C=C(C=C2)Cl)C(=NC(C1=O)O)C3=CC=CC=C3	0
370 | CC(C)N1CCN(CC1)C2=CC=C(C=C2)OCC3COC(O3)(CN4C=NC=N4)C5=C(C=C(C=C5)Cl)Cl	0
371 | CC(C)(C)C1=CC=C(C=C1)C(CCCN2CCC(CC2)C(C3=CC=CC=C3)(C4=CC=CC=C4)O)O	0
372 | CC(CC(C1=CC=CC=C1)C2=CC=CC=C2)NC(C)(C)C	0
373 | CCC(=O)OC1CCC2C1(CCC3C2CCC4=CC(=O)CCC34C)C	0
374 | C1=CC=C(C=C1)NC(=O)C2=C(C(=C(C(=C2Cl)Cl)Cl)Cl)O	0
375 | CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O	0
376 | CN1C(=O)CN=C(C2=C1C=CC(=C2)Cl)C3=CCCCC3	0
377 | C1CC(=O)NC(=O)C1N2C(=O)C3=CC=CC=C3C2=O	0
378 | CN1CCC(CC1)N(CC2=CC=CS2)C3=CC=CC=C3	0
379 | CC[C@H](C)[C@]1(CC)C(=NC(=S)NC1=O)[O-]	0
380 | CN1CCCCC1CCN2C3=CC=CC=C3SC4=C2C=C(C=C4)SC	0
381 | C1=CSC(=C1)C(=O)C2=C(C(=C(C=C2)OCC(=O)O)Cl)Cl	0
382 | CC1=CC(=C(C2=C1C=CC=N2)O)Br	0
383 | C1C(NCS1)C(=O)O	0
384 | CC1=C(C(=CC=C1)C)NC(=O)C(C)N	0
385 | CC1=CC=C(C=C1)C(=O)C2=CC(=C(C(=C2)O)O)[N+](=O)[O-]	0
386 | CN(CC(=O)O)C(=S)C1=CC=CC2=C1C=CC(=C2C(F)(F)F)OC	0
387 | [C@H]2(C1=CC(=CC=C1N([C@H](CC)C2)C(=O)OCC)C(F)(F)F)N(C(=O)OC)CC3=CC(=CC(=C3)C(F)(F)F)C(F)(F)F	0
388 | C1C(C1N)C2=CC=CC=C2	0
389 | C1CN(CCN1CCCN2C(=O)N3C=CC=CC3=N2)C4=CC(=CC=C4)Cl	0
390 | CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C	0
391 | CC(=O)N1C2=CC=CC=C2C(C1=O)(C3=CC=C(C=C3)OC(=O)C)C4=CC=C(C=C4)OC(=O)C	0
392 | CC1=NN=C2N1C3=C(C=C(C=C3)Cl)C(=NC2)C4=CC=CC=C4Cl	0
393 | O=C(Nc1ccc(Br)cc1)c1cc(Br)cc(Br)c1O	0
394 | CC(Cl)(Cl)Cl	0
395 | CN(C)CCOC1=CC=C(C=C1)CNC(=O)C2=CC(=C(C(=C2)OC)OC)OC	0
396 | CCN(CC)CCOC1=CC=C(C=C1)C(CC2=CC=C(C=C2)Cl)(C3=CC=C(C=C3)C)O	0
397 | CC1=C(C2=C(CCC(O2)(C)COC3=CC=C(C=C3)CC4C(=O)NC(=O)S4)C(=C1O)C)C	0
398 | C1C2C(C2N)CN1C3=C(C=C4C(=O)C(=CN(C4=N3)C5=C(C=C(C=C5)F)F)C(=O)O)F	0
399 | C1=CC=C2C(=C1)C(=CN2)CC(C(=O)O)N	0
400 | CC(=O)O[C@]1(C(C)=O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3[C@@H](c3ccc(N(C)C)cc3)C[C@@]21C	0
401 | CCOC(=O)N	0
402 | CC1=C(C(=NO1)C2=CC=CC=C2)C3=CC=C(C=C3)S(=O)(=O)N	0
403 | COC1=CC(=CC(=C1OC)C(=O)NCC2CCCN2CC=C)S(=O)(=O)N	0
404 | CCC=C(C)C1(C(=O)NC(=O)NC1=O)CC	0
405 | CCC12CCCN3C1C4=C(CC3)C5=CC=CC=C5N4C(C2)(C(=O)OC)O	0
406 | C=CCl	0
407 | [C@]2(OC1=C(C(=C(C(=C1CC2)C)O)C)C)(CCC[C@@H](CCC[C@@H](CCCC(C)C)C)C)C	0
408 | CCOC(C(=O)C1=CC=C(C=C1)C2=CC=CC=C2)NC3=CC=C(C=C3)C(=O)O	0
409 | CCOC(=O)CNC(C1CCCCC1)C(=O)N2CCC2C(=O)NCC3=CC=C(C=C3)C(=NO)N	0
410 | CN(C)CC=C(C1=CC=C(C=C1)Br)C2=CN=CC=C2	0
411 | COC(CN1CCN(CC1)CC(C(C2=CC=CC=C2)OC)O)C3=CC=CC=C3	0
412 | CC1=C(N(C(=C1)CC(=O)O)C)C(=O)C2=CC=C(C=C2)Cl	0
413 | CN1CCN(CC1)C(=O)OC2C3=NC=CN=C3C(=O)N2C4=NC=C(C=C4)Cl	0
414 | 
415 | 
416 | 


--------------------------------------------------------------------------------
/Tutorial.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "fa1cbc05",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# D-GCAN Deep Dive"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "d4a486bf",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "In this tutorial, we take a deep dive into D-GCAN and show how it builds a drug-likeness prediction model from scratch.\n",
 17 |     "\n",
 18 |     "Let's start!"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "id": "d7cacf2e",
 24 |    "metadata": {},
 25 |    "source": [
 26 |     "## Part I: Overview of D-GCAN and Data"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "id": "43a44a94",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "The drug-likeness has been widely used as a criterion to distinguish drug-like molecules from non-drugs. Developing reliable computational methods to predict drug-likeness of compounds is crucial to triage unpromising molecules and accelerate the drug discovery process.In this study, a deep learning method was developed to predict drug-likeness based on the graph convo-lutional attention network (D-GCAN) directly from molecular structures. The model combined the ad-vantages of graph convolution and attention mechanism. Results showed that the D-GCAN outper-formed other state-of-the-art models for drug-likeness prediction. Molecular graph was used as encoding method for drug-likeness prediction.\n",
 35 |     "\n",
 36 |     "A dataset with enough drugs and non-drugs is the prerequisite to train accurate deep neural network models for prediction of drug-likeness.In this study, D-GCAN model was trained on the dataset released by Beker, which consists of drug and non-drug sets (abbrevi-ated as: Drugs and Non-drugs). The Drugs set includes 2136 FDA small-molecule drugs assembled from Drugbank. The Non-drugs was chosen from ZINC15. Compounds with a maximum fingerprint-based Tanimoto similarity to drugs above 0.85 were removed, and standard binary classification was used to itera-tively refine the set of reliable negative set. Since the negative set is much larger than the positive set, it was randomly down-sampled to create a balanced dataset for model training. The dataset was randomly divided into training, validation, and test sets at ratio 8:1:1. In addition, two additional datasets, the non-US dataset and the bRo5 dataset, were used to test the performance of the model. The non-US dataset composes of 1281 word-wide drugs from Drugbank and an equal size of non-drugs from ZINC15. The bRo5 dataset includes 135 FDA and non-US drugs beyond Ro5 space (bRo5). The GDB-13 data-base was used to test the ability of D-GCAN in screening large-scale data. It consists of about 977 million drug-like small molecules according to Lipinski’s rule. All molecules contain up to 13 heavy atoms , and they were stored in the canonical SMILES. All the independent test datasets and validation dataset were not used in the training process.\n"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "id": "e597ea85",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "## Part II: To train the model"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 1,
 50 |    "id": "07e8ed86",
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "import train"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 2,
 60 |    "id": "dcca4e9f",
 61 |    "metadata": {},
 62 |    "outputs": [
 63 |     {
 64 |      "name": "stdout",
 65 |      "output_type": "stream",
 66 |      "text": [
 67 |       "The code uses a GPU!\n",
 68 |       "----------------------------------------------------------------------------------------------------\n",
 69 |       "Just a moment......\n",
 70 |       "----------------------------------------------------------------------------------------------------\n",
 71 |       "../dataset/data_train.txt\n",
 72 |       "../dataset/data_test.txt\n",
 73 |       "The preprocess has finished!\n",
 74 |       "# of training data samples: 3802\n",
 75 |       "# of test data samples: 428\n",
 76 |       "----------------------------------------------------------------------------------------------------\n",
 77 |       "Creating a model.\n",
 78 |       "# of model parameters: 311698\n",
 79 |       "----------------------------------------------------------------------------------------------------\n",
 80 |       "Start training.\n",
 81 |       "The result is saved in the output directory every epoch!\n",
 82 |       "The training will finish in about 0 hours 21 minutes.\n",
 83 |       "----------------------------------------------------------------------------------------------------\n",
 84 |       "Epoch\tTime(sec)\tLoss_train\tLoss_test\tAUC_train\tAUC_test\n",
 85 |       "1\t9.334350300000011\t318.02376973629\t33.23613902926445\t0.6330387783115992\t0.5116822429906542\n",
 86 |       "2\t16.232699300000007\t275.59704649448395\t28.36697283387184\t0.7726390395642837\t0.5233644859813084\n",
 87 |       "3\t22.533227600000004\t258.0953543186188\t26.692712754011154\t0.8227923594659818\t0.530373831775701\n",
 88 |       "4\t30.035969300000005\t244.29262351989746\t25.99587020277977\t0.8555440089003034\t0.5934579439252337\n",
 89 |       "5\t36.20996820000002\t235.61571648716927\t26.206634640693665\t0.8711811167445258\t0.544392523364486\n",
 90 |       "6\t42.15839070000001\t235.10905063152313\t24.74921104311943\t0.8782201330617486\t0.5771028037383178\n",
 91 |       "7\t48.091756000000004\t230.60763642191887\t24.482909947633743\t0.8858224754798858\t0.6308411214953271\n",
 92 |       "8\t53.9427063\t225.9473716020584\t25.168260991573334\t0.894445889698231\t0.5794392523364486\n",
 93 |       "9\t59.887297399999994\t220.88472372293472\t23.143073588609695\t0.9038094737308211\t0.6845794392523364\n",
 94 |       "10\t65.88160690000001\t220.29008296132088\t23.10640263557434\t0.9080117951159034\t0.6542056074766355\n",
 95 |       "11\t71.7950055\t220.04156962037086\t23.67304638028145\t0.905873895764607\t0.7289719626168224\n",
 96 |       "12\t77.81804110000002\t214.85031658411026\t23.34747040271759\t0.9159177330794608\t0.8200934579439252\n",
 97 |       "13\t83.67658890000001\t212.33444252610207\t23.14932319521904\t0.9178444716275156\t0.735981308411215\n",
 98 |       "14\t89.6064955\t211.54040449857712\t22.778073489665985\t0.9219235005645715\t0.7593457943925234\n",
 99 |       "15\t95.5943882\t208.26400744915009\t22.901916056871414\t0.9267551530985012\t0.7663551401869159\n",
100 |       "16\t101.6206004\t209.3945328295231\t23.913705557584763\t0.9246417461863752\t0.6915887850467289\n",
101 |       "17\t107.65320110000002\t206.03158766031265\t23.282782286405563\t0.930114906901056\t0.7313084112149533\n",
102 |       "18\t113.61952920000002\t207.53857171535492\t22.225304275751114\t0.9226543992295261\t0.8037383177570093\n",
103 |       "19\t120.54484979999998\t204.79183167219162\t23.475462794303894\t0.926265719441185\t0.735981308411215\n",
104 |       "20\t126.48899880000002\t205.36031165719032\t22.78501933813095\t0.9291473863661524\t0.8107476635514018\n",
105 |       "21\t132.3617932\t202.3321330845356\t23.385528802871704\t0.9302150906635376\t0.7897196261682243\n",
106 |       "22\t138.31940950000003\t202.55410113930702\t23.08760157227516\t0.9293607611751943\t0.7686915887850467\n",
107 |       "23\t144.17849070000003\t198.95897144079208\t22.36356022953987\t0.9377836694932141\t0.8294392523364486\n",
108 |       "24\t150.04248400000003\t197.13710144162178\t23.3654263317585\t0.9351247869019417\t0.8084112149532711\n",
109 |       "25\t155.89731129999998\t256.36723348498344\t22.31619429588318\t0.7880461675559591\t0.8411214953271028\n",
110 |       "26\t161.73105990000002\t199.1333883702755\t22.19395723938942\t0.9381437221865521\t0.8200934579439252\n",
111 |       "27\t167.57376660000003\t195.61116680502892\t21.885735362768173\t0.9357744592290832\t0.8621495327102804\n",
112 |       "28\t173.4351409\t196.21020331978798\t21.808892458677292\t0.937718217946731\t0.8808411214953271\n",
113 |       "29\t179.35259220000003\t196.93134278059006\t22.267054110765457\t0.9385652135408595\t0.8481308411214953\n",
114 |       "30\t185.28482499999998\t195.89555063843727\t22.040870487689972\t0.9386104622844113\t0.8434579439252337\n",
115 |       "31\t191.16947059999998\t194.0237057507038\t22.781775504350662\t0.9417760754533178\t0.8387850467289719\n",
116 |       "32\t197.0987102\t193.68072113394737\t22.449314266443253\t0.9423293001527663\t0.8294392523364486\n",
117 |       "33\t203.01129849999998\t192.5338954925537\t22.377066612243652\t0.9452480516748955\t0.8621495327102804\n",
118 |       "34\t209.6004434\t192.58278796076775\t23.285291463136673\t0.9402174153696283\t0.8714953271028038\n",
119 |       "35\t216.03077720000002\t196.01435166597366\t24.061037868261337\t0.9359749651294087\t0.8014018691588785\n",
120 |       "36\t222.4218536\t193.9636361002922\t22.313345968723297\t0.936061864857086\t0.8434579439252337\n",
121 |       "37\t228.7779303\t192.51033294200897\t22.285043627023697\t0.9435566896185268\t0.8317757009345794\n",
122 |       "38\t235.1833634\t188.01407945156097\t22.830698162317276\t0.94882975955897\t0.8014018691588785\n",
123 |       "39\t241.73452799999998\t193.91294729709625\t22.711496233940125\t0.9402640478668054\t0.8294392523364486\n",
124 |       "40\t248.1350879\t192.2110168337822\t21.79123494029045\t0.9453036785706378\t0.8785046728971962\n",
125 |       "41\t254.51809260000002\t189.3926584124565\t23.183754086494446\t0.9464263178869528\t0.822429906542056\n",
126 |       "42\t260.67076799999995\t197.67854461073875\t24.210958123207092\t0.9356472922709057\t0.8714953271028038\n",
127 |       "43\t266.8244926\t195.8016073703766\t22.462971657514572\t0.9351026468439347\t0.8107476635514018\n",
128 |       "44\t273.0223882\t191.97943636775017\t23.224840223789215\t0.940738813735692\t0.8247663551401869\n",
129 |       "45\t279.0850431\t190.848837941885\t22.82283341884613\t0.9461044567936768\t0.8060747663551402\n",
130 |       "46\t285.1555958\t190.04618108272552\t22.433310955762863\t0.9440021199105542\t0.7967289719626168\n",
131 |       "47\t291.1587971\t190.5216095149517\t22.426137387752533\t0.9457633615250072\t0.822429906542056\n",
132 |       "48\t297.2169235\t185.92078268527985\t22.22221177816391\t0.9480603925432284\t0.8247663551401869\n",
133 |       "49\t303.3877255\t187.782156676054\t22.887968957424164\t0.94423528239644\t0.8364485981308412\n",
134 |       "50\t309.40409650000004\t187.28414443135262\t21.483285009860992\t0.9458896982310093\t0.8714953271028038\n",
135 |       "51\t315.44450670000003\t185.18417713046074\t21.38184556365013\t0.9481489527752565\t0.8785046728971962\n",
136 |       "52\t321.7301946\t182.16105404496193\t24.673764526844025\t0.9509743009276684\t0.7920560747663551\n",
137 |       "53\t327.84551209999995\t188.7527618408203\t23.513393253087997\t0.9457229559191445\t0.8154205607476636\n",
138 |       "54\t334.0279252\t185.709531635046\t21.631520986557007\t0.9463949066796555\t0.8808411214953271\n",
139 |       "55\t340.16017209999995\t185.12931755185127\t22.429152816534042\t0.944120984346979\t0.8294392523364486\n",
140 |       "56\t346.29319970000006\t182.88407680392265\t21.58257967233658\t0.9498509697345405\t0.8761682242990654\n",
141 |       "57\t357.14253529999996\t182.04424741864204\t21.475889027118683\t0.9525705991099698\t0.8808411214953271\n",
142 |       "58\t363.8143368\t182.934487760067\t21.883195608854294\t0.949658766355968\t0.8714953271028038\n",
143 |       "59\t370.1603794\t184.17358297109604\t21.290808767080307\t0.9471534804171187\t0.883177570093458\n",
144 |       "60\t376.4684089\t181.42354640364647\t21.597694754600525\t0.949361674452587\t0.9042056074766355\n",
145 |       "61\t382.88365710000005\t187.25566163659096\t21.6785786151886\t0.9452393340270551\t0.866822429906542\n",
146 |       "62\t389.3225355000001\t181.59250125288963\t21.666670441627502\t0.9521657127991676\t0.8598130841121495\n",
147 |       "63\t396.0132287\t179.8839019536972\t22.01644539833069\t0.9538880709367459\t0.852803738317757\n",
148 |       "64\t402.58914460000005\t182.93770709633827\t22.33838379383087\t0.9484886642903003\t0.8504672897196262\n",
149 |       "65\t409.2413564000001\t181.58496183156967\t22.23741576075554\t0.950153319901698\t0.8785046728971962\n",
150 |       "66\t415.7737618\t182.49673774838448\t21.934344708919525\t0.9507491642128103\t0.8785046728971962\n",
151 |       "67\t422.29757770000003\t180.1727076768875\t22.335491836071014\t0.9534794484911551\t0.852803738317757\n",
152 |       "68\t428.7057098\t182.3468733727932\t21.559545934200287\t0.953210031660283\t0.8855140186915887\n",
153 |       "69\t435.1430776000001\t177.6970148384571\t21.813909739255905\t0.9542144984169858\t0.8785046728971962\n",
154 |       "70\t441.59579740000004\t179.4230616092682\t21.47458705306053\t0.9503143888236987\t0.8925233644859814\n",
155 |       "71\t447.9559312\t187.4236896932125\t21.997405976057053\t0.9434714503952\t0.8714953271028038\n",
156 |       "72\t454.36015280000004\t185.64457353949547\t21.80859535932541\t0.9493346912568912\t0.8995327102803738\n",
157 |       "73\t460.8502926\t183.28448390960693\t22.7678345143795\t0.942662231275046\t0.8504672897196262\n",
158 |       "74\t467.7673221\t181.35295176506042\t21.99883532524109\t0.9524445391546925\t0.8317757009345794\n",
159 |       "75\t474.20574880000004\t180.42559936642647\t22.20555028319359\t0.9545189242145815\t0.8621495327102804\n",
160 |       "76\t480.7348668000001\t177.4391260445118\t21.684407979249954\t0.9532330019704651\t0.8878504672897196\n",
161 |       "77\t488.22190610000007\t187.34195244312286\t22.080274641513824\t0.9482340536232203\t0.8738317757009346\n",
162 |       "78\t498.7163915\t184.663908213377\t22.014076620340347\t0.9470819403546837\t0.8691588785046729\n"
163 |      ]
164 |     },
165 |     {
166 |      "name": "stdout",
167 |      "output_type": "stream",
168 |      "text": [
169 |       "79\t507.72747260000006\t178.98830798268318\t21.47413921356201\t0.9541025527486882\t0.8598130841121495\n",
170 |       "80\t516.8672275\t178.3373854458332\t21.658688694238663\t0.9525628500896672\t0.8901869158878505\n",
171 |       "81\t526.2272582\t176.8597036600113\t22.024581998586655\t0.9535130737042532\t0.8644859813084113\n",
172 |       "82\t537.2631751\t177.2030012011528\t21.766023725271225\t0.957143212965218\t0.8691588785046729\n",
173 |       "83\t548.0884053000001\t176.38141465187073\t21.79708757996559\t0.9572966712422787\t0.8714953271028038\n",
174 |       "84\t559.8119843000001\t174.46399101614952\t21.843281388282776\t0.956292066110213\t0.8574766355140186\n",
175 |       "85\t569.1060591\t175.65917918086052\t21.409487038850784\t0.9559244027719352\t0.9018691588785047\n",
176 |       "86\t576.5913633\t176.77976202964783\t21.49503728747368\t0.9538743717758542\t0.8785046728971962\n",
177 |       "87\t584.3266782000001\t179.85141596198082\t21.427332252264023\t0.9514658102154229\t0.9018691588785047\n",
178 |       "88\t595.5166383000001\t178.52282038331032\t21.403560250997543\t0.9539066132353267\t0.8644859813084113\n",
179 |       "89\t605.9234084000001\t177.05544209480286\t21.411171078681946\t0.9535158412115041\t0.9042056074766355\n",
180 |       "90\t613.9635365\t176.31908676028252\t21.366370409727097\t0.955243734363584\t0.8878504672897196\n",
181 |       "91\t623.9597695\t176.20382365584373\t21.893729746341705\t0.9558952055704386\t0.8761682242990654\n",
182 |       "92\t634.0385619\t175.1233125925064\t22.482848435640335\t0.9574447328802002\t0.8621495327102804\n",
183 |       "93\t644.2752579\t176.93210792541504\t21.549375027418137\t0.9527253027652932\t0.8948598130841121\n",
184 |       "94\t652.8900821000001\t173.00296890735626\t21.5932075381279\t0.9579943598202227\t0.8855140186915887\n",
185 |       "95\t667.6088093000001\t179.9282302260399\t23.50808882713318\t0.9537645017379945\t0.8341121495327103\n",
186 |       "96\t676.2838057\t174.56020081043243\t22.648311734199524\t0.9556187315960767\t0.8714953271028038\n",
187 |       "97\t685.9453472\t176.7171704173088\t21.891200184822083\t0.9524900646489693\t0.897196261682243\n",
188 |       "98\t694.2790274\t178.38612964749336\t22.245244562625885\t0.9540746009254544\t0.8808411214953271\n",
189 |       "99\t702.7636145\t177.73075929284096\t24.06598174571991\t0.9527554685943277\t0.897196261682243\n",
190 |       "100\t713.3440201000001\t177.71725061535835\t23.09346652030945\t0.9563492151349435\t0.8434579439252337\n",
191 |       "101\t722.6864448\t173.766254901886\t22.383565932512283\t0.9588301470099851\t0.8785046728971962\n",
192 |       "102\t731.0422685000001\t173.67894527316093\t21.84225881099701\t0.9581018774769188\t0.8855140186915887\n",
193 |       "103\t739.0269975\t175.92625331878662\t22.260210156440735\t0.956732791639914\t0.8691588785046729\n",
194 |       "104\t746.7896567\t174.88757956027985\t21.68474268913269\t0.95813674806828\t0.8901869158878505\n",
195 |       "105\t754.1008847\t176.1177335381508\t22.008816480636597\t0.9573001306263422\t0.8785046728971962\n",
196 |       "106\t761.2548227000001\t174.49301874637604\t21.83292892575264\t0.9588928310492174\t0.897196261682243\n",
197 |       "107\t768.4392893\t172.7093889117241\t21.64242872595787\t0.9596258053446101\t0.8878504672897196\n",
198 |       "108\t775.9572361\t177.16424638032913\t21.55477637052536\t0.9525133117098767\t0.9065420560747663\n",
199 |       "109\t784.4066813000001\t178.4102607667446\t21.33096119761467\t0.9554701164567051\t0.9042056074766355\n",
200 |       "110\t794.4927042\t174.0306807756424\t21.91256058216095\t0.9572584796422166\t0.8714953271028038\n",
201 |       "111\t802.6228819\t174.55561447143555\t21.62583690881729\t0.9593071268846725\t0.9088785046728972\n",
202 |       "112\t810.7530621000001\t172.49658674001694\t22.11896824836731\t0.9582462029800518\t0.8901869158878505\n",
203 |       "113\t818.6795158\t173.48215851187706\t21.262643307447433\t0.9584646976775077\t0.897196261682243\n",
204 |       "114\t827.9049124000001\t174.29942700266838\t21.171885669231415\t0.9568379569154472\t0.9088785046728972\n",
205 |       "115\t837.2002482\t171.870591878891\t21.214154481887817\t0.9590825436712644\t0.9158878504672897\n",
206 |       "116\t845.4472041\t176.74994710087776\t22.71759131550789\t0.9544714614652291\t0.8995327102803738\n",
207 |       "117\t853.6806366000001\t184.73798117041588\t21.845041394233704\t0.9432161478513075\t0.8995327102803738\n",
208 |       "118\t868.1707754\t177.2356958091259\t21.59891825914383\t0.9571596796333606\t0.9112149532710281\n",
209 |       "119\t875.8412158\t176.63044354319572\t21.46969723701477\t0.9547083600859034\t0.9088785046728972\n",
210 |       "120\t884.2734249\t172.92364439368248\t21.97585704922676\t0.9586079161777404\t0.9065420560747663\n",
211 |       "121\t892.1201223\t180.26964315772057\t21.951832473278046\t0.948673395399296\t0.9018691588785047\n",
212 |       "122\t899.5396582999999\t173.81644931435585\t21.459405571222305\t0.9592527453671928\t0.8925233644859814\n",
213 |       "123\t906.7295165\t176.33700492978096\t21.565876573324203\t0.9538397779352181\t0.8761682242990654\n",
214 |       "124\t914.7522263999999\t172.69166892766953\t21.18627032637596\t0.9598193924768083\t0.9112149532710281\n",
215 |       "125\t923.6481627000001\t171.38375091552734\t21.528594940900803\t0.9606258440897115\t0.8995327102803738\n",
216 |       "126\t932.1785648\t171.18208953738213\t22.116858184337616\t0.9604675426749618\t0.8785046728971962\n",
217 |       "127\t939.5199154\t171.14211875200272\t22.178175538778305\t0.9611412923151859\t0.883177570093458\n",
218 |       "128\t946.4239123\t169.92953234910965\t21.887178242206573\t0.9611555449775278\t0.8808411214953271\n",
219 |       "129\t954.5059573000001\t174.96560329198837\t22.6447791159153\t0.9555907797728429\t0.8761682242990654\n",
220 |       "130\t961.8763295000001\t178.70918104052544\t22.300085812807083\t0.9529475335975381\t0.8808411214953271\n",
221 |       "131\t969.1789554\t174.0145247578621\t22.113102048635483\t0.9579599043549494\t0.8808411214953271\n",
222 |       "132\t976.3019077000001\t172.59970355033875\t21.501632899045944\t0.9596043571634159\t0.9018691588785047\n",
223 |       "133\t984.5187846000001\t172.19679167866707\t22.107417851686478\t0.9594648747979719\t0.8808411214953271\n",
224 |       "134\t993.8344443000001\t173.3250037431717\t23.72211918234825\t0.9588380344056502\t0.8341121495327103\n",
225 |       "135\t1002.1266977\t173.15374860167503\t22.544156223535538\t0.9600561527221201\t0.8738317757009346\n",
226 |       "136\t1009.9465580000001\t171.82129180431366\t22.224039256572723\t0.9606888048796689\t0.8738317757009346\n",
227 |       "137\t1018.5052494000001\t171.67953670024872\t22.0196373462677\t0.9615298503332077\t0.8901869158878505\n",
228 |       "138\t1028.1396465\t172.37993958592415\t22.44371086359024\t0.9589339285318929\t0.8808411214953271\n",
229 |       "139\t1037.3752773\t174.99391075968742\t21.463502824306488\t0.9590761784045874\t0.9018691588785047\n",
230 |       "140\t1046.0815085\t174.17848363518715\t21.192844033241272\t0.959691256891093\t0.9042056074766355\n",
231 |       "auc: 0.9042056074766356\n",
232 |       "bacc: 0.9042056074766356\n",
233 |       "pre: 0.9303482587064676\n",
234 |       "rec: 0.8738317757009346\n",
235 |       "f1: 0.9012048192771085\n",
236 |       "mcc: 0.8099069874057296\n",
237 |       "sp: 0.9345794392523364\n",
238 |       "q_: 0.8810572687224669\n",
239 |       "acc: 0.9042056074766355\n"
240 |      ]
241 |     }
242 |    ],
243 |    "source": [
244 |     "\n",
245 |     "\n",
246 |     "tes = train.train('../dataset/data_test.txt', #test set   \n",
247 |     "    radius = 1,        #hops of radius subgraph: 1, 2 \n",
248 |     "    dim = 52,          #dimension of graph convolution layers\n",
249 |     "    layer_hidden = 4,  #Number of graph convolution layers\n",
250 |     "    layer_output = 10, #Number of dense layers\n",
251 |     "    dropout = 0.45,    #drop out rate :0-1\n",
252 |     "    batch_train = 8,   # batch of training set\n",
253 |     "    batch_test = 8,    #batch of test set\n",
254 |     "    lr =3e-4,          #learning rate: 1e-5,1e-4,3e-4, 5e-4, 1e-3, 3e-3,5e-3\n",
255 |     "    lr_decay = 0.85,   #Learning rate decay:0.5, 0.75, 0.85, 0.9\n",
256 |     "    decay_interval = 25,#Number of iterations for learning rate decay:10,25,30,50\n",
257 |     "    iteration = 140,    #Number of iterations \n",
258 |     "    N = 5000,           #length of embedding: 2000,3000,5000,7000 \n",
259 |     "    dataset_train='../dataset/data_train.txt') #training set\n",
260 |     "\n"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "markdown",
265 |    "id": "d4a50302",
266 |    "metadata": {},
267 |    "source": [
268 |     "## Part III: To test the performance of the D-GCAN on independent model"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "markdown",
273 |    "id": "581857d7",
274 |    "metadata": {},
275 |    "source": [
276 |     "We have provided the trained model. And it can be used directly as follow:\n",
277 |     "\n",
278 |     "We test the trained model on bRo5 dataset.\n",
279 |     "\n"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": null,
285 |    "id": "3d3ee166",
286 |    "metadata": {},
287 |    "outputs": [],
288 |    "source": []
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": 4,
293 |    "id": "68722dcd",
294 |    "metadata": {},
295 |    "outputs": [],
296 |    "source": [
297 |     "import predict"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": 5,
303 |    "id": "97d307e5",
304 |    "metadata": {},
305 |    "outputs": [
306 |     {
307 |      "name": "stdout",
308 |      "output_type": "stream",
309 |      "text": [
310 |       "The code uses a GPU!\n",
311 |       "../dataset/bRo5.txt\n",
312 |       "SMILESis error\n",
313 |       "bacc: 0.9580740740740741\n",
314 |       "pre: 0.9696969696969697\n",
315 |       "rec: 0.9481481481481482\n",
316 |       "f1: 0.9588014981273408\n",
317 |       "mcc: 0.9155786319049269\n",
318 |       "sp: 0.968\n",
319 |       "q_: 0.9453125\n",
320 |       "acc: 0.9576923076923077\n"
321 |      ]
322 |     }
323 |    ],
324 |    "source": [
325 |     "test = predict.predict('../dataset/bRo5.txt',\n",
326 |     "    radius = 1,\n",
327 |     "    property = True,   #True if drug-likeness is known \n",
328 |     "    dim = 52 ,\n",
329 |     "    layer_hidden = 4,\n",
330 |     "    layer_output = 10,\n",
331 |     "    dropout = 0.45,\n",
332 |     "    batch_train = 8,\n",
333 |     "    batch_test = 8,\n",
334 |     "    lr = 3e-4,\n",
335 |     "    lr_decay = 0.85,\n",
336 |     "    decay_interval = 25 ,\n",
337 |     "    iteration = 140,\n",
338 |     "    N = 5000)"
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": null,
344 |    "id": "5fe34a0c",
345 |    "metadata": {},
346 |    "outputs": [],
347 |    "source": [
348 |     "Feedbacks would also be appreciated and you can send me an email (jinyusun@csu.edu.cn)!"
349 |    ]
350 |   }
351 |  ],
352 |  "metadata": {
353 |   "kernelspec": {
354 |    "display_name": "Python 3 (ipykernel)",
355 |    "language": "python",
356 |    "name": "python3"
357 |   },
358 |   "language_info": {
359 |    "codemirror_mode": {
360 |     "name": "ipython",
361 |     "version": 3
362 |    },
363 |    "file_extension": ".py",
364 |    "mimetype": "text/x-python",
365 |    "name": "python",
366 |    "nbconvert_exporter": "python",
367 |    "pygments_lexer": "ipython3",
368 |    "version": "3.8.8"
369 |   }
370 |  },
371 |  "nbformat": 4,
372 |  "nbformat_minor": 5
373 | }
374 | 


--------------------------------------------------------------------------------