├── gnn_utils.py ├── gnn_model.py ├── Example2.csv ├── README.md ├── Example1.csv ├── upload.html └── app.py /gnn_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import dgl 3 | import torch 4 | from rdkit import Chem 5 | from rdkit.Chem import Descriptors, AllChem, MACCSkeys, RDKFingerprint 6 | from torch.utils.data import Dataset 7 | def mol_to_graph(mol): 8 | if mol is None: 9 | return None 10 | num_atoms = mol.GetNumAtoms() 11 | g = dgl.graph(([], [])) 12 | g.add_nodes(num_atoms) 13 | 14 | bond_types = [] 15 | for bond in mol.GetBonds(): 16 | start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx() 17 | bond_type = bond.GetBondTypeAsDouble() 18 | bond_types.extend([bond_type, bond_type]) 19 | g.add_edges([start, end], [end, start]) 20 | 21 | h_feats = [atom.GetAtomicNum() for atom in mol.GetAtoms()] 22 | g.ndata['h'] = torch.tensor(h_feats).unsqueeze(1).float() 23 | g.edata['e'] = torch.tensor(bond_types).unsqueeze(1).float() 24 | 25 | return g 26 | 27 | 28 | class MoleculeDataset(Dataset): 29 | def __init__(self, smiles_list, labels): 30 | self.smiles_list = smiles_list 31 | self.labels = labels 32 | 33 | def __len__(self): 34 | return len(self.smiles_list) 35 | 36 | def __getitem__(self, idx): 37 | smiles = self.smiles_list[idx] 38 | mol = Chem.MolFromSmiles(smiles) 39 | graph = mol_to_graph(mol) 40 | 41 | if graph is None: 42 | return None, None, None 43 | 44 | # Extract Molecular Descriptors and Fingerprints 45 | descriptor_names = [ 46 | "MolWt", 47 | "TPSA", 48 | "NumHDonors", 49 | "NumHAcceptors", 50 | "MolLogP", 51 | "NumRotatableBonds", 52 | ] 53 | descriptors = [ 54 | Descriptors.MolWt(mol), 55 | Descriptors.TPSA(mol), 56 | Descriptors.NumHDonors(mol), 57 | Descriptors.NumHAcceptors(mol), 58 | Descriptors.MolLogP(mol), 59 | Descriptors.NumRotatableBonds(mol), 60 | ] 61 | fingerprints = [ 62 | int(bit) for fp in [ 63 | AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=512).ToBitString(), 64 | MACCSkeys.GenMACCSKeys(mol).ToBitString(), 65 | RDKFingerprint(mol, fpSize=512).ToBitString() 66 | ] for bit in fp 67 | ] 68 | fingerprint_names = [f"Fingerprint_{i}" for i in range(len(fingerprints))] 69 | 70 | feature_names = descriptor_names + fingerprint_names 71 | 72 | #print("Feature names:") 73 | #print(feature_names) 74 | 75 | features = torch.tensor(descriptors + fingerprints).float() 76 | label = self.labels[idx] 77 | 78 | return graph, features, label 79 | 80 | 81 | def collate(samples): 82 | valid_samples = [s for s in samples if s[0] is not None] 83 | graphs, features, labels = map(list, zip(*valid_samples)) 84 | batched_graph = dgl.batch(graphs) 85 | return batched_graph, torch.stack(features), torch.tensor(labels) -------------------------------------------------------------------------------- /gnn_model.py: -------------------------------------------------------------------------------- 1 | import dgl 2 | import torch 3 | import torch.nn as nn 4 | from rdkit import Chem 5 | from rdkit.Chem import Descriptors, AllChem, MACCSkeys, RDKFingerprint 6 | import pandas as pd 7 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 8 | from torch.utils.data import Dataset, DataLoader 9 | from sklearn.model_selection import StratifiedShuffleSplit 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | from sklearn.metrics import roc_auc_score, roc_curve 13 | 14 | # GNN model definition 15 | class GNNLayer(nn.Module): 16 | def __init__(self, in_feats, out_feats): 17 | super(GNNLayer, 
self).__init__() 18 | self.linear = nn.Linear(in_feats, out_feats) 19 | self.bn = nn.BatchNorm1d(out_feats) 20 | self.residual = (in_feats == out_feats) 21 | 22 | def forward(self, g, h, e): 23 | g.ndata['h'] = h 24 | g.edata['e'] = e 25 | g.update_all(dgl.function.u_mul_e('h', 'e', 'm'), dgl.function.mean('m', 'h')) 26 | h = self.linear(g.ndata['h']) 27 | h = self.bn(h) 28 | if self.residual: 29 | h += g.ndata['h'] 30 | return h 31 | 32 | 33 | class GNN(nn.Module): 34 | def __init__(self, in_feats, hidden_size, num_classes, dropout=0.5, feature_size=1197): 35 | super(GNN, self).__init__() 36 | self.gnn1 = GNNLayer(in_feats, hidden_size) 37 | self.gnn2 = GNNLayer(hidden_size, hidden_size) 38 | self.gnn3 = GNNLayer(hidden_size, hidden_size) 39 | self.dropout = nn.Dropout(dropout) 40 | self.fc_feature = nn.Linear(feature_size, hidden_size) 41 | self.fc_combine = nn.Linear(hidden_size * 2, hidden_size) 42 | self.fc = nn.Linear(hidden_size, num_classes) 43 | 44 | def forward(self, g, features): 45 | h = g.ndata['h'] 46 | e = g.edata['e'] 47 | h = self.gnn1(g, h, e) 48 | h = self.gnn2(g, h, e) 49 | h = self.gnn3(g, h, e) 50 | h = self.dropout(h) 51 | g.ndata['h'] = h 52 | h_agg = dgl.mean_nodes(g, 'h') 53 | features_out = torch.relu(self.fc_feature(features)) 54 | combined = torch.cat((h_agg, features_out), dim=1) 55 | combined = torch.relu(self.fc_combine(combined)) 56 | return self.fc(combined) 57 | def get_features(self, g, features): 58 | h = g.ndata['h'] 59 | e = g.edata['e'] 60 | h = self.gnn1(g, h, e) 61 | h = self.gnn2(g, h, e) 62 | h = self.gnn3(g, h, e) 63 | h = self.dropout(h) 64 | g.ndata['h'] = h 65 | h_agg = dgl.mean_nodes(g, 'h') 66 | features_out = torch.relu(self.fc_feature(features)) 67 | combined = torch.cat((h_agg, features_out), dim=1) 68 | combined = torch.relu(self.fc_combine(combined)) 69 | return combined 70 | 71 | class GNNLayer(nn.Module): 72 | def __init__(self, in_feats, out_feats): 73 | super(GNNLayer, self).__init__() 74 | self.linear1 = nn.Linear(in_feats, out_feats) 75 | self.linear2 = nn.Linear(out_feats, out_feats) 76 | self.bn = nn.BatchNorm1d(out_feats) 77 | self.residual = (in_feats == out_feats) 78 | 79 | def forward(self, g, h, e): 80 | g.ndata['h'] = h 81 | g.edata['e'] = e 82 | g.update_all(dgl.function.u_mul_e('h', 'e', 'm'), dgl.function.mean('m', 'h')) 83 | h = torch.relu(self.linear1(g.ndata['h'])) 84 | h = torch.relu(self.linear2(h)) 85 | h = self.bn(h) 86 | if self.residual: 87 | h += g.ndata['h'] 88 | return h 89 | -------------------------------------------------------------------------------- /Example2.csv: -------------------------------------------------------------------------------- 1 | SMILES 2 | s1ncc2c3c(n(OC)c12)cccc3 3 | s1nc2cc(ccc2n1)C(=O)NC(=S)Nc1ccc(cc1)C(C)C 4 | s1nc(C=2C[NH+](CCC=2)C)c(SCCCF)n1 5 | s1cc(nc1C(C#N)=CNc1ccc(F)cc1F)-c1ccc(OC)cc1 6 | s1c[n+](Cc2cnc(nc2N)C)c(C)c1CCO 7 | s1c2c(nc1CN1CCN(CC1)C(=O)[C@@H]1NC(=O)CC1)cccc2 8 | s1c2c(nc1CN1CCN(CC1)C(=O)[C@@H]1NC(=O)CC1)cccc2 9 | s1c2N(S(=O)(=O)C(=CNc3ncccn3)C(=O)c2c(C)c1C)C 10 | s1c(ccc1-c1sccc1)-c1sccc1 11 | s1c(C)c(cc1C)C(=O)\C=C\c1cc(O)ccc1 12 | s1c(C)c(cc1C(OCC(=O)Nc1c(C)c(ccc1[N+](=O)[O-])C)=O)CCC 13 | s1c(C(=O)\C=C\c2cc(O)ccc2)c(nc1C(=S)N)C 14 | s1c(-c2ccccc2)c(Cc2ccccc2)c(C(=O)[O-])c1N 15 | o1nc(OCc2ccccc2)cc1CO 16 | o1nc(CC)c(n1)N 17 | o1cccc1\C=C(/C)\C1=NC2CC3(C4OCC2C1C4O)c1c(N(OC)C3=O)cccc1 18 | o1cccc1CO 19 | o1cccc1C=O 20 | o1cccc1C(Oc1ccc(cc1OCC)C=C1C(=NN(C1=O)c1ccccc1)C)=O 21 | o1cccc1C(N(C(=O)Cn1nnc2c1cccc2)c1c2c(ccc1)cccc2)C(=O)NC(CC)(C)C 22 | 
o1cccc1-c1nc(cc2c1[nH]c1c2cccc1)C(O)=O 23 | o1ccc(CO)c1-c1nccc2c1[nH]c1c2cccc1 24 | o1cc(cc1)[C@H]1[NH+]2[C@@H](CC[C@@H](C2)C)[C@@H](CC1)C 25 | o1cc(cc1)[C@H]1[C@]2([C@@]3(O[C@@H]3C1=O)[C@]1([C@H](CC2)[C@@]2([C@@H](C[C@H]1OC(=O)C)C(C)(C)C(=O)C=C2)C)C)C 26 | o1cc(cc1)[C@H]1[C@]2(C(=CC1=O)[C@]1([C@H](CC2)[C@@]2([C@H](C[C@H]1OC(=O)C)C(C)(C)C(=O)C=C2)C)C)C 27 | o1cc(cc1)[C@H]1[C@]2(C(=CC1=O)[C@]1([C@H](CC2)[C@@]2([C@@H](C[C@H]1OC(=O)C)C(C)(C)C(=O)C=C2)C)C)C 28 | o1cc(cc1)[C@@H]1C[C@H]2O[C@@]23[C@@]1(C)[C@@H](OC(=O)C)[C@@H](OC(=O)C)[C@H]([C@]1(C=CC(=O)C(C)(C)[C@@H]1CC(OC)=O)C)C3=C 29 | o1cc(cc1)[C@@H]1CC=C2[C@]1(CC[C@H]1[C@@]2(C)[C@H](OC(=O)C)C[C@@H]2[C@@]1(C=CC(=O)C2(C)C)C)C 30 | o1cc(cc1)C=1CC[C@H](C)C=1C(=O)\C=C(\C)/C 31 | o1cc(cc1)C1C2(C(=CC1=O)C1(C(CC2)C2(C(CC1OC(=O)C)C(C)(C)C(=O)C=C2)C)C)C 32 | o1cc(cc1)C(=O)CCC(O)C 33 | o1cc(c2c1CC(C=C)(C)C(C(C)=C)C2=O)C 34 | o1cc(c2c1-c1c(c3c(cc1)c(ccc3)C)C(=O)C2=O)C 35 | o1cc(c2c1-c1c(c3c(cc1)c(ccc3)C)C(=O)C2=O)C 36 | o1cc(c2c1-c1c(c3c(cc1)c(ccc3)C)C(=O)C2=O)C 37 | o1cc(c2c1-c1c(c3c(cc1)C(CCC3)(C)C)C(=O)C2=O)C 38 | o1c2nc3c(cccc3OC)c(OC)c2cc1 39 | o1c2nc3c(cccc3O)c(OC)c2cc1 40 | o1c2nc3c(ccc(OC\C=C(\CCC(O)C(O)(C)C)/C)c3OC)c(OC)c2cc1 41 | o1c2nc3c(ccc(OC)c3OC)c(OC)c2cc1 42 | o1c2nc3c(ccc(OC)c3OC)c(OC)c2cc1 43 | o1c2nc3c(ccc(OC)c3)c(OC)c2cc1 44 | o1c2nc3c(cc(OC)cc3)c(OC)c2cc1 45 | o1c2nc3c(c(OC)cc(OC)c3OC)c(OC)c2cc1 46 | o1c2nc3c(CC[C@@H]([O-])[C@@]3(OC)C\C=C(\C)/C)c(OC)c2cc1 47 | o1c2nc3c(CCC(O)C3(OC)CC=C(C)C)c(OC)c2cc1.O=C([O-])c1ccc[n+](c1)C 48 | o1c2cc(O)ccc2cc1-c1ccc(OC)cc1O 49 | o1c2cc(O)cc3[C@@H]4c5c([C@H]6c7c(-c(c23)c1-c1ccc(O)cc1)cc(O)cc7O[C@@H]6c1ccc(O)cc1)cc(O)cc5O[C@H]4c1ccc(O)cc1 50 | o1c2cc(O)cc3[C@@H]4c5c([C@@H]6c7c(-c(c23)c1-c1ccc([O-])cc1)cc(O)cc7O[C@H]6c1ccc([O-])cc1)cc(O)cc5O[C@H]4c1ccc([O-])cc1 51 | o1c2cc(O)c3c(N(c4c(cccc4)C3=O)C)c2cc1 52 | o1c2cc(O)c3c(N(c4c(ccc(O)c4OC)C3=O)C)c2cc1 53 | o1c2c3c(O[C@H](CC3=O)c3ccc(O)cc3)cc(O)c2c(C(=O)c2ccc(O)cc2O)c1-c1ccc(O)cc1 54 | o1c2c(occ2)cc1 55 | o1c2c(cc1[C@H](O)C)C(=O)c1c(cccc1)C2=O 56 | o1c2c(cc1C)cccc2 57 | o1c2c(cc1C)C(=O)C=C(OC)C2=O 58 | o1c2c(cc1C(O)C)C(=O)c1c(cccc1)C2=O 59 | o1c2c(cc1-c1ccc(O)cc1O)c(OC)c(C\C=C(\C)/C)c(OC)c2 60 | o1c2c(cc1)c(OC)c1c(cccc1)c2OC 61 | o1c2c(cc1)c(OC)c(cc2)C(=O)\C=C\c1ccc(cc1)C 62 | o1c2c(cc1)c(OC)c(cc2)C(=O)C(C(=O)c1ccccc1)C(C=C)(C)C 63 | o1c2c(cc1)c(C(OC)=O)c(O)c1c2cccc1 64 | o1c2c(cc1)C(=O)c1c(ccc(O)c1)C2=O 65 | o1c2c(cc(cc2OC)\C=C\C)c(C)c1-c1cc(OC)c(O)c(O)c1 66 | o1c2c(cc(cc2OC)\C=C\C)c(C)c1-c1cc(OC)c(O)c(O)c1 67 | o1c2c(cc(cc2OC)\C=C\C)c(C)c1-c1cc(OC)c(O)c(O)c1 68 | o1c2c(cc(cc2OC)C=O)c(C)c1-c1cc(OC)c(O)c(O)c1 69 | o1c2c(cc(cc2OC)C=CC)c(C)c1-c1cc(OC)c(O)c(O)c1 70 | o1c2c(c3c1cccc3)c(OC)c(OC)c(O)c2OC 71 | o1c2c(c3c1cccc3)c(OC)c(O)c(OC)c2OC 72 | o1c2c(c3c1c1c(cccc1)c(O)c3OC)C(=O)c1c(cccc1)C2=O 73 | o1c2c(c3c1c(C(=O)CC(C)C)c(O)c(C)c3O)c(O)c(C)c(O)c2C(=O)C(CC)C 74 | o1c2c(c3[nH]c4cc(O)ccc4c3cc2C=O)cc1 75 | o1c2c(c3O[C@@H]([C@@H]([O-])Cc3c(O)c2)c2ccc([O-])cc2)c(C(=O)c2c(O)cc(O)cc2O)c1-c1ccc([O-])cc1 76 | o1c2c(c3O[C@@H]([C@@H](O)Cc3c(O)c2)c2ccc(O)cc2)c(C(=O)c2c(O)cc(O)cc2O)c1-c1ccc(O)cc1 77 | o1c2c(c3O[C@@H](CCc3c(O)c2)c2ccc([O-])cc2)c(C(=O)c2c(O)cc(O)cc2O)c1-c1ccc([O-])cc1 78 | o1c2c(c3O[C@@H](CCc3c(O)c2)c2ccc(O)cc2)c(C(=O)c2c(O)cc(O)cc2O)c1-c1ccc(O)cc1 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VirtuDockDL: A Deep Learning-based Python Pipeline for Virtual Screening 2 | 3 | VirtuDockDL is a comprehensive solution for 
streamlining the process of drug discovery and molecular analysis. With VirtuDockDL, you can harness the power of deep learning to perform virtual screening, evaluate molecular activities, and predict binding affinities with unprecedented accuracy and speed.
4 |
5 | ## Features
6 |
7 | - **Graph Neural Network-Based Ligand Prioritization:** Streamline drug discovery with our GNN model, prioritizing ligands for speed and accuracy.
8 | - **Descriptor Analysis:** Analyze molecular descriptors to predict pharmacological profiles and drug-likeness.
9 | - **Re-screening:** Refine your ligand search iteratively, utilizing new data for targeted identification.
10 | - **Protein Refinement:** Refine protein structures by uploading PDB files.
11 | - **Molecular Docking:** Predict ligand interactions with state-of-the-art simulations, focusing on optimal compounds.
12 | - **Scalable Data Processing:** Efficiently process and analyze data across all scales, ensuring fast, reliable drug discovery results.
13 |
14 | ## Installation
15 |
16 | ### Prerequisites
17 | - Python 3.8 or higher
18 | - [PyTorch](https://pytorch.org/)
19 | - [RDKit](https://www.rdkit.org/)
20 | - [OpenMM](https://openmm.org/)
21 | - [Flask](https://flask.palletsprojects.com/)
22 |
23 | ### Installing Required Libraries
24 | ```sh
25 | pip install flask torch torchvision torchaudio
26 | pip install pandas numpy scikit-learn matplotlib rdkit biopython dgl
27 | pip install openmm
28 | ```
29 |
30 | # Usage
31 | ## Running the Application
32 | Clone the repository:
33 | ```sh
34 | git clone https://github.com/yourusername/VirtuDockDL.git
35 | cd VirtuDockDL
36 | ```
37 | Set up your environment and install the dependencies listed above.
38 |
39 | Run the Flask application:
40 | ```sh
41 | python app.py
42 | ```
43 | Open your web browser and navigate to http://127.0.0.1:5000 to access VirtuDockDL.
44 |
45 | # Uploading Files
46 | The form snippets in this section are illustrative sketches of the corresponding markup in `upload.html`; endpoint names not shown in `app.py` are placeholders.
47 | ## Upload CSV File for Ligand Prioritization
48 | Navigate to the Ligand Prioritization tab and upload a CSV file with `SMILES` and `Activity` columns describing your active and inactive molecules.
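For reference, the first few rows of `Example1.csv` show the expected layout:

```
SMILES,Activity
[C-]#[S+],1
C(C(=O)O)NC(=O)C(=O)O,1
CC(=O)N[C@@H](CS)C(=O)[O-],0
```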
49 | ```html
50 | <form action="/" method="post" enctype="multipart/form-data">
51 |   <input type="file" name="file" accept=".csv">
52 |   <input type="submit" value="Upload CSV">
53 | </form>
54 | ```
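If you prefer to drive the server from a script rather than the browser, the same `/` route in `app.py` accepts the `file` field the form posts. A minimal sketch, assuming the development server started by `python app.py` is listening on port 5000:

```python
# Upload a ligand CSV to a locally running VirtuDockDL instance.
import requests

with open("Example1.csv", "rb") as fh:
    response = requests.post("http://127.0.0.1:5000/", files={"file": fh})
print(response.status_code)
```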
55 | ## Upload PDB File for Protein Refinement
56 | Navigate to the Protein Refinement tab and upload your PDB file.
57 | ```html
58 | <form action="/refine_protein" method="post" enctype="multipart/form-data">
59 |   <input type="file" name="file" accept=".pdb">
60 |   <input type="submit" value="Upload PDB">
61 | </form>
62 | ```
63 | ## Upload ZIP File for Molecular Docking
64 | Navigate to the Molecular Docking tab and upload your ZIP file containing ligand structures.
65 | ```html
66 | <form action="/dock" method="post" enctype="multipart/form-data">
67 |   <input type="file" name="protein_file" accept=".pdb">
68 |   <input type="file" name="ligand_file" accept=".zip">
69 |   <!-- additional docking-parameter fields appear in upload.html -->
70 |   <input type="submit" value="Start Docking">
71 | </form>
72 | ```
73 |
74 | # Main Functionalities
75 | ## Ligand Prioritization
76 | ```python
77 | def predict():
78 |     smiles = request.form['smiles']  # assumes a valid SMILES string
79 |     model = GNN(1, 64, 2)
80 |     model.load_state_dict(torch.load('best_model.pth'))
81 |     model.eval()
82 |
83 |     # Build the DGL graph plus the descriptor/fingerprint vector the model expects
84 |     graph, features, _ = MoleculeDataset([smiles], [0])[0]  # dummy label, unused here
85 |     with torch.no_grad():
86 |         output = model(dgl.batch([graph]), features.unsqueeze(0))
87 |         activity = torch.softmax(output, dim=1)[0, 1]  # class-1 (active) probability
88 |
89 |     return jsonify({'activity': activity.item()})
90 | ```
91 | A fuller, scripted version of this pipeline is sketched in the appendix at the end of this README.
92 | ## Protein Refinement
93 | ```python
94 | def refine_protein():
95 |     file = request.files['file']
96 |     pdb = PDBFile(file)
97 |     forcefield = ForceField('amber99sb.xml')
98 |     modeller = Modeller(pdb.topology, pdb.positions)
99 |     modeller.addHydrogens(forcefield)
100 |     system = forcefield.createSystem(modeller.topology)
101 |     integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picosecond)
102 |     simulation = Simulation(modeller.topology, system, integrator)
103 |     simulation.context.setPositions(modeller.positions)
104 |     simulation.minimizeEnergy()
105 |
106 |     refined_file = 'refined_protein.pdb'
107 |     with open(refined_file, 'w') as f:
108 |         PDBFile.writeFile(simulation.topology, simulation.context.getState(getPositions=True).getPositions(), f)
109 |
110 |     return send_file(refined_file, as_attachment=True)
111 | ```
112 | ## Molecular Docking
113 | ```python
114 | def dock():
115 |     protein_file = request.files['protein_file']
116 |     ligand_file = request.files['ligand_file']
117 |     output_dir = 'docking_results'
118 |     os.makedirs(output_dir, exist_ok=True)  # ensure the output directory exists
119 |
120 |     protein_path = f"{output_dir}/protein.pdb"
121 |     ligand_path = f"{output_dir}/ligand.pdbqt"
122 |     protein_file.save(protein_path)
123 |     ligand_file.save(ligand_path)
124 |
125 |     subprocess.run(['vina', '--receptor', protein_path, '--ligand', ligand_path, '--out', f'{output_dir}/out.pdbqt'])
126 |
127 |     return send_file(f'{output_dir}/out.pdbqt', as_attachment=True)
128 | ```
129 | # Tips for Success
130 |
131 | - Ensure your input files are correctly formatted and contain all necessary information.
132 | - Utilize the "De Novo Molecule Generation" feature to explore new ligands based on specified criteria.
133 | - Take advantage of our re-screening feature to iteratively refine your search for the optimal ligand.
134 |
135 | # Contributing
136 | We welcome contributions! Please fork the repository and submit pull requests for any enhancements or bug fixes.
137 |
138 | # Contact Us
139 | If you have any inquiries or encounter any issues, we encourage you to open an issue in our GitHub repository. For direct assistance or detailed inquiries, please feel free to reach out to our team:
140 | - Ms. Fatima Noor: [fatima.noor@imbb.uol.edu.pk](mailto:fatima.noor@imbb.uol.edu.pk)
141 | - Dr. Muhammad Tahir ul Qamar: [tahirulqamar@gcuf.edu.pk](mailto:tahirulqamar@gcuf.edu.pk)
142 |
143 | We are dedicated to supporting our community and enhancing the project with your valuable feedback.
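# Appendix: Scripted Ligand Prioritization

The web route shown under "Ligand Prioritization" is a thin wrapper around the repository's own modules. The sketch below mirrors the training loop in `app.py` (same model size, optimizer, and collate function); the file name `Example1.csv` and the five-epoch loop are illustrative stand-ins for your own dataset and for app.py's 50-epoch loop with early stopping.

```python
# Minimal offline sketch of the ligand-prioritization pipeline,
# mirroring the training loop in app.py.
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from gnn_model import GNN
from gnn_utils import MoleculeDataset, collate

data = pd.read_csv("Example1.csv")
dataset = MoleculeDataset(data["SMILES"].tolist(), data["Activity"].astype(int).tolist())
loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate)

model = GNN(1, 64, 2)  # node feature size 1, hidden size 64, two classes (as in app.py)
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

model.train()
for epoch in range(5):  # a few epochs for illustration
    for graphs, feats, labels in loader:
        optimizer.zero_grad()
        loss = criterion(model(graphs, feats), labels)
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), "best_model.pth")
```

After training, `best_model.pth` is the checkpoint that both the `/` and `/rescreening` routes load.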
146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /Example1.csv: -------------------------------------------------------------------------------- 1 | SMILES,Activity 2 | [C-]#[S+],1 3 | [H]/N=C(\N)/SCC(=O)N(C)[C@@H]1CCS(=O)(=O)C1,0 4 | [Li+].[Cl-],1 5 | [Li+].[F-],1 6 | [Li+].[Li+].C(=O)([O-])[O-],1 7 | C(C(=O)O)NC(=O)C(=O)O,1 8 | C(C(C1C(=C(C(=O)O1)O)O)O)O,1 9 | C(C1C(C(C(O1)(CO)OP(=O)(O)O)O)O)OP(=O)(O)O,1 10 | C(CCNCCCN)CN,1 11 | C=CCNC1=NC=NC2=C1C=C(C=C2)Br,1 12 | C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])O,1 13 | C1=CC(=C(C=C1C=CC(=O)O)O)O,1 14 | C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,1 15 | C1=CC(=C(C=C1C2=C(C(=O)C3=C(O2)C=C(C=C3)O)O)O)O,1 16 | C1=CC(=C(C=C1C2=CC(=O)C3=C(C=C(C=C3O2)O)O)O)O,1 17 | C1=CC(=C(C=C1Cl)C2=NC3=NC=CN=C3C(=N2)NC4=CC=NC=C4)F,1 18 | C1=CC(=C(C=C1I)F)NC2=C(C=CC(=C2F)F)C(=O)NOCC(CO)O,1 19 | C1=CC(=CC=C1C(=O)NC2=CC3=C(C=C2)NC=C3)F,1 20 | C1=CC(=CC=C1C=CC2=CC(=CC(=C2)O)O)O,1 21 | C1=CC(=CC=C1C2=CC(=O)C3=C(C=C(C=C3O2)O)O)O,1 22 | C1=CC(=CC=C1C2=NC(=C(N2)C3=CC=NC=C3)C4=CC=C(C=C4)F)[N+](=O)[O-],1 23 | C1=CC(=CN=C1)C(=O)N,1 24 | C1=CC=C(C=C1)C(=CC(=O)O)CCC2=CC=C(C=C2)Cl,1 25 | C1=CC=C(C=C1)C(=O)NC2=CC3=C(C=C2)NC=C3,1 26 | C1=CC=C(C=C1)C(=O)NC2=NC=C3C(=C2)C=CN3,1 27 | C1=CC=C(C=C1)C2=CC(=O)C3=C(O2)C=C(C=C3)O,1 28 | C1=CC=C(C=C1)CNC(=O)C2=CSC(=N2)NC3=NC=NC=C3,1 29 | C1=CC=C(C=C1)N2C3=NC=NC(=C3C=N2)N,1 30 | C1=CC=C(C=C1)NC(=O)CCCCCCC(=O)NO,1 31 | C1=CC=C2C(=C1)C(=CC=N2)C3=C(NN=C3)C4=CC=CC=N4,1 32 | C1=CC=C2C(=C1)C(=CN2)C=CC(=O)NC3=CC=CC(=C3)C(=O)N,1 33 | C1=CC=C2C(=C1)C(=O)N(C2=O)C(CC3=CNC4=CC=CC=C43)C(=O)O,1 34 | C1=CC=C2C(=C1)C(=O)OC23C4=CC(=C(C(=C4OC5=C(C(=C(C=C35)Br)[O-])Br)Br)[O-])Br.[Na+].[Na+],1 35 | C1=CC=C2C(=C1)C=CC(=C2C=NNC(=O)C3=CC=NC=C3)O,1 36 | C1=CC=C2C(=C1)C3=NNC4=CC=CC(=C43)C2=O,1 37 | C1=CC2=C3C(=C1)C(=O)N(C(=O)C3=CC=C2)CCCCCC(=O)NO,1 38 | C1=CN(C(=O)N=C1)C2C(C(C(O2)CO)O)O,1 39 | C1=CN(C2=NC=NC(=C21)N)C3C(C(C(O3)CO)O)O,1 40 | C1=CN=C(C2=C1N(C=N2)C3C=C(C(C3O)O)CO)N,1 41 | C1=NC(=NC(=O)N1C2C(C(C(O2)CO)O)O)N,1 42 | C1=NC2=C(N=C(N=C2N1C3C(C(C(O3)CO)O)O)Cl)N,1 43 | C1C(C(C(CC1(C(=O)O)O)OC(=O)C=CC2=CC(=C(C=C2)O)O)O)O,1 44 | C1C(C(OC1N2C=C(C(=O)NC2=O)Br)CO)O,1 45 | C1C(C(OC1N2C=NC(=NC2=O)N)CO)O,1 46 | C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C(=C3)O)O)O)OC(=O)C4=CC(=C(C(=C4)O)O)O,1 47 | C1C(C1N)C2=CC=CC=C2,1 48 | C1C(C1N)C2=CC=CC=C2,1 49 | c1c(cc(cc1Sc2c3c([n-]cn3)ncn2)S(=O)(=O)N)N,0 50 | c1c(cn[nH]1)CNC(=O)CC(=O)O,0 51 | c1c(cn2c(c(nc2c1Cl)C3CC3)N)Cl,0 52 | c1c(n[nH]n1)c2nnnn2CC[NH3+],0 53 | c1c(nc(s1)N=C(N)N)CSCC/C(=N/S(=O)(=O)N)/[O-],0 54 | c1c(nc(s1)N2CC[C@H]3CCCC[C@@H]3C2)CN,0 55 | C1C2=C(C3=CC=CC=C3NC1=O)NC4=C2C=C(C=C4)[N+](=O)[O-],1 56 | C1C2=C(C3=CC=CC=C3NC1=O)NC4=C2C=C(C=C4)Br,1 57 | C1C2C(C(C(O2)N3C4=NC=NC(=C4N=C3Br)N)O)OP(=O)(O1)O,1 58 | C1CCC(CC1)N2C=NC3=C(N=C(N=C32)OC4=CC=CC5=CC=CC=C54)NC6=CC=C(C=C6)N7CCOCC7,1 59 | C1CCC(CC1)NC2=NC(=NC3=C2NC=N3)NC4=CC=C(C=C4)N5CCOCC5,1 60 | C1CCN(CC1)CCOC2=CC=C(C=C2)C3=CN4C(=C(C=N4)C5=CC=NC=C5)N=C3,1 61 | C1CN(CCN1)C2=CC=C(C=C2)C3=CN4C(=C(C=N4)C5=CC=NC6=CC=CC=C56)N=C3,1 62 | C1CN(CCN1)CC2=CSC3=NC(=CN23)C4=CC=CC=C4NC(=O)C5=NC6=CC=CC=C6N=C5.Cl,1 63 | C1CNCCN(C1)S(=O)(=O)C2=CC=CC3=C2C=CC=C3I.Cl,1 64 | C1COC2=C(O1)C=CC(=C2)C3=C(NC(=N3)C4=CC=C(C=C4)C(=O)N)C5=CC=CC=N5,1 65 | C1COCCC1NC(=O)C2=CC=C(C=C2)C3=NC=CC(=C3)C4=C(NN=C4)C5=CC=CC=N5,1 66 | C1COCCN1C2=CC(=O)C3=C(O2)C(=CC=C3)C4=CC=CC=C4,1 67 | C1OC2=C(O1)C=C(C=C2)C3=C(NC(=N3)C4=CC=C(C=C4)C(=O)N)C5=CC=CC=N5,1 68 | CC(=CC1=CC=C(C=C1)C(=O)O)C2=CC3=C(C=C2)C(CCC3(C)C)(C)C,1 69 | 
CC(=O)c1ccc(NC(=O)c2ccc(OC(C)C)cc2)cc1C,0 70 | CC(=O)c1ccc(NC(=O)c2cccc(OCc3ccccc3)c2)cc1,0 71 | CC(=O)c1ccc(NC(=O)CCc2ccc([N+](=O)[O-])cc2)cc1,0 72 | CC(=O)c1ccc(NC(=O)CCNC(=O)C2CC2)cc1,0 73 | CC(=O)c1ccc(NC(=O)CCNC(=O)c2ccccc2)cc1,0 74 | CC(=O)c1ccc(NC(=O)Nc2ccccc2C(=O)N(C)C)cc1,0 75 | CC(=O)c1ccc(OCC(=O)N(C)Cc2ccsc2)cc1,0 76 | CC(=O)c1ccc(S(=O)(=O)Nc2cccc(C(N)=O)c2)cc1,0 77 | CC(=O)c1cccc(NC(=O)CCc2ccc(O)cc2)c1,0 78 | CC(=O)N(C)C1CCCN(C(=O)C(C)Oc2ccccc2)C1,0 79 | CC(=O)N(C)Cc1ccc(C(=O)Nc2ccccc2C(C)C)s1,0 80 | CC(=O)N(Cc1ccc(C)o1)c1nc(-c2ccsc2)cs1,0 81 | CC(=O)N(O)Cc1cccc(NC(=O)c2cccc(-n3cncn3)c2)c1,0 82 | CC(=O)N[C@@H](CS)C(=O)[O-],0 83 | CC(=O)N1CCC(C(=O)Nc2ccc(S(=O)(=O)N(C)C)cc2)CC1,0 84 | CC(=O)N1CCC(C(=O)NCc2ccc(CC(C)C)cc2)CC1,0 85 | CC(=O)N1CCC(Oc2ccc(C#N)cc2)CC1C,0 86 | CC(=O)N1CCC(Oc2cccc(C(F)(F)F)c2)CC1,0 87 | CC(=O)N1CCc2cc(C(=O)N3CCC(Oc4ccccc4)CC3)ccc21,0 88 | CC(=O)N1CCc2cc(NC(=O)c3cnc(C)cc3C)ccc21,0 89 | CC(=O)N1CCc2cc(NS(=O)(=O)c3ccc(C)c(C)c3)ccc21,0 90 | CC(=O)N1CCc2cc(S(=O)(=O)N3CCCCCC3)ccc21,0 91 | CC(=O)N1CCN(C(=O)c2ccc(S(C)(=O)=O)cc2)CC1,0 92 | CC(=O)N1CCN(C(=O)c2cccc(OCc3cscn3)c2)CC1,0 93 | CC(=O)N1CCN(C(=O)CCc2nc(-c3cccnc3)no2)CC1,0 94 | CC(=O)N1CCN(C(=O)COc2ccc3ccccc3c2)CC1,0 95 | CC(=O)N1CCN(c2nnc(SCc3ccccc3)s2)CC1,0 96 | CC(=O)NC(C(=O)N1CCCC1C(=O)NCCN1CCCC1)c1ccccc1,0 97 | CC(=O)NC(C(=O)NC(C)c1ccc(O)cc1)c1ccccc1,0 98 | CC(=O)NC(CS)C(=O)O,1 99 | CC(=O)Nc1cc(C(=O)N(C)CC(F)(F)F)ccc1C,0 100 | CC(=O)Nc1cc(C(=O)N(C)Cc2ccccn2)ccc1C,0 101 | CC(=O)Nc1cc(C(=O)N2CCCC(N3CCCC3=O)C2)ccc1F,0 102 | CC(=O)Nc1cc(NC(=O)c2ccc(NC(=O)c3ccccc3)cc2)ccc1Cl,0 103 | CC(=O)Nc1cc(NC(=O)c2cccc(C(F)(F)F)c2)ccc1F,0 104 | CC(=O)Nc1cc(NC(=O)c2nc3ccccc3s2)ccc1Cl,0 105 | CC(=O)OC1C(C2C(CCC(C2(C3(C1(OC(CC3=O)(C)C=C)C)O)C)O)(C)C)O,1 106 | CC(C(=O)NC(C1=CC=CC=C1)C(=O)OC(C)(C)C)NC(=O)CC2=CC(=CC(=C2)F)F,1 107 | CC(C(=O)NC1C(=O)N(C2=CC=CC=C2C(=N1)C3=CC=CC=C3)C)NC(=O)CC4=CC(=CC(=C4)F)F,1 108 | CC(C)(C)C1=CC(=CC(=C1)C(=O)C=CC2=CC=C(C=C2)C(=O)O)C(C)(C)C,1 109 | CC(C)(C)C1=NC2=C(N1)C3=C(C=C(C=C3)F)C4=C2C=CNC4=O,1 110 | CC(C)C1(C)CCC2(C)CCC3(C)C(CCC4C5(C)CCC(OC(=O)CC(C)(C)C(=O)O)C(C)(C)C5CCC43C)C2C1,0 111 | CC(C)c1[nH]nc(n1)[C@@H]2C[C@@H]3CCCC[C@@H]3[NH2+]2,0 112 | CC(C)c1c(=O)[nH]c(nc1[O-])CNC,0 113 | CC(C)c1c(c(n(n1)C)N2CCC(C2)(C)C)N,0 114 | CC(C)c1c(-c2ccc(F)cc2)cnn1Cc1ccc(C(=O)Nc2ccccc2)cc1,0 115 | CC(C)c1c(ncnc1N)c2ccc(c(c2)F)F,0 116 | CC(C)c1cc(C(=O)N(C)Cc2ccccc2)on1,0 117 | CC(C)c1cc(C(=O)Nc2c(C(C)C)nn(C)c2C(N)=O)ccc1F,0 118 | CC(C)c1ccc(-c2nnc(NC(=O)Cc3ccc(F)cc3)o2)cc1,0 119 | CC(C)c1ccc(CC(=O)Nc2cccc(-c3nnn[nH]3)c2)cc1,0 120 | CC(C)c1ccc(CNC(=O)Nc2cccc(C(N)=O)c2)cc1,0 121 | CC(C)c1ccc(N2CC(C(=O)Nc3ccc(C(=O)N4CCCCC4)cc3)CC2=O)cc1,0 122 | CC(C)c1ccc(NC(=O)c2cc(C(C)C)nc3ccccc23)cc1,0 123 | CC(C)C1CCC2(C)CCC3(C)C(CCC4C5(C)CCC(C)(O)C(C)(C)C5CCC43C)C12,0 124 | CC(C)c1ccc2c(c1)C(=O)CC2c1ccccc1-c1ccccn1,0 125 | CC(C)c1ccccc1N(CC(=O)NC1CCCCC1)S(C)(=O)=O,0 126 | CC(C)C1COc2ccc(NC(=O)NC3CCCCC3)cc2N1,0 127 | CC(C)c1nc(nc(n1)[O-])N,0 128 | CC(C)c1nc(sn1)NC2[C@H]3[C@@H]2C[NH2+]C3,0 129 | CC(C)CCC(C(C)C1CCC2C1(CCC3C2CC=C4C3(CCC(C4)O)C)C)O,1 130 | CC(C)CCCC(C)C1CCC2C1(CCCC2=CC=C3CC(CCC3=C)O)C,1 131 | CC(C)CN1C2=C(C(=O)N(C1=O)C)NC=N2,1 132 | CC(C)N(CCCNC(=O)NC1=CC=C(C=C1)C(C)(C)C)CC2C(C(C(O2)N3C=C(C4=C(N=CN=C43)N)Br)O)O,1 133 | CC(C)N(CCCNC(=O)NC1=CC=C(C=C1)C(C)(C)C)CC2C(C(C(O2)N3C=CC4=C(N=CN=C43)N)O)O,1 134 | CC(C)N1C2=NC=NC(=C2C(=N1)C3=CC4=C(N3)C=CC(=C4)O)N,1 135 | CC(C)N1CCC(CC1)NC2=NC(=NC3=CC(=C(C=C32)OC)OCCCN4CCCC4)C5CCCCC5,1 136 | CC(C=C(C)C=CC(=O)NO)C(=O)C1=CC=C(C=C1)N(C)C,1 137 | 
CC(C1CCC(CC1)C(=O)NC2=CC=NC=C2)N,1 138 | CC1(CCC(C2=C1C=CC(=C2)C(=O)NC3=CC=C(C=C3)C(=O)O)(C)C)C,1 139 | CC1=C(C(=CC=C1)Cl)NC(=O)C2=CN=C(S2)NC3=CC(=NC(=N3)C)N4CCN(CC4)CCO,1 140 | CC1=C(C(=NO1)C)C2=C(C=C3C(=C2)N=CC4=C3N(C(=O)N4)C(C)C5=CC=CC=N5)OC,1 141 | CC1=C(C(C(=C(N1)C)[N+](=O)[O-])C2=CC=CC=C2C(F)(F)F)C(=O)OC,1 142 | CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C,1 143 | CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CCO)C)C,1 144 | CC1=C(C2=C(N1)C=CC(=C2)O)CCN,1 145 | CC1=CC=C(C=C1)C2=C(N3C=C(C=CC3=N2)C)CC(=O)N(C)C,1 146 | CC1=CC=C(C=C1)C2=CN3C4=C(CCCC4)SC3=N2.Br,1 147 | CC1=CC=C(C=C1)C2=NN(C3=NC=NC(=C23)N)C(C)(C)C,1 148 | CC1=CN=C(N1)C2=CN=C(N=C2C3=C(C=C(C=C3)Cl)Cl)NCCNC4=NC=C(C=C4)C#N,1 149 | CC1=NC(=CC=C1)C2=C(C=NN2)C3=NC4=C(C=C3)N=CC=C4,1 150 | CC1=NC(=CC=C1)C2=C(N=C(N2)C(C)(C)C)C3=CC4=C(C=C3)OCO4,1 151 | CC1=NC(=CC=C1)C2=NN(C=C2C3=CC=NC4=CC=CC=C34)C(=S)NC5=CC=CC=C5,1 152 | CC1=NC2=CC=CC=C2C(=C1)NC(=O)NC3=C(C=CC(=C3)Cl)OC,1 153 | CC1C(C(C(C(O1)OC2=C(OC3=C(C2=O)C(=CC(=C3CC=C(C)C)OC4C(C(C(C(O4)CO)O)O)O)O)C5=CC=C(C=C5)OC)O)O)O,1 154 | CC1CC2C3CCC4=CC(=O)C=CC4(C3(C(CC2(C1(C(=O)CO)O)C)O)F)C,1 155 | Cc1ccc(CCC(=O)Nc2nnc(C3CC3)s2)cc1,0 156 | Cc1ccc(CN(C)C(=O)c2ccc(NC(=O)NC(C)C)cc2)cc1,0 157 | Cc1ccc(CN2CCc3[nH]nc(-c4cccs4)c3C2)s1,0 158 | Cc1ccc(CN2CCCC3(CNC(=O)C3)C2)cc1,0 159 | Cc1ccc(CN2CCN(CC(=O)NC3CCCCCC3)CC2)o1,0 160 | Cc1ccc(CNC(=O)c2ccccc2S(=O)(=O)N2CCOCC2)s1,0 161 | Cc1ccc(CNC(=O)Nc2cccc(-c3csc(C)n3)c2)o1,0 162 | Cc1ccc(CNC(=O)Nc2ccccc2C(F)(F)F)s1,0 163 | Cc1ccc(CNC(=O)NCc2ccc(-n3cccn3)cc2)o1,0 164 | Cc1ccc(CNC(=O)S(=O)(=O)c2ccc(C)cc2)cc1,0 165 | Cc1ccc(F)c(C(=O)N2CCN(Cc3ccc(F)cc3)CC2)c1,0 166 | Cc1ccc(F)cc1S(=O)(=O)N(Cc1ccccc1)C1CC1,0 167 | CC1CCC(N(C)C(=O)c2cccc(-c3nc(C(C)C)co3)c2)CC1,0 168 | Cc1ccc(N(CC(=O)Nc2ccccc2)Cc2ccccc2)cc1,0 169 | Cc1ccc(-n2c(C(=O)C(N)=O)nc3ccccc32)cc1,0 170 | Cc1ccc(-n2c(C)cc(C(=O)C(C)NC(=O)C(C)NC(=O)c3ccc(C(=O)O)c(O)c3)c2N)cc1,0 171 | Cc1ccc(N2CC(C(=O)Nc3nc(C)oc3C)CC2=O)cc1,0 172 | Cc1ccc(-n2cc(C(=O)NCC(C)C)c(=O)[nH]c2=O)cc1,0 173 | Cc1ccc(-n2ccc(C(=O)Nc3cccc(C#N)c3)n2)cc1,0 174 | Cc1ccc(-n2cccc2CN(C)C(=O)c2c[nH]c3ccccc23)cc1,0 175 | Cc1ccc(-n2cccn2)c(C(=O)NCc2ccccn2)c1,0 176 | Cc1ccc(N2CCN(C(=O)c3ccnc(OCC(F)F)c3)CC2)cc1C,0 177 | Cc1ccc(N2CCN(C(=O)CC(c3ccccc3)c3ccccc3)CC2)cc1C,0 178 | Cc1ccc(-n2nc(C)c(C(=O)NCc3ccco3)c2C)cc1,0 179 | Cc1ccc(-n2nc(C)cc2NC(=O)Nc2cc(C)on2)cc1,0 180 | Cc1ccc(NC(=O)c2ccc(C#N)cc2)c(OC(C)C)c1,0 181 | Cc1ccc(NC(=O)c2cccc(OC(F)F)c2)c(F)c1,0 182 | Cc1ccc(NC(=O)c2cccc(OCc3cc(C)cc(C)c3)c2)nc1,0 183 | Cc1ccc(NC(=O)C2CCCN(S(=O)(=O)c3ccc4c(c3)CCC4)CC2)cc1,0 184 | Cc1ccc(NC(=O)C2CCCN2C(=O)C(C)n2cccn2)cc1,0 185 | Cc1ccc(NC(=O)Cc2csc(-c3nc4ccccc4s3)n2)cc1,0 186 | Cc1ccc(NC(=O)CN2C(=O)C3C4CCC(C4)C3C2=O)cc1,0 187 | CC1CCC(NC(=O)CNC(=O)c2cccc(C#N)c2)CC1,0 188 | Cc1ccc(NC(=O)COC(=O)c2ccc(C(N)=O)cc2)cc1,0 189 | Cc1ccc(NC(=O)CS(=O)(=O)c2cccc([N+](=O)[O-])c2)cc1,0 190 | Cc1ccc(NC(=O)CSc2n[nH]c(-c3ccccc3C)n2)cc1,0 191 | Cc1ccc(NC(=O)N(C)Cc2nnc3n2CCCCC3)cc1,0 192 | Cc1ccc(NC(=O)NC(C)c2ccccc2)c(O)c1,0 193 | Cc1ccc(NC(=O)NC2CCc3ccccc32)c(C)c1,0 194 | Cc1ccc(NC(S)Cc2ccc(Nc3ccnc(C)c3)cc2)cc1,0 195 | Cc1ccc(O)c(-c2csc(NC(=O)NC(C)C)n2)c1,0 196 | Cc1ccc(o1)[C@H](C)n2c(c(c(c2N)C#N)C)C,0 197 | Cc1ccc(OC(C)C(=O)NC(C)C)c(SC(C)C)c1,0 198 | Cc1ccc(OC(C)C(=O)OCC(=O)Nc2ccccc2)cc1,0 199 | Cc1ccc(OC2CCN(C(=O)c3ccc(F)c(C#N)c3)CC2)nc1,0 200 | Cc1ccc(OCC(=O)c2c(C)nc3sc(C)cn23)cc1,0 201 | Cc1ccc(OCC(=O)N(C)c2cccc3ccccc23)cc1C,0 202 | Cc1ccc(OCC(=O)N(Cc2ccc(F)cc2)C2CC2)cc1,0 203 | Cc1ccc(OCC(=O)N2CC(C)OC(C)C2)c(C)c1,0 204 | Cc1ccc(OCC(=O)NC2CC(C)(C)Cc3ccccc32)cc1C,0 205 | 
Cc1ccc(OCCNC(=O)C(=O)NCC(c2ccccc2)N2CCOCC2)cc1F,0 206 | Cc1ccc(S(=O)(=O)N(C)C(C)c2ccc(OC(F)(F)F)cc2)cc1,0 207 | Cc1ccc(S(=O)(=O)N(C)C)c(NC(=O)CSc2nnc(C)n2C)c1,0 208 | Cc1ccc(S(=O)(=O)N(C)CC(=O)Nc2ccccc2C(=O)O)cc1,0 209 | Cc1ccc(S(=O)(=O)N(C)CCC(=O)Nc2ccccc2)cc1,0 210 | Cc1ccc(S(=O)(=O)N2CCC(C(=O)NCC3CCCO3)C2)cc1,0 211 | Cc1ccc(S(=O)(=O)NC2(CC(=O)NCc3ccccc3C(F)(F)F)CCCC2)cc1,0 212 | Cc1ccc(S(=O)(=O)NC2CCN(C(=O)CC(C)C)C2)cc1,0 213 | Cc1ccc(S(=O)(=O)Nc2sc3c(c2C#N)CCC3)c(C)c1,0 214 | Cc1ccc(s1)c2cc(n[nH]2)[N-]S(=O)(=O)c3cccc(c3)C#N,0 215 | Cc1ccc(s1)c2nc([n-]n2)NC(=O)c3cc4c(s3)CCOC4,0 216 | Cc1ccc2c(c1)N[C@@H]([N-]S2(=O)=O)C(=O)Nc3ccc(cc3)C(=O)N,0 217 | Cc1ccc2c(c1)N[C@H]([N-]S2(=O)=O)C(=O)Nc3ccc(cc3)C(=O)N,0 218 | Cc1ccc2c(c1)nc(CC(=O)NCCc1ccccn1)n2C,0 219 | CC1CCC2CC(C(=CC=CC=CC(CC(C(=O)C(C(C(=CC(C(=O)CC(OC(=O)C3CCCCN3C(=O)C(=O)C1(O2)O)C(C)CC4CCC(C(C4)OC)O)C)C)O)OC)C)C)C)OC,1 220 | Cc1ccc2cccc(C(=O)NCCCN3CCCCCC3)c2c1,0 221 | Cc1ccc2cnc(Nc3ccc(S(N)(=O)=O)cc3)nc2c1,0 222 | Cc1ccc2oc(-c3c[nH]nc3C(F)(F)F)nc2c1,0 223 | Cc1cccc(C(=O)N2CCC3(CCCN3C(=O)c3ccc(F)cc3)C2)c1,0 224 | Cc1cccc(C)c1NC(=O)C(C)Nc1cccc(F)c1,0 225 | Cc1cccc(C)c1NC(=O)CCCC(=O)Nc1ccccc1[N+](=O)[O-],0 226 | Cc1cccc(C)c1OCC(=O)Nc1ccc(N2CCNC2=O)cc1,0 227 | Cc1cccc(C)c1OCC(=O)Nc1ccc2c(c1)oc(=O)n2C,0 228 | Cc1cccc(c1)/C=N\n2c(nnc2[S-])c3c4c(n[nH]3)CCC4,0 229 | Cc1cccc(c1N)Oc2c(cccn2)Cl,0 230 | Cc1cccc(c1O)[N-]S(=O)(=O)c2cnn(c2)c3ccccc3F,0 231 | Cc1cccc(CN(C)C(=O)CCc2ccc(C#N)cc2)n1,0 232 | Cc1cccc(CN2CCN(C(=O)CCc3nc(C)c(C)s3)CC2)n1,0 233 | Cc1cccc(CNC(=O)c2ccc(N3CCNC3=O)cc2)c1,0 234 | Cc1cccc(CNC(=O)CCc2c[nH]c3ccccc23)c1,0 235 | Cc1cccc(COC(=O)N2CCC(Cc3ccccc3)CC2)c1,0 236 | Cc1cccc(COC2CC(=O)N(c3cc(C)on3)C2)c1,0 237 | Cc1cccc(n1)c2c(c[nH]n2)c3ccc4c(n3)cccn4,0 238 | CCCC(=O)[O-].[Na+],1 239 | CCCC(CCC)C(=O)O,1 240 | CCCCCC(=O)C=CC1C(CC(=O)C1CCCCCCC(=O)O)O,1 241 | CNC(=O)C1=CN=C(C=C1)NN.[Tc],1 242 | COC1=C(C=C(C=C1)C=CC(=O)NC2=CC=CC=C2C(=O)O)OC,1 243 | COC1=C(C=C(C=C1)C2CC(=O)NC2)OC3CCCC3,1 244 | COC1=C(C=C2C(=C1)CC3=C2NN=C3NC4=CC(=CC=C4)F)OC,1 245 | COC1=C(C=CC(=C1)C=CC(=O)CC(=O)C=CC2=CC(=C(C=C2)O)OC)O,1 246 | COCCOC1=CN=CC(=C1)C2=NC=C3C=CC4=C(C3=C2)NC5=C4C(=O)N=CC5,1 247 | CS(=O)(=O)NC1=C(C=C(C=C1)[N+](=O)[O-])OC2=CC=CC=C2,1 248 | O=C(O)c1ccc(-c2ccc(NC(=O)c3ccco3)s2)cc1,0 249 | O=c1c2ccccc2nc(C=Nc2ccccc2C(F)(F)F)n1Cc1ccccc1Cl,0 250 | O=C1CC(C(=O)Nc2ccc(-c3ccccc3)cc2)C(=O)c2ccccc21,0 251 | O=C1CC(c2ccccc2)CN(CCCOCCOCCO)C1,0 252 | O=c1ccc2cc([N+](=O)[O-])ccc2[nH]1,0 253 | O=C1CCc2cc(OC(=O)c3ccc(N4CCOCC4)cc3)ccc2N1,0 254 | O=Cc1cc(-c2ccc(-c3ccc(F)cc3)cc2)ccc1OCCc1ccc(C(=O)O)cc1,0 255 | O=S(=O)(c1ccc(-n2ccnn2)cc1)N1CCCCC1,0 256 | O=S(=O)(NCC(O)c1ccccc1)c1cccc(Cl)c1,0 257 | OCC1c2c(cnn2-c2ccccc2)CC(O)N1C1CCCC1,0 258 | CCCCCCCCC=CCCCCCCCC(=O)OCC(COP(=O)(O)O)O,1 259 | CCCCCCCCCCCCCC=CC(C(COP(=O)(O)O)N)O,1 260 | CCN(CC)C1=CC=C(C=C1)C=NNC(=O)C2=CC=C(C=C2)O,1 261 | O=S(=O)(NCC1CN2CCCCC2CO1)c1ccccc1,0 262 | Oc1ccc(Cc2cc(C3CCCCC3)c(C3CC3)o2)cc1,0 263 | CCNC(=O)C1CN(C2CC3=CNC4=CC=CC(=C34)C2=C1)C,1 264 | CCOC(=O)C1=CC=C(C=C1)NC(=O)OC2C(OC(C(C2OC(=O)NC3=CC4=C(C=C3)OCO4)OC(=O)NC5=CC6=C(C=C5)OCO6)OC)CO,1 265 | CN(C)CCCN1C=C(C2=C1C=CC(=C2)OC)C3=C(C(=O)NC3=O)C4=CNC5=CC=CC=C54,1 266 | COCCOC1=CN=CC(=C1)C2=NC=C3C=CC4=C(C3=C2)NC5=C4C(=O)N=CC5,1 267 | CN1C=C(C2=CC=CC=C21)C=C3C4=C(C=CC(=C4)S(=O)(=O)N)NC3=O,1 268 | CN1C=CN=C1C(=O)NC2=CN(C(=C2)C(=O)NC3=CN(C(=C3)C(=O)NC4=CN(C(=C4)C(=O)NCCC(=O)NCCCN(C)C)C)C)C,1 269 | -------------------------------------------------------------------------------- /upload.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | VirtuDockDL: Automated Virtual Screening 7 | 8 | 9 | 185 | 186 | 187 |
188 |

VirtuDockDL: A Deep Learning-based Python Pipeline for Virtual Screening

189 | 190 | 191 | 208 |
209 | 210 |
211 |
212 |
213 | 214 |
215 |

Welcome to VirtuDockDL

216 |

VirtuDockDL is your comprehensive solution for streamlining the process of drug discovery and molecular analysis. With our platform, you can harness the power of deep learning to perform virtual screening, evaluate molecular activities, and predict binding affinities with unprecedented accuracy and speed.

217 |
218 | Learn How to Use 219 | Get Started 220 |
221 |
222 |

Core Features:

223 | 224 |
225 |
226 |
227 |

Graph Neural Network-Based Ligand Prioritization

228 |

Streamline drug discovery with our GNN model, prioritizing ligands for speed and accuracy.

229 |
230 |
231 |
232 |
233 |

Descriptor Analysis

234 |

Analyze molecular descriptors to predict pharmacological profiles and drug-likeness.

235 |
236 |
237 |
238 |
239 |

Re-screening

240 |

Refine your ligand search iteratively, utilizing new data for targeted identification.

241 |
242 |
243 |
244 |
245 |
246 |
247 |

Protein Refinement

248 |

Upload PDB files to refine and optimize protein structures, preparing them for accurate docking.

249 |
250 |
251 |
252 |
253 |

Molecular Docking

254 |

Predict ligand interactions with state-of-the-art simulations, focusing on optimal compounds.

255 |
256 |
257 |
258 |
259 |

Scalable Data Processing

260 |

Efficiently process and analyze data across all scales, ensuring fast, reliable drug discovery results.

261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 | 270 |
271 |
272 |

Upload CSV File

273 |
274 | 275 | (Example) 276 | 277 |
278 | 279 |
280 |
281 | 282 | {% if generated_molecules %} 283 |

Generated Molecules

284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | {% for smiles, activity in generated_molecules %} 293 | 294 | 295 | 296 | 297 | {% endfor %} 298 | 299 |
SMILESActivity
{{ smiles }}{{ activity }}
300 | Download CSV 301 | {% endif %} 302 | 303 | 304 |
305 | {{ clusters_table|safe }} 306 |
307 | 308 | 309 | {% if plot_file_path %} 310 | Cluster Plot 311 | 312 | 313 | 314 | {% endif %} 315 | 316 | 317 |
318 | {% with messages = get_flashed_messages() %} 319 | {% if messages %} 320 |
    321 | {% for message in messages %} 322 |
  • {{ message }}
  • 323 | {% endfor %} 324 |
325 | {% endif %} 326 | 327 | {% endwith %} 328 |
329 |
330 | 331 | 332 |
333 |
334 |

Rescreening

335 |
336 |
337 | 338 | (Example) 339 | 340 |
341 | 342 |
343 |
344 | 345 | 346 | 347 | 348 | {% if success %} 349 |

Analysis successful!

350 |
351 | {{ cluster_table|safe }} 352 |
353 | 354 | 355 | {% if plot_file_p %} 356 |
357 | Cluster Plot 358 | 359 | 360 | 361 |
362 | {% endif %} 363 | 364 | {% if sdf_zip_file %} 365 |
366 |

Download the compounds in SDF format:

367 | 368 | 369 | 370 |
371 | {% endif %} 372 | 373 | {% endif %} 374 | 375 |
376 |
377 |
378 |

De Novo Molecule Generation

379 |
380 |
381 | 382 | 383 |
384 |
385 | 386 |
387 | 388 | 389 |
390 |
391 | 392 | 393 |
394 |
395 | 396 |
397 |
398 |
399 |
400 |
401 |
402 | 403 | 404 | 405 |
406 |
407 |

Upload Protein File

408 |
409 |
410 | 411 | 412 | 413 |
414 |
415 |
416 | 417 | {% if download_links %} 418 |
419 |

Results:

420 |
421 |

Generated Plots

422 | {% if download_links.ramachandran_plot %} 423 | Ramachandran Plot 424 | {% endif %} 425 | {% if download_links.sasa_per_residue_plot %} 426 | SASA Plot 427 | {% endif %} 428 |
429 |
430 |

Download Processed Files

431 | Download Stripped Protein 432 | Download Fixed Protein 433 | Download Minimized Protein 434 |
435 |
436 | {% endif %} 437 |
438 | 439 | 440 | 441 |
442 |
443 |

Upload Files for Docking

444 |
445 |
446 | 447 | 448 |
449 |
450 | 451 | 452 |
453 |

Docking Parameters

454 | 455 |
456 |
457 |
458 | 459 | 460 |
461 |
462 |
463 |
464 | 465 | 466 |
467 |
468 |
469 |
470 | 471 | 472 |
473 |
474 |
475 |
476 |
477 |
478 | 479 | 480 |
481 |
482 |
483 |
484 | 485 | 486 |
487 |
488 |
489 |
490 | 491 | 492 |
493 |
494 |
495 |
496 | 497 | 498 |
499 |
500 | 501 | 502 |
503 |
504 | 505 | 506 |
507 | 508 |
509 |
510 | 511 | 512 |
513 |

Docking Results

514 |

No results yet. Please upload files and start docking.

515 |
516 | 517 | 518 |
519 |

Docking Chart

520 | 521 |
522 |
523 |
524 | 525 | 526 |
527 |
528 |

Welcome to VirtuDockDL – Your Automated Virtual Screening Companion

529 |

VirtuDockDL leverages the power of deep learning to streamline the drug discovery process, making it faster, more accurate, and accessible. Whether you're refining protein structures, prioritizing ligands, or diving deep into molecular docking, VirtuDockDL is here to guide you every step of the way.

530 | 531 |

Getting Started:

532 |
    533 |
  • Upload Your Data: Begin by uploading your protein files and ligand datasets. VirtuDockDL accepts PDB files for proteins and CSV files for ligands. Ensure your ligand files are formatted correctly, with 'SMILES' and 'Activity' columns for virtual screening.
  • 534 |
  • Ligand Prioritization: Use our Graph Neural Network (GNN) model to efficiently prioritize ligands. This process helps in narrowing down potential candidates by predicting their pharmacological profiles.
  • 535 |
  • Protein Refinement: Upload your protein structures for refinement. Our platform will optimize your proteins to ensure accurate docking results, improving the prediction of ligand interactions.
  • 536 |
  • Molecular Docking: With your ligands prioritized and protein refined, proceed to the Molecular Docking tab. Here, VirtuDockDL simulates the interaction between your ligands and protein targets, helping identify the most promising compounds.
  • 537 |
  • Analysis and Download Results: Once docking is complete, analyze the results directly on VirtuDockDL. You can download the detailed reports and visualizations for further analysis.
  • 538 |
539 | 540 |

Tips for Success:

541 |

Ensure your input files are correctly formatted and contain all necessary information. Utilize the "De Novo Molecule Generation" feature to explore new ligands based on specified criteria, enhancing your drug discovery process. Take advantage of our re-screening feature to iteratively refine your search for the optimal ligand.

542 | 543 |

Technical Support:

544 |

Should you encounter any issues or have questions, please refer to our FAQ section or reach out to our support team. VirtuDockDL is continuously evolving, and your feedback is invaluable to us.

545 | 546 |

Disclaimer:

547 |

VirtuDockDL is designed for research purposes only. Users are responsible for the interpretation of the results, and it is recommended to corroborate the findings with experimental data.

548 | 549 |

Let's Revolutionize Drug Discovery Together

550 |

VirtuDockDL is more than a tool; it's your partner in the quest to discover new and effective therapeutics. Explore the possibilities, push the boundaries of what's achievable, and embark on a journey of innovation and discovery.

551 |
552 |
553 |
554 |
555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 752 | 753 | 754 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import random 2 | from flask import Flask, render_template, request, flash, redirect, url_for, send_from_directory, after_this_request 3 | from flask import send_from_directory, jsonify 4 | import os 5 | from flask import session 6 | import logging 7 | import json 8 | from openmm.app import PDBFile, Modeller, ForceField, Simulation, PME, HBonds 9 | from openmm import LangevinMiddleIntegrator 10 | from openmm.unit import kelvin, picosecond, picoseconds, nanometer 11 | from pathlib import Path 12 | from Bio.PDB import PDBIO 13 | from torch_geometric.data import Data 14 | from flask import Flask, request, jsonify 15 | from rdkit.Chem import PandasTools 16 | import zipfile 17 | import uuid 18 | import subprocess 19 | import shutil 20 | import time 21 | import requests 22 | from werkzeug.utils import secure_filename 23 | from flask import send_from_directory 24 | import torch.nn.functional as F 25 | from torch_geometric.nn import GCNConv, global_mean_pool 26 | from torch_geometric.data import Data 27 | from Bio.PDB import PDBParser 28 | import numpy as np 29 | from werkzeug.utils import secure_filename 30 | import csv 31 | import dgl 32 | import torch 33 | import torch.nn as nn 34 | from rdkit import Chem 35 | from rdkit.Chem import Descriptors, AllChem, MACCSkeys, RDKFingerprint 36 | import pandas as pd 37 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 38 | from torch.utils.data import Dataset, DataLoader 39 | from sklearn.model_selection import StratifiedShuffleSplit 40 | import numpy as np 41 | from flask import Flask 42 | from gnn_model import GNN 43 | from flask import Flask, request, render_template, flash, send_file 44 | import matplotlib.pyplot as plt 45 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 46 | from torch.utils.data import Dataset, DataLoader 47 | from sklearn.model_selection import StratifiedShuffleSplit 48 | import numpy as np 49 | import matplotlib.pyplot as plt 50 | from sklearn.metrics import roc_auc_score, roc_curve 51 | from torch.utils.data import DataLoader 52 | from gnn_model import GNN # Import GNN class from gnn_model.py 53 | from gnn_utils import mol_to_graph, MoleculeDataset, collate 54 | from sklearn.mixture import GaussianMixture 55 | from sklearn.metrics import silhouette_score, davies_bouldin_score 56 | from gnn_model import GNN 57 | from gnn_utils import collate, mol_to_graph, MoleculeDataset 58 | import torch 59 | from torch_geometric.data import Data 60 | from Bio.PDB import PDBParser, CaPPBuilder 61 | import mdtraj as md 62 | import openmm 63 | from openmm.app import * # This will import the necessary 'app' module classes and functions 64 | from openmm import * 65 | from openmm.unit import * 66 | from openmm.app import PDBFile, Modeller, ForceField 67 | from pdbfixer import PDBFixer 68 | from simtk.openmm.app import PDBFile 69 | import matplotlib.pyplot as plt 70 | from io import BytesIO 71 | from flask import Flask, send_from_directory, url_for, current_app, flash, redirect, render_template 72 | from datetime import datetime 73 | 74 | app = Flask(__name__) 75 | app.config['SECRET_KEY'] = 'your_secret_key' 76 | app.config['UPLOADED_FILES_DIR'] = 'uploaded_files' 77 | app.config['GENERATED_FILES_DIR'] = 'generated_files' 78 
| app.config['uploaded_files_dir'] = 'uploaded_files' 79 | app.config['generated_files_dir'] = 'generated_files' 80 | app.config['UPLOAD_FOLDER'] = 'uploads' 81 | app.config['DOCKING_RESULTS_DIR'] = 'docking_results' 82 | app.config['ALLOWED_EXTENSIONS'] = {'csv', 'zip', 'pdb', 'sdf'} 83 | 84 | # Ensure directories exist 85 | for directory in [app.config['GENERATED_FILES_DIR'], app.config['UPLOADED_FILES_DIR'], app.config['generated_files_dir'], app.config['uploaded_files_dir'], app.config['UPLOAD_FOLDER'], app.config['DOCKING_RESULTS_DIR']]: 86 | os.makedirs(directory, exist_ok=True) 87 | 88 | # Directory setup 89 | for directory in [app.config['GENERATED_FILES_DIR'], app.config['UPLOADED_FILES_DIR'], app.config['generated_files_dir'], app.config['uploaded_files_dir'],app.config['UPLOAD_FOLDER']]: 90 | os.makedirs(directory, exist_ok=True) 91 | 92 | def save_data_to_csv(data, filename): 93 | """Save data to CSV format.""" 94 | with open(filename, 'w', newline='') as csv_file: 95 | fieldnames = ['SMILES', 'Activity'] 96 | writer = csv.DictWriter(csv_file, fieldnames=fieldnames) 97 | writer.writeheader() 98 | for smiles, activity in data: 99 | writer.writerow({'SMILES': smiles, 'Activity': activity}) 100 | 101 | 102 | def preprocess_csv(file): 103 | """Preprocess the uploaded CSV file.""" 104 | try: 105 | # Read the CSV file into a DataFrame 106 | df = pd.read_csv(file) 107 | 108 | # Check if the CSV file contains the required columns 109 | if 'SMILES' not in df.columns or 'Activity' not in df.columns: 110 | flash('The CSV file must have "SMILES" and "Activity" columns.', 'error') 111 | return None 112 | 113 | # Canonicalize and validate SMILES strings 114 | df['SMILES'] = df['SMILES'].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x), canonical=True) if Chem.MolFromSmiles(x) is not None else None) 115 | 116 | # Drop rows with None (invalid SMILES) values 117 | df.dropna(subset=['SMILES'], inplace=True) 118 | 119 | # Save the preprocessed data in CSV format 120 | timestamp = int(time.time()) 121 | filename = f'uploaded_data_{timestamp}.csv' 122 | save_data_to_csv(df.values.tolist(), filename) 123 | #flash('CSV file uploaded and saved successfully.', 'success') 124 | 125 | return df 126 | except Exception as e: 127 | flash(f'Error processing the CSV file: {str(e)}', 'error') 128 | return None 129 | 130 | 131 | def train_and_evaluate_model(train_dataloader, test_dataloader, model, optimizer, criterion, scheduler): 132 | """Train and evaluate the GNN model.""" 133 | best_val_loss = float('inf') 134 | patience = 10 135 | stop_counter = 0 136 | checkpoint_path = 'best_model.pth' 137 | for epoch in range(50): 138 | model.train() 139 | train_loss = 0 140 | for batched_graph, batched_features, batched_labels in train_dataloader: 141 | if batched_graph is None: 142 | continue 143 | optimizer.zero_grad() 144 | outputs = model(batched_graph, batched_features) 145 | loss = criterion(outputs, batched_labels) 146 | train_loss += loss.item() 147 | loss.backward() 148 | optimizer.step() 149 | train_loss /= len(train_dataloader) 150 | print(f"Epoch {epoch + 1}, Train Loss: {train_loss:.4f}") 151 | model.eval() 152 | val_loss = 0 153 | for batched_graph, batched_features, batched_labels in test_dataloader: 154 | if batched_graph is None: 155 | continue 156 | with torch.no_grad(): 157 | outputs = model(batched_graph, batched_features) 158 | loss = criterion(outputs, batched_labels) 159 | val_loss += loss.item() 160 | val_loss /= len(test_dataloader) 161 | scheduler.step(val_loss) 162 | print(f"Epoch 
{epoch + 1}, Validation Loss: {val_loss:.4f}") 163 | if val_loss < best_val_loss: 164 | best_val_loss = val_loss 165 | stop_counter = 0 166 | torch.save(model.state_dict(), checkpoint_path) 167 | else: 168 | stop_counter += 1 169 | if stop_counter >= patience: 170 | print("Early stopping triggered.") 171 | break 172 | model.load_state_dict(torch.load(checkpoint_path)) 173 | return model 174 | def download_clusters(): 175 | return send_file(os.path.join(app.config['GENERATED_FILES_DIR'], 'final_clusters.csv'), as_attachment=True, 176 | attachment_filename='final_clusters.csv') 177 | 178 | @app.route('/download', methods=['GET']) 179 | def download(): 180 | return send_file(os.path.join(app.config['GENERATED_FILES_DIR'], 'generated_molecules.csv'), as_attachment=True) 181 | 182 | @app.route('/', methods=['GET', 'POST']) 183 | def index(): 184 | timestamp = datetime.now().strftime("%Y%m%d%H%M%S") 185 | final_compounds_filename = f'final_compounds_{timestamp}.csv' 186 | final_clusters_filename = f'final_clusters_{timestamp}.csv' 187 | cluster_plot_filename = f'cluster_plot_{timestamp}.png' 188 | model = GNN(1, 64, 2) 189 | criterion = nn.CrossEntropyLoss() 190 | optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay=5e-4) 191 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=5) 192 | virtual_screening = False 193 | uploaded_file_path = None 194 | generated_file_path = None 195 | generated_molecules = None 196 | plot_file_path = None 197 | 198 | if request.method == 'POST': 199 | if 'file' in request.files: 200 | file = request.files['file'] 201 | if file.filename != '': 202 | filename = f'Molecules_{timestamp}.csv' 203 | uploaded_file_path = os.path.join(app.config['UPLOADED_FILES_DIR'], filename) 204 | file.save(uploaded_file_path) 205 | #flash('CSV file uploaded and saved successfully.', 'success') 206 | 207 | # Load data and preprocess 208 | data = pd.read_csv(uploaded_file_path) 209 | smiles = data["SMILES"].tolist() 210 | labels = data["Activity"].astype(int).tolist() 211 | full_dataset = MoleculeDataset(smiles, labels) 212 | 213 | splitter = StratifiedShuffleSplit(n_splits=1, train_size=0.8, random_state=42) 214 | train_indices, test_indices = next(splitter.split(smiles, labels)) 215 | train_dataset = [full_dataset[i] for i in train_indices] 216 | test_dataset = [full_dataset[i] for i in test_indices] 217 | 218 | train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate) 219 | test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate) 220 | 221 | #flash('Dataset prepared and split into training and testing sets successfully.', 'success') 222 | 223 | # Train the model 224 | model = train_and_evaluate_model(train_dataloader, test_dataloader, model, optimizer, criterion, 225 | scheduler) 226 | 227 | all_predictions, all_targets = [], [] 228 | for batched_graph, batched_features, batched_labels in test_dataloader: 229 | if batched_graph is None: 230 | continue 231 | 232 | with torch.no_grad(): 233 | outputs = model(batched_graph, batched_features) 234 | _, predicted = torch.max(outputs, 1) 235 | all_predictions.extend(predicted.cpu().numpy()) 236 | all_targets.extend(batched_labels.cpu().numpy()) 237 | 238 | accuracy = accuracy_score(all_targets, all_predictions) 239 | precision = precision_score(all_targets, all_predictions) 240 | recall = recall_score(all_targets, all_predictions) 241 | f1 = f1_score(all_targets, all_predictions) 242 | 243 | print(f"Test 
Accuracy: {accuracy:.4f}") 244 | print(f"Precision: {precision:.4f}") 245 | print(f"Recall: {recall:.4f}") 246 | print(f"F1 Score: {f1:.4f}") 247 | 248 | # Evaluate the model on test data 249 | model.eval() 250 | all_probabilities = [] 251 | 252 | for batched_graph, batched_features, batched_labels in test_dataloader: 253 | if batched_graph is None: 254 | continue 255 | with torch.no_grad(): 256 | outputs = model(batched_graph, batched_features) 257 | probabilities = torch.softmax(outputs, dim=1) 258 | all_probabilities.extend(probabilities.cpu().numpy()) 259 | # Calculate AUC 260 | all_probabilities = np.array(all_probabilities) 261 | true_labels = np.array(all_targets) 262 | class_1_probs = all_probabilities[:, 1] 263 | 264 | auc = roc_auc_score(true_labels, class_1_probs) 265 | 266 | print(f"AUC: {auc:.4f}") 267 | final_compounds = [(smiles[idx], class_1_probs[i]) for i, idx in enumerate(test_indices) if 268 | class_1_probs[i] >= 0.7] 269 | sorted_compounds = sorted(final_compounds, key=lambda x: x[1], reverse=True) 270 | final_df = pd.DataFrame(sorted_compounds, columns=['Compound', 'Probability']) 271 | generated_file_path = os.path.join(app.config['GENERATED_FILES_DIR'], final_compounds_filename) 272 | final_df.to_csv(generated_file_path, index=False) 273 | 274 | print("File saved successfully!") 275 | final_df = pd.read_csv(generated_file_path) 276 | # Extract the probabilities for clustering 277 | X = final_df[['Probability']].values 278 | 279 | # Fit the Gaussian Mixture Model 280 | n_clusters = 3 # you can change this to the desired number of clusters 281 | gmm = GaussianMixture(n_components=n_clusters, random_state=42) 282 | final_df['Cluster'] = gmm.fit_predict(X) 283 | # Evaluate clustering performance 284 | silhouette_avg = silhouette_score(X, final_df['Cluster']) 285 | davies_bouldin = davies_bouldin_score(X, final_df['Cluster']) 286 | print(f"Silhouette Score: {silhouette_avg:.4f}") 287 | print(f"Davies-Bouldin Score: {davies_bouldin:.4f}") 288 | # Save final results in a file 289 | final_clusters_file_path = os.path.join(app.config['GENERATED_FILES_DIR'], final_clusters_filename) 290 | final_df.to_csv(final_clusters_file_path, index=False) 291 | print("Final clusters saved successfully!") 292 | 293 | # Extract the probabilities and clusters for plotting 294 | X = final_df[['Probability']].values 295 | clusters = final_df['Cluster'].values 296 | 297 | # Create a scatter plot 298 | plt.figure(figsize=(10, 6)) 299 | for cluster in range(n_clusters): 300 | cluster_points = X[clusters == cluster] 301 | plt.scatter(cluster_points[:, 0], cluster_points[:, 0], label=f"Cluster {cluster}") 302 | 303 | # Plot the centroids 304 | centroids = gmm.means_ 305 | plt.scatter(centroids[:, 0], centroids[:, 0], c='red', marker='X', label='Centroids') 306 | 307 | # Add labels and legend 308 | plt.xlabel('Probability') 309 | plt.ylabel('Probability') 310 | plt.title('Cluster Plot') 311 | plt.legend() 312 | 313 | # Save the plot as an image file 314 | plot_file_path = os.path.join('static', cluster_plot_filename) # Assuming your static folder is set up correctly 315 | plt.savefig(plot_file_path) 316 | plt.close() 317 | # Perform virtual screening 318 | file_path = os.path.join(app.config['GENERATED_FILES_DIR'], final_clusters_filename) 319 | if not os.path.exists(file_path): 320 | flash('File "final_clusters.csv" does not exist. 
Please generate the clusters first.', 'warning')
321 | else:
322 | # Read CSV files into pandas dataframes
323 | compounds_df = pd.read_csv(os.path.join(app.config['GENERATED_FILES_DIR'], final_compounds_filename))
324 | clusters_df = pd.read_csv(file_path)
325 | # Convert dataframes to HTML tables
326 | compounds_table = compounds_df.to_html(classes='table table-striped table-bordered', index=False)
327 | clusters_table = clusters_df.to_html(classes='table table-striped table-bordered', index=False)
328 |
329 | virtual_screening = True
330 | # Pass data to template
331 | return render_template('upload.html', virtual_screening=virtual_screening,
332 | final_clusters_filename=final_clusters_filename,
333 | final_compounds_filename=final_compounds_filename,
334 | compounds_table=compounds_table, clusters_table=clusters_table,
335 | plot_file_path=plot_file_path[len('static/'):],
336 | generated_file_path=generated_file_path,
337 | final_clusters_file_path=final_clusters_file_path)  # Added clusters_table
338 | # Add return statement for GET request
339 | return render_template('upload.html', virtual_screening=virtual_screening,
340 | uploaded_file_path=uploaded_file_path,
341 | generated_file_path=generated_file_path,
342 | generated_molecules=generated_molecules,
343 | plot_file_path=plot_file_path)
344 |
345 | def allow_files(filename):
346 | ALLOWED_EXTENSIONS = {'csv'}
347 | return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
348 | # Ensure the static file serving route can handle the new library cluster plot image
349 | @app.route('/images/<filename>')
350 | def uploaded_file(filename):
351 | return send_from_directory('path to/PycharmProjects/pythonProject3/generated_files', filename)
352 | @app.route('/download/sdf_zip')
353 | def download_sdf_zip():
354 | return send_from_directory(app.config['GENERATED_FILES_DIR'], 'compounds_sdf.zip', as_attachment=True)
355 | def get_compound_name_from_pubchem(smiles_string):
356 | # URL for the PubChem PUG-REST service
357 | url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles_string}/synonyms/JSON"
358 | try:
359 | response = requests.get(url)
360 | response.raise_for_status()
361 | data = response.json()
362 | name = data['InformationList']['Information'][0]['Synonym'][0]
363 | return name
364 | except requests.exceptions.HTTPError as http_err:
365 | print(f"HTTP error occurred: {http_err}")
366 | except Exception as err:
367 | print(f"An error occurred: {err}")
368 | return None
369 |
370 | @app.route('/rescreening', methods=['POST'])
371 | def rescreening():
372 | if request.method == 'POST':
373 | if 'file' in request.files:
374 | file = request.files['file']
375 | if file and allow_files(file.filename):
376 | filename = 'New_Library.csv'
377 | uploaded_file_path = os.path.join(app.config['UPLOADED_FILES_DIR'], filename)
378 | file.save(uploaded_file_path)
379 | #flash('CSV file uploaded and saved successfully.', 'success')
380 | # Load the trained model
381 | model = GNN(1, 64, 2)
382 | model.load_state_dict(torch.load("best_model.pth"))
383 | model.eval()
384 | # Create a dataset for the new library of compounds
385 | new_data = pd.read_csv(uploaded_file_path)
386 | new_smiles = new_data["SMILES"].tolist()
387 | new_dataset = MoleculeDataset(new_smiles, [0] * len(new_smiles))  # labels are not used in prediction
388 | # Use the model to predict the drug-like potential for each compound in the library
389 | new_dataloader = DataLoader(new_dataset, batch_size=32, shuffle=False, collate_fn=collate)
390 |
391 | all_probabilities = []
392 | for batched_graph, batched_features, _ in new_dataloader:
393 | if batched_graph is None:
394 | continue
395 |
396 | with torch.no_grad():
397 | outputs = model(batched_graph, batched_features)
398 | probabilities = torch.softmax(outputs, dim=1)
399 |
400 | all_probabilities.extend(probabilities.cpu().numpy())
401 | # Evaluate the results and perform clustering if needed
402 | all_probabilities = np.array(all_probabilities)
403 | class_1_probs = all_probabilities[:, 1]
404 | # Save final compounds with their respective predicted probabilities
405 | final_compounds = [(new_smiles[i], class_1_probs[i]) for i in range(len(new_smiles)) if class_1_probs[i] > 0.7]
406 | print(final_compounds)
407 | sorted_compounds = sorted(final_compounds, key=lambda x: x[1], reverse=True)
408 | final_df = pd.DataFrame(sorted_compounds, columns=['Compound', 'Probability'])
409 | generated_file_path = os.path.join(app.config['GENERATED_FILES_DIR'], 'new_library_predictions.csv')
410 | final_df.to_csv(generated_file_path, index=False)
411 |
412 | # Extract the probabilities for clustering
413 | X = final_df[['Probability']].values
414 |
415 | # Fit the Gaussian Mixture Model
416 | n_clusters = 3  # you can change this to the desired number of clusters
417 | gmm = GaussianMixture(n_components=n_clusters, random_state=42)
418 | final_df['Cluster'] = gmm.fit_predict(X)
419 |
420 | # Save final results in a file
421 | final_clusters_file_path = os.path.join(app.config['GENERATED_FILES_DIR'], 'new_library_clusters.csv')
422 | final_df.to_csv(final_clusters_file_path, index=False)
423 | # Convert SMILES to SDF
424 | compounds_sdf_dir = os.path.join(app.config['GENERATED_FILES_DIR'], 'compounds_sdf')
425 | if not os.path.exists(compounds_sdf_dir):
426 | os.makedirs(compounds_sdf_dir)
427 |
428 | # Save compounds to SDF with PubChem names
429 | for index, row in final_df.iterrows():
430 | mol = Chem.MolFromSmiles(row['Compound'])
431 | compound_name = get_compound_name_from_pubchem(
432 | row['Compound']) or f"Compound_{index}"  # Fetch compound name
433 | if mol:
434 | mol.SetProp("_Name", compound_name)
435 | mol.SetProp("Probability", str(row['Probability']))
436 | mol.SetProp("Cluster", str(row['Cluster']))
437 |
438 | # Create a filename from the compound name
439 | safe_filename = secure_filename(
440 | compound_name)  # Use secure_filename to ensure it's safe for file paths
441 | sdf_filename = f"{safe_filename}.sdf" if safe_filename else f"Compound_{index}.sdf"
442 | sdf_path = os.path.join(compounds_sdf_dir, sdf_filename)
443 |
444 | with Chem.SDWriter(sdf_path) as writer:
445 | writer.write(mol)
446 |
447 | else:
448 | # Skip entries whose SMILES string fails to parse
449 | print(f"Skipping compound with unparsable SMILES: {row['Compound']}")
450 |
451 |
452 | # Zip the SDF directory
453 | sdf_zipfile_path = os.path.join(app.config['GENERATED_FILES_DIR'], 'compounds_sdf.zip')
454 | with zipfile.ZipFile(sdf_zipfile_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
455 | for root, _, files in os.walk(compounds_sdf_dir):
456 | for file in files:
457 | file_path = os.path.join(root, file)
458 | zipf.write(file_path, arcname=os.path.relpath(file_path, compounds_sdf_dir))
459 |
460 | # Clean up the individual SDF files after zipping by removing the directory
461 | shutil.rmtree(compounds_sdf_dir)
462 |
463 | # Extract the probabilities and clusters for plotting
464 | clusters = final_df['Cluster'].values
465 |
466 | # Create a scatter plot
467 |
                plt.figure(figsize=(10, 6))
                for cluster in range(n_clusters):
                    cluster_points = X[clusters == cluster]
                    plt.scatter(cluster_points[:, 0], cluster_points[:, 0], label=f"Cluster {cluster}")

                # Plot the cluster centroids
                centroids = gmm.means_
                plt.scatter(centroids[:, 0], centroids[:, 0], c='red', marker='X', label='Centroids')

                # Add labels and a legend
                plt.xlabel('Probability')
                plt.ylabel('Probability')
                plt.title('Cluster Plot')
                plt.legend()

                # Save the plot as an image file
                new_plot_file_path = os.path.join(app.config['GENERATED_FILES_DIR'], 'new_library_cluster_plot.png')
                plt.savefig(new_plot_file_path)
                plt.close()

                # Convert the results to HTML tables: one with probabilities,
                # one restricted to the compound/cluster assignment
                compound_table = final_df.to_html(classes='table table-striped table-bordered', index=False)
                cluster_table = final_df[['Compound', 'Cluster']].to_html(classes='table table-striped table-bordered',
                                                                          index=False)

                return render_template('upload.html', success=True, compound_table=compound_table,
                                       cluster_table=cluster_table, plot_file_p='new_library_cluster_plot.png',
                                       sdf_zip_file='compounds_sdf.zip')
    # Fall through for requests without a valid CSV file
    return render_template('upload.html')


def generate_de_novo_molecules(num_molecules, apply_lipinski=True):
    generated_mol = []
    elements = ['C', 'H', 'O', 'N']  # this list can be expanded with more elements

    generated_files_dir = app.config['GENERATED_FILES_DIR']

    while len(generated_mol) < num_molecules:
        # Build a random atom string of variable length and try to parse it as
        # SMILES; most random strings are invalid and are simply skipped
        compound = ''.join(random.choice(elements) for _ in range(random.randint(5, 20)))
        mol = Chem.MolFromSmiles(compound)
        if mol is None:
            continue

        activity = 0

        if apply_lipinski:
            molecular_weight = Descriptors.MolWt(mol)
            logP = Descriptors.MolLogP(mol)
            num_h_donors = Descriptors.NumHDonors(mol)
            num_h_acceptors = Descriptors.NumHAcceptors(mol)
            # Keep the molecule only if it meets the rule-of-five style criteria
            if 150 <= molecular_weight <= 500 and -2 <= logP <= 5 and num_h_donors <= 5 and num_h_acceptors <= 10:
                generated_mol.append((Chem.MolToSmiles(mol, canonical=True), activity))
        else:
            generated_mol.append((Chem.MolToSmiles(mol, canonical=True), activity))

    filename = 'Molecules.csv'
    generated_file_p = os.path.join(generated_files_dir, filename)
    save_data_to_csv(generated_mol, generated_file_p)

    return generated_mol


@app.route('/generate', methods=['POST'])
def generate_molecules():
    num_molecules = int(request.form['num_molecules'])
    apply_lipinski = request.form.get('options') == 'lipinski'
    generated_molecules = generate_de_novo_molecules(num_molecules, apply_lipinski)

    # generate_de_novo_molecules already wrote the CSV; resolve its path for download
    filename = 'Molecules.csv'
    file_path = os.path.join(app.config['GENERATED_FILES_DIR'], filename)

    return send_file(file_path, as_attachment=True, download_name=filename)


@app.route('/downloads/<filename>')
def downloads(filename):
    directory = app.config['GENERATED_FILES_DIR']
    try:
        return send_from_directory(directory, filename, as_attachment=True)
    except FileNotFoundError:
        return "File not found.", 404
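
# Illustrative sketch (not called anywhere in the app): the rule-of-five style
# filter used in generate_de_novo_molecules() above, factored into a single
# predicate. The function name and the default SMILES are hypothetical; it
# relies on the RDKit imports this module already uses.
def _passes_lipinski_example(smiles: str = "CCO") -> bool:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False  # unparseable SMILES fails the filter
    return (150 <= Descriptors.MolWt(mol) <= 500
            and -2 <= Descriptors.MolLogP(mol) <= 5
            and Descriptors.NumHDonors(mol) <= 5
            and Descriptors.NumHAcceptors(mol) <= 10)
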
# Create a logger to capture output that would normally go to Flask's app.logger
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def perform_protein_refinement(protein_file_path):
    timestamp = int(time.time())
    # Output file names, tagged with a timestamp so repeated runs do not collide
    stripped_pdb_filename = f'protein_stripped_{timestamp}.pdb'
    fixed_pdb_filename = f'fixed_output_{timestamp}.pdb'
    minimized_pdb_filename = f'minimized_protein_{timestamp}.pdb'
    ramachandran_plot_filename = f'ramachandran_plot_{timestamp}.png'
    sasa_per_residue_plot_filename = f'sasa_per_residue_plot_{timestamp}.png'
    logger.debug(f"Starting protein refinement for: {protein_file_path}")
    # Strip everything except protein atoms (waters, ions, and ligands are removed)
    traj = md.load(protein_file_path)
    protein = traj.topology.select('protein')
    stripped_traj = traj.atom_slice(protein)
    stripped_traj.save(stripped_pdb_filename)

    # Repair the structure: add missing residues, atoms, and hydrogens at pH 7.4
    fixer = PDBFixer(stripped_pdb_filename)
    fixer.findMissingResidues()
    fixer.findMissingAtoms()
    fixer.addMissingAtoms()
    fixer.addMissingHydrogens(7.4)
    with open(fixed_pdb_filename, 'w') as f:
        PDBFile.writeFile(fixer.topology, fixer.positions, f)
    logger.debug("Protein fixed with PDBFixer. Saved to " + fixed_pdb_filename)

    pdb = PDBFile(fixed_pdb_filename)
    modeller = Modeller(pdb.topology, pdb.positions)
    forcefield = ForceField('amber14-all.xml', 'amber14/tip3pfb.xml')
    try:
        modeller.addHydrogens(forcefield)
        logger.debug("Added hydrogens to the model.")
    except Exception as e:
        logger.error(f"An error occurred while adding hydrogens: {e}")
        raise

    # Energy-minimize the fixed structure with OpenMM
    system = forcefield.createSystem(modeller.topology, nonbondedMethod=PME,
                                     nonbondedCutoff=1 * nanometer, constraints=HBonds)
    integrator = LangevinMiddleIntegrator(300 * kelvin, 1 / picosecond, 0.004 * picoseconds)
    simulation = Simulation(modeller.topology, system, integrator)
    simulation.context.setPositions(modeller.positions)
    simulation.minimizeEnergy(maxIterations=500)
    with open(minimized_pdb_filename, 'w') as f:
        state = simulation.context.getState(getPositions=True)
        PDBFile.writeFile(modeller.topology, state.getPositions(), f)
    logger.debug("Minimized protein structure saved to " + minimized_pdb_filename)

    # Reload the minimized structure for analysis
    traj = md.load(minimized_pdb_filename)
    # Generate and save a Ramachandran plot
    phi, psi = md.compute_phi(traj), md.compute_psi(traj)
    phi_angles = np.rad2deg(md.compute_dihedrals(traj, phi[0]))
    psi_angles = np.rad2deg(md.compute_dihedrals(traj, psi[0]))

    plt.figure(figsize=(8, 6))
    plt.scatter(phi_angles, psi_angles, s=2, c='blue', alpha=0.5)
    # Shade approximate alpha-helix regions
    plt.fill_betweenx(np.arange(-180, 50, 1), -100, -45, color='orange', alpha=0.25)
    plt.fill_betweenx(np.arange(-100, 180, 1), 45, 100, color='orange', alpha=0.25)

    # Shade approximate beta-sheet regions
    plt.fill_between(np.arange(-180, 180, 1), 135, 180, color='green', alpha=0.25)
    plt.fill_between(np.arange(-180, 180, 1), -180, -135, color='green', alpha=0.25)

    plt.xlim(-180, 180)
    plt.ylim(-180, 180)
    plt.xlabel('Phi (φ) angles (degrees)')
    plt.ylabel('Psi (ψ) angles (degrees)')
    plt.title('Ramachandran Plot with Highlighted Secondary Structure Regions')
    plt.grid(True)
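    # Note: the shaded bands above are rough visual guides to the commonly
    # favoured alpha-helix and beta-sheet regions, not exact Ramachandran
    # contours; the cut-off angles are approximate.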
    # Annotate the secondary-structure regions
    plt.text(-75, 150, 'β-sheet', horizontalalignment='center', verticalalignment='center', color='green',
             alpha=0.75)
    plt.text(-60, -60, 'α-helix', horizontalalignment='center', verticalalignment='center', color='orange',
             alpha=0.75)
    plt.text(60, 60, 'α-helix', horizontalalignment='center', verticalalignment='center', color='orange',
             alpha=0.75)
    plt.text(100, -160, 'β-sheet', horizontalalignment='center', verticalalignment='center', color='green',
             alpha=0.75)

    plt.savefig(f'static/{ramachandran_plot_filename}')
    plt.close()
    # Compute SASA and plot the average SASA per residue
    sasa = md.shrake_rupley(traj, mode='residue')
    plt.plot(np.mean(sasa, axis=0))
    plt.title('Average Solvent Accessible Surface Area (SASA) per residue')
    plt.xlabel('Residue')
    plt.ylabel('SASA (nm²)')
    plt.savefig(f'static/{sasa_per_residue_plot_filename}')
    plt.close()

    return {
        'stripped_pdb': stripped_pdb_filename,
        'fixed_pdb': fixed_pdb_filename,
        'minimized_pdb': minimized_pdb_filename,
        'ramachandran_plot': f'static/{ramachandran_plot_filename}',
        'sasa_per_residue_plot': f'static/{sasa_per_residue_plot_filename}'
    }


def allowed_file(filename):
    ALLOWED_EXTENSIONS = {'pdb'}  # Add or remove file extensions as needed.
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


@app.route('/protein_refinement', methods=['GET', 'POST'])
def protein_refinement():
    try:
        if request.method == 'POST':
            # Check that the POST request has the file part
            if 'file' not in request.files:
                flash('No file part', 'error')
                return redirect(request.url)
            file = request.files['file']
            if file.filename == '':
                flash('No selected file', 'error')
                return redirect(request.url)
            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename)
                protein_file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
                file.save(protein_file_path)
                # Run the refinement pipeline once and keep its output file names
                result_files = perform_protein_refinement(protein_file_path)
                # Build download links and visualisation data for the results
                download_links = {
                    'stripped_protein': url_for('uploa', filename=result_files['stripped_pdb']),
                    'fixed_protein': url_for('uploa', filename=result_files['fixed_pdb']),
                    'minimized_protein': url_for('uploa', filename=result_files['minimized_pdb']),
                    'ramachandran_plot': url_for('static', filename=os.path.basename(result_files['ramachandran_plot'])),
                    'sasa_per_residue_plot': url_for('static', filename=os.path.basename(result_files['sasa_per_residue_plot']))
                }

                return render_template('upload.html', download_links=download_links, random=int(time.time()),
                                       active_tab='protein_refinement')
    except Exception as e:
        app.logger.error(f"An error occurred during protein refinement: {str(e)}")
        flash('An error occurred during processing.', 'error')
        return redirect(request.url)
    return render_template('upload.html', active_tab='protein_refinement')


@app.route('/files/<filename>')
def uploa(filename):
    # Serve refinement outputs, which perform_protein_refinement saves with
    # relative filenames that resolve to the app's root directory
    directory = current_app.root_path
    return send_from_directory(directory, filename)
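
# Serving note: send_from_directory rejects paths that escape the given
# directory, so the user-supplied filename above cannot be used for path
# traversal; serving from a dedicated output directory instead of root_path
# would still be a tidier choice.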
def allowed_fil(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in {'zip', 'pdb'}


def convert_sdf_to_pdbqt(output_directory):
    # Convert every SDF file found under output_directory to PDBQT format
    for root, dirs, files in os.walk(output_directory):
        for file in files:
            if file.endswith(".sdf"):
                sdf_path = os.path.join(root, file)
                pdbqt_filename = file.replace('.sdf', '.pdbqt')
                pdbqt_path = os.path.join(root, pdbqt_filename)
                # Prepare the obabel command; --gen3d generates 3D coordinates
                # and -h adds hydrogens
                obabel_command = [
                    'obabel', sdf_path, '-O', pdbqt_path,
                    '--gen3d', '-h'
                ]
                try:
                    subprocess.run(obabel_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    print(f"Conversion successful for {file}")
                except subprocess.CalledProcessError as e:
                    print(f"An error occurred while converting {file}: {e.stderr.decode()}")


def convert_protein(protein_pdb_path, protein_pdbqt_path):
    # Convert a protein PDB file to PDBQT; the -xr flag removes residues
    # not recognised by AutoDock
    obabel_command = [
        'obabel', protein_pdb_path, '-xr', '-O', protein_pdbqt_path
    ]
    try:
        subprocess.run(obabel_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print(f"Conversion successful for {protein_pdb_path}")
    except subprocess.CalledProcessError as e:
        error_message = e.stderr.decode() if e.stderr else 'An error occurred.'
        print(f"An error occurred while converting {protein_pdb_path}: {error_message}")


def clear_workspace(workspace_path):
    # Remove any previous contents and recreate the directory
    if os.path.exists(workspace_path):
        shutil.rmtree(workspace_path)
    os.makedirs(workspace_path)


@app.route('/upload', methods=['POST'])
def upload_files():
    # Generate a unique job ID for this particular user's session or job
    job_id = uuid.uuid4().hex
    job_workspace = os.path.join(app.config['UPLOAD_FOLDER'], job_id)
    job_results_dir = os.path.join(app.config['DOCKING_RESULTS_DIR'], job_id)

    # Create fresh directories for the job
    clear_workspace(job_workspace)
    clear_workspace(job_results_dir)

    # Save the uploaded protein and ligand files
    protein_file = request.files.get('protein_file')
    ligand_zip = request.files.get('ligand_zip')

    if protein_file and allowed_fil(protein_file.filename) and ligand_zip and allowed_fil(ligand_zip.filename):
        protein_filename = secure_filename(protein_file.filename)
        ligand_zip_filename = secure_filename(ligand_zip.filename)

        protein_file_path = os.path.join(job_workspace, protein_filename)
        ligand_zip_path = os.path.join(job_workspace, ligand_zip_filename)

        protein_file.save(protein_file_path)
        ligand_zip.save(ligand_zip_path)

        # Unzip the ligands into the job workspace
        output_directory_path = os.path.join(job_workspace, 'refined_ligands')
        Path(output_directory_path).mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(ligand_zip_path, 'r') as zip_ref:
            zip_ref.extractall(output_directory_path)

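        # Pipeline order: (1) obabel converts each extracted SDF ligand to
        # PDBQT with 3D coordinates and hydrogens; (2) the receptor PDB is
        # converted to PDBQT; (3) AutoDock Vina docks every ligand against
        # the receptor, writing results into this job's results directory.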
        convert_sdf_to_pdbqt(output_directory=output_directory_path)
        protein_pdbqt_path = protein_file_path.replace('.pdb', '.pdbqt')
        convert_protein(protein_file_path, protein_pdbqt_path)
        run_docking(protein_pdbqt_path, output_directory_path, job_results_dir)

        return jsonify({'job_id': job_id, 'message': 'Files uploaded, conversion started, and docking initiated!'})
    else:
        return jsonify({'error': 'Invalid file type or missing files.'}), 400


def run_docking(protein_pdbqt, ligand_directory_path, results_directory_path):
    print("Starting the docking process...")  # Debug print
    # Extract the docking parameters once, before looping over the ligands;
    # request.values covers both form fields (POST) and query parameters (GET)
    center_x = request.values.get('center_x', type=float)
    center_y = request.values.get('center_y', type=float)
    center_z = request.values.get('center_z', type=float)
    size_x = request.values.get('size_x', type=float)
    size_y = request.values.get('size_y', type=float)
    size_z = request.values.get('size_z', type=float)
    exhaustiveness = request.values.get('exhaustiveness', type=int)
    num_modes = request.values.get('num_modes', type=int)
    energy_range = request.values.get('energy_range', type=int)

    for ligand_file in Path(ligand_directory_path).glob('*.pdbqt'):
        ligand_pdbqt = str(ligand_file)
        result_file_path = os.path.join(results_directory_path, ligand_file.stem + '_docked.pdbqt')

        # Write a Vina configuration file for this ligand
        config_text = f"""receptor = {protein_pdbqt}
ligand = {ligand_pdbqt}

center_x = {center_x}
center_y = {center_y}
center_z = {center_z}
size_x = {size_x}
size_y = {size_y}
size_z = {size_z}

out = {result_file_path}
exhaustiveness = {exhaustiveness}
num_modes = {num_modes}
energy_range = {energy_range}
"""
        config_file_path = os.path.join(results_directory_path, ligand_file.stem + '_config.txt')
        with open(config_file_path, 'w') as config_file:
            config_file.write(config_text)

        # Run Vina, capturing its output
        vina_command = ['vina', '--config', config_file_path]
        try:
            result = subprocess.run(vina_command, capture_output=True, text=True)
            if result.returncode != 0:
                print(f"Error in docking: {result.stderr}")  # Log any errors
            else:
                print(f"Docking completed for {ligand_file.stem}. Output:\n{result.stdout}")
        except Exception as e:
            print(f"An exception occurred: {e}")  # Log any exceptions
        finally:
            # Clean up the config file after docking
            os.remove(config_file_path)
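    # Each docked PDBQT file contains one "REMARK VINA RESULT:" line per pose, e.g.
    #   REMARK VINA RESULT:    -7.5    0.000    0.000
    # where the fields after the colon are the binding affinity (kcal/mol) and
    # the RMSD lower/upper bounds relative to the best pose.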
    # Collect docking data from every docked ligand
    docking_data = []
    for file_name in Path(results_directory_path).glob('*_docked.pdbqt'):
        with open(file_name, 'r') as file:
            lines = file.readlines()
        # Extract data for all poses
        for line in lines:
            if line.startswith("REMARK VINA RESULT:"):
                # Parse out the binding affinity and RMSD bounds
                parts = line.split()
                binding_affinity = float(parts[3])  # fourth whitespace-separated field
                rmsd_lb = float(parts[4])  # RMSD lower bound
                rmsd_ub = float(parts[5])  # RMSD upper bound
                docking_data.append({
                    'file_name': os.path.basename(file_name),
                    'binding_affinity': binding_affinity,
                    'rmsd_lb': rmsd_lb,
                    'rmsd_ub': rmsd_ub
                })

    # Summarise the results if any docking data was collected
    if docking_data:
        df = pd.DataFrame(docking_data)
        # The first pose is the RMSD reference (both bounds are zero), so the
        # second pose (index 1) is taken for each ligand to obtain nonzero RMSDs
        df_second_poses = df.groupby('file_name').nth(1)
        df_second_poses['final_rmsd'] = df_second_poses['rmsd_ub'] - df_second_poses['rmsd_lb']
        df_best_poses = df_second_poses
        csv_file_path = os.path.join(results_directory_path, 'docking_results.csv')
        df_best_poses.to_csv(csv_file_path, index=False)
    else:
        print("No docking data to process.")


def validate_docking_output(docked_file_path):
    # Sanity-check a docked file by printing its first ten lines
    if os.path.exists(docked_file_path) and os.path.getsize(docked_file_path) > 0:
        with open(docked_file_path, 'r') as file:
            for i in range(10):
                line = file.readline()
                if not line:
                    break
                print(line.strip())
    else:
        print(f"Docked file {docked_file_path} not found or is empty.")


@app.route('/docking', methods=['GET'])
def docking():
    protein_file_path = request.args.get('protein_file_path', type=str)
    protein_pdbqt_path = os.path.join(app.config['UPLOADED_FILES_DIR'], protein_file_path)
    ligand_directory_path = os.path.join(app.config['GENERATED_FILES_DIR'], 'refined_ligands')
    results_directory_path = os.path.join(app.config['DOCKING_RESULTS_DIR'])

    run_docking(protein_pdbqt_path, ligand_directory_path, results_directory_path)
    return jsonify({'message': 'Docking completed!'})


@app.route('/list_docking_results')
def list_docking_results():
    results_files = Path(app.config['DOCKING_RESULTS_DIR']).glob('*_docked.pdbqt')
    results_list = [str(result) for result in results_files if result.is_file() and result.stat().st_size > 0]
    return jsonify(results_list)


@app.route('/results/<filename>')
def download_results(filename):
    results_directory_path = os.path.join(app.config['DOCKING_RESULTS_DIR'])
    return send_from_directory(results_directory_path, filename, as_attachment=True)
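
# Example client flow for the docking endpoints (illustrative; host, port,
# file names, and the job id are hypothetical placeholders):
#   curl -F protein_file=@receptor.pdb -F ligand_zip=@ligands.zip \
#        -F center_x=12.5 -F center_y=3.0 -F center_z=-8.2 \
#        -F size_x=20 -F size_y=20 -F size_z=20 \
#        -F exhaustiveness=8 -F num_modes=9 -F energy_range=3 \
#        http://localhost:5000/upload
#   curl http://localhost:5000/analyze_results/<job_id>   # downloads docking_results.csv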
@app.route('/analyze_results/<job_id>', methods=['GET'])
def analyze_results(job_id):
    # Directory where the results for this job are stored
    results_directory = os.path.join(app.config['DOCKING_RESULTS_DIR'], job_id)
    filepath = os.path.join(results_directory, 'docking_results.csv')

    if os.path.isfile(filepath) and os.path.getsize(filepath) > 0:
        return send_file(filepath, as_attachment=True)  # Send the file for download
    else:
        return jsonify({'message': 'Results not ready'}), 202


@app.route('/chart_data/<job_id>')
def chart_data(job_id):
    # Construct the file path using the job_id provided in the URL
    job_results_dir = os.path.join(app.config['DOCKING_RESULTS_DIR'], job_id)
    filepath = os.path.join(job_results_dir, 'docking_results.csv')

    if os.path.isfile(filepath):
        df = pd.read_csv(filepath)
        # Build a bar-chart payload of binding affinities
        binding_affinities = df['binding_affinity'].tolist()
        file_names = df['file_name'].tolist()
        chart_data = {
            'labels': file_names,
            'datasets': [{
                'label': 'Binding Affinity',
                'data': binding_affinities,
                'backgroundColor': 'rgba(0, 123, 255, 0.5)',
                'borderColor': 'rgba(0, 123, 255, 1)',
                'borderWidth': 1
            }]
        }
        return jsonify(chart_data)
    else:
        return jsonify({'message': 'Results not ready for job ' + job_id}), 202


@app.route('/download_complexes/<job_id>')
def download_complexes(job_id):
    job_results_dir = os.path.join(app.config['DOCKING_RESULTS_DIR'], job_id)

    # Check that the job results directory exists
    if not os.path.isdir(job_results_dir):
        return abort(404, description="Job results not found.")

    # Write the zip archive into an in-memory buffer
    zip_in_memory = BytesIO()
    with zipfile.ZipFile(zip_in_memory, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(job_results_dir):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, job_results_dir))
    zip_in_memory.seek(0)
    zip_filename = f'{job_id}_results.zip'
    return send_file(zip_in_memory, download_name=zip_filename, as_attachment=True, mimetype='application/zip')


if __name__ == "__main__":
    # Make sure all working directories exist before starting the server
    for dir_key in ('UPLOADED_FILES_DIR', 'GENERATED_FILES_DIR', 'UPLOAD_FOLDER', 'DOCKING_RESULTS_DIR'):
        os.makedirs(app.config[dir_key], exist_ok=True)
    app.run(debug=True)
--------------------------------------------------------------------------------