├── MIMOSA.png ├── result ├── jnk.pkl ├── qed.pkl ├── jnkgsk.pkl ├── qed_f_t.txt ├── jnk_f_t.txt └── jnkgsk_f_t.txt ├── src ├── __pycache__ │ ├── dpp.cpython-37.pyc │ ├── module.cpython-37.pyc │ ├── utils.cpython-37.pyc │ ├── chemutils.cpython-37.pyc │ ├── gnn_layer.cpython-37.pyc │ └── inference_utils.cpython-37.pyc ├── utils.py ├── download.py ├── clean.py ├── dpp.py ├── vocabulary.py ├── train.py ├── evaluate.py ├── gnn_layer.py ├── module.py ├── run.py ├── inference_utils.py └── chemutils.py ├── save_model └── GNN_epoch_0_validloss_1.61160.ckpt ├── data ├── vocabulary.txt └── substructure.txt ├── conda.yml ├── mimosa.yml └── README.md /MIMOSA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/MIMOSA.png -------------------------------------------------------------------------------- /result/jnk.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/result/jnk.pkl -------------------------------------------------------------------------------- /result/qed.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/result/qed.pkl -------------------------------------------------------------------------------- /result/jnkgsk.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/result/jnkgsk.pkl -------------------------------------------------------------------------------- /src/__pycache__/dpp.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/dpp.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/module.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/module.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/chemutils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/chemutils.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/gnn_layer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/gnn_layer.cpython-37.pyc -------------------------------------------------------------------------------- /save_model/GNN_epoch_0_validloss_1.61160.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/save_model/GNN_epoch_0_validloss_1.61160.ckpt -------------------------------------------------------------------------------- /src/__pycache__/inference_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/inference_utils.cpython-37.pyc -------------------------------------------------------------------------------- /result/qed_f_t.txt: -------------------------------------------------------------------------------- 1 | 0.495 0.0 2 | 0.514 0.025 3 | 0.612 0.064 4 | 0.731 0.065 5 | 0.815 0.041 6 | 0.856 0.033 7 | 0.889 0.022 8 | 0.912 0.018 9 | 0.921 0.019 -------------------------------------------------------------------------------- /result/jnk_f_t.txt: -------------------------------------------------------------------------------- 1 | 0.0 0.0 2 | 0.008 0.011 3 | 0.021 0.023 4 | 0.044 0.034 5 | 0.060 0.035 6 | 0.070 0.042 7 | 0.085 0.051 8 | 0.101 0.057 9 | 0.115 0.057 10 | 0.122 0.059 11 | 0.136 0.047 -------------------------------------------------------------------------------- /result/jnkgsk_f_t.txt: -------------------------------------------------------------------------------- 1 | 0.015 0.0 2 | 0.022 0.010 3 | 0.038 0.031 4 | 0.065 0.040 5 | 0.086 0.043 6 | 0.103 0.039 7 | 0.131 0.039 8 | 0.144 0.046 9 | 0.151 0.037 10 | 0.170 0.038 11 | 0.182 0.036 -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | 5 | class Molecule_Dataset(torch.utils.data.Dataset): 6 | def __init__(self, smiles_lst): 7 | self.smiles_lst = smiles_lst 8 | 9 | def __len__(self): 10 | return len(self.smiles_lst) 11 | 12 | def __getitem__(self, idx): 13 | return self.smiles_lst[idx] 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/download.py: -------------------------------------------------------------------------------- 1 | from tdc.generation import MolGen 2 | data = MolGen(name = 'ZINC') 3 | # from random import shuffle 4 | # smiles_lst = data.get_data()['smiles'].to_list() 5 | # shuffle(smiles_lst) 6 | # smiles_lst = smiles_lst[:data_size] 7 | # with open("data/zinc_" + str(data_size) + ".txt", 'w') as fout: 8 | # for smiles in smiles_lst: 9 | # fout.write(smiles + '\n') 10 | 11 | 12 | """ 13 | python src/download.py 14 | """ 15 | 16 | 17 | -------------------------------------------------------------------------------- /data/vocabulary.txt: -------------------------------------------------------------------------------- 1 | C 1158545 2 | O 500212 3 | N 280451 4 | C1=CC=CC=C1 257945 5 | F 79430 6 | S 51103 7 | Cl 42872 8 | C1=CC=NC=C1 27852 9 | C1CCCCC1 20256 10 | C1=CNN=C1 18920 11 | C1=CSC=C1 17515 12 | C1CCNCC1 15912 13 | C1CC1 15462 14 | C1CCCC1 14328 15 | Br 12722 16 | C1=CSC=N1 12617 17 | C1COCCN1 11924 18 | C1CNCCN1 11701 19 | C1=COC=C1 11274 20 | C1CCCN1 9739 21 | C1=CN=CN=C1 7964 22 | C1CC[NH+]CC1 7948 23 | C1CCNC1 7634 24 | C1CCCNC1 7277 25 | C1=CCCC=C1 6243 26 | C1=NN=CN1 5748 27 | C1CNCC1 5513 28 | C1CCOC1 5310 29 | C1=CNC=N1 5201 30 | C1=NOC=N1 5141 31 | -------------------------------------------------------------------------------- /src/clean.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import os 3 | # from chemutils import vocabulary, smiles2word 4 | from chemutils import is_valid, logp_modifier 5 | smiles_database = "data/zinc.tab" 6 | clean_smiles_database = "data/zinc_clean.txt" 7 | 8 | 9 | with open(smiles_database, 'r') as fin: 10 | lines = fin.readlines()[1:] 11 | smiles_lst = [i.strip().strip('"') for i in lines] 12 | 13 | clean_smiles_lst = [] 14 | for smiles in tqdm(smiles_lst): 15 | if is_valid(smiles): 16 | clean_smiles_lst.append(smiles) 17 | clean_smiles_set = set(clean_smiles_lst) 18 | with open(clean_smiles_database, 'w') as fout: 19 | for smiles in clean_smiles_set: 20 | fout.write(smiles + '\n') 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/dpp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | np.random.seed(1) 4 | 5 | class DPPModel(object): 6 | def __init__(self, smiles_lst, sim_matrix, f_scores, top_k, lamb): 7 | self.smiles_lst = smiles_lst 8 | self.sim_matrix = sim_matrix # (n,n) 9 | self.lamb = lamb 10 | self.f_scores = np.exp(f_scores) * self.lamb # (n,) 11 | self.max_iter = top_k 12 | self.n = len(smiles_lst) 13 | self.kernel_matrix = self.f_scores.reshape((self.n, 1)) \ 14 | * sim_matrix * self.f_scores.reshape((1, self.n)) 15 | self.log_det_V = np.sum(f_scores) * self.lamb 16 | self.log_det_S = np.log(np.linalg.det(np.mat(self.kernel_matrix))) 17 | 18 | def dpp(self): 19 | c = np.zeros((self.max_iter, self.n)) 20 | d = np.copy(np.diag(self.kernel_matrix)) ### diagonal 21 | j = np.argmax(d) 22 | Yg = [j] 23 | _iter = 0 24 | Z = list(range(self.n)) 25 | while len(Yg) < self.max_iter: 26 | Z_Y = set(Z).difference(set(Yg)) 27 | for i in Z_Y: 28 | if _iter == 0: 29 | ei = self.kernel_matrix[j, i] / np.sqrt(d[j]) 30 | else: 31 | ei = (self.kernel_matrix[j, i] - np.dot(c[:_iter, j], c[:_iter, i])) / np.sqrt(d[j]) 32 | c[_iter, i] = ei 33 | d[i] = d[i] - ei * ei 34 | d[j] = 0 35 | j = np.argmax(d) 36 | Yg.append(j) 37 | _iter += 1 38 | 39 | return [self.smiles_lst[i] for i in Yg], self.log_det_V, self.log_det_S 40 | 41 | 42 | 43 | if __name__ == "__main__": 44 | rank_score = np.random.random(size=(100)) 45 | item_embedding = np.random.randn(100, 5) 46 | item_embedding = item_embedding / np.linalg.norm(item_embedding, axis=1, keepdims=True) 47 | sim_matrix = np.dot(item_embedding, item_embedding.T) 48 | 49 | dpp = DPPModel(smiles_lst=list(range(100)), sim_matrix = sim_matrix, f_scores = rank_score, top_k = 10) 50 | Yg = dpp.dpp() 51 | print(Yg) 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/vocabulary.py: -------------------------------------------------------------------------------- 1 | # from chemutils import smiles2word 2 | 3 | import os 4 | from collections import defaultdict 5 | from tqdm import tqdm 6 | from rdkit import Chem, DataStructs 7 | 8 | 9 | def smiles2mol(smiles): 10 | mol = Chem.MolFromSmiles(smiles) 11 | if mol is None: 12 | return None 13 | Chem.Kekulize(mol) 14 | return mol 15 | 16 | ## input: smiles, output: word lst; 17 | def smiles2word(smiles): 18 | mol = smiles2mol(smiles) 19 | if mol is None: 20 | return None 21 | word_lst = [] 22 | 23 | cliques = [list(x) for x in Chem.GetSymmSSSR(mol)] 24 | cliques_smiles = [] 25 | for clique in cliques: 26 | clique_smiles = Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True) 27 | cliques_smiles.append(clique_smiles) 28 | atom_not_in_rings_list = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()] 29 | return cliques_smiles + atom_not_in_rings_list 30 | 31 | 32 | 33 | all_vocabulary_file = "data/substructure.txt" 34 | rawdata_file = "data/zinc.tab" 35 | select_vocabulary_file = "data/vocabulary.txt" 36 | 37 | if not os.path.exists(all_vocabulary_file): 38 | with open(rawdata_file) as fin: 39 | lines = fin.readlines()[1:] 40 | smiles_lst = [line.strip().strip('"') for line in lines] 41 | word2cnt = defaultdict(int) 42 | for smiles in tqdm(smiles_lst): 43 | word_lst = smiles2word(smiles) 44 | for word in word_lst: 45 | word2cnt[word] += 1 46 | word_cnt_lst = [(word,cnt) for word,cnt in word2cnt.items()] 47 | word_cnt_lst = sorted(word_cnt_lst, key=lambda x:x[1], reverse = True) 48 | 49 | with open(all_vocabulary_file, 'w') as fout: 50 | for word, cnt in word_cnt_lst: 51 | fout.write(word + '\t' + str(cnt) + '\n') 52 | else: 53 | with open(all_vocabulary_file, 'r') as fin: 54 | lines = fin.readlines() 55 | word_cnt_lst = [(line.split('\t')[0], int(line.split('\t')[1])) for line in lines] 56 | 57 | 58 | word_cnt_lst = list(filter(lambda x:x[1]>5000, word_cnt_lst)) 59 | print(len(word_cnt_lst)) 60 | 61 | with open(select_vocabulary_file, 'w') as fout: 62 | for word, cnt in word_cnt_lst: 63 | fout.write(word + '\t' + str(cnt) + '\n') 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /conda.yml: -------------------------------------------------------------------------------- 1 | name: mimosa 2 | channels: 3 | - rdkit 4 | - soumith 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - blas=1.0=mkl 9 | - bzip2=1.0.8=h7b6447c_0 10 | - ca-certificates=2021.1.19=h06a4308_0 11 | - cairo=1.14.12=h8948797_3 12 | - certifi=2020.12.5=py37h06a4308_0 13 | - fontconfig=2.13.0=h9420a91_0 14 | - freetype=2.10.4=h5ab3b9f_0 15 | - glib=2.66.1=h92f7085_0 16 | - icu=58.2=he6710b0_3 17 | - intel-openmp=2020.2=254 18 | - jpeg=9b=h024ee3a_2 19 | - lcms2=2.11=h396b838_0 20 | - ld_impl_linux-64=2.33.1=h53a641e_7 21 | - libboost=1.73.0=h3ff78a5_11 22 | - libedit=3.1.20191231=h14c3975_1 23 | - libffi=3.3=he6710b0_2 24 | - libgcc-ng=9.1.0=hdf63c60_0 25 | - libpng=1.6.37=hbc83047_0 26 | - libstdcxx-ng=9.1.0=hdf63c60_0 27 | - libtiff=4.1.0=h2733197_1 28 | - libuuid=1.0.3=h1bed415_2 29 | - libxcb=1.14=h7b6447c_0 30 | - libxml2=2.9.10=hb55368b_3 31 | - lz4-c=1.9.3=h2531618_0 32 | - mkl=2020.2=256 33 | - mkl-service=2.3.0=py37he8ac12f_0 34 | - mkl_fft=1.2.0=py37h23d657b_0 35 | - mkl_random=1.1.1=py37h0573a6f_0 36 | - ncurses=6.2=he6710b0_1 37 | - numpy=1.19.2=py37h54aff64_0 38 | - numpy-base=1.19.2=py37hfa32c7d_0 39 | - olefile=0.46=py_0 40 | - openssl=1.1.1i=h27cfd23_0 41 | - pandas=1.2.1=py37ha9443f7_0 42 | - pcre=8.44=he6710b0_0 43 | - pillow=8.1.0=py37he98fc37_0 44 | - pip=20.3.3=py37h06a4308_0 45 | - pixman=0.40.0=h7b6447c_0 46 | - py-boost=1.73.0=py37ha9443f7_11 47 | - python=3.7.9=h7579374_0 48 | - python-dateutil=2.8.1=py_0 49 | - pytz=2020.5=pyhd3eb1b0_0 50 | - rdkit=2020.09.1.0=py37hd50e099_1 51 | - readline=8.0=h7b6447c_0 52 | - setuptools=52.0.0=py37h06a4308_0 53 | - six=1.15.0=pyhd3eb1b0_0 54 | - sqlite=3.33.0=h62c20be_0 55 | - tk=8.6.10=hbc83047_0 56 | - wheel=0.36.2=pyhd3eb1b0_0 57 | - xz=5.2.5=h7b6447c_0 58 | - zlib=1.2.11=h7b6447c_3 59 | - zstd=1.4.5=h9ceee32_0 60 | - pip: 61 | - chardet==4.0.0 62 | - cycler==0.10.0 63 | - decorator==4.4.2 64 | - fuzzywuzzy==0.18.0 65 | - idna==2.10 66 | - joblib==1.0.0 67 | - kiwisolver==1.3.1 68 | - matplotlib==3.3.4 69 | - networkx==2.5 70 | - pyparsing==2.4.7 71 | - pytdc==0.1.5 72 | - requests==2.25.1 73 | - scikit-learn==0.23.2 74 | - scipy==1.6.0 75 | - threadpoolctl==2.1.0 76 | - torch==1.7.1 77 | - torchvision==0.8.2 78 | - tqdm==4.56.0 79 | - typing-extensions==3.7.4.3 80 | - urllib3==1.26.3 81 | 82 | -------------------------------------------------------------------------------- /mimosa.yml: -------------------------------------------------------------------------------- 1 | name: mimosa 2 | channels: 3 | - rdkit 4 | - pytorch 5 | - anaconda 6 | - defaults 7 | dependencies: 8 | - blas=1.0=mkl 9 | - bzip2=1.0.8=h1de35cc_0 10 | - ca-certificates=2020.1.1=0 11 | - cairo=1.14.12=hc4e6be7_4 12 | - certifi=2020.4.5.1=py37_0 13 | - cffi=1.14.0=py37hb5b8e2f_0 14 | - fontconfig=2.13.0=h5d5b041_1 15 | - freetype=2.9.1=hb4e5f40_0 16 | - gettext=0.19.8.1=h15daf44_3 17 | - glib=2.63.1=hd977a24_0 18 | - icu=58.2=h0a44026_3 19 | - intel-openmp=2019.4=233 20 | - joblib=0.14.1=py_0 21 | - jpeg=9b=he5867d9_2 22 | - libboost=1.67.0=hebc422b_4 23 | - libcxx=4.0.1=hcfea43d_1 24 | - libcxxabi=4.0.1=hcfea43d_1 25 | - libedit=3.1.20181209=hb402a30_0 26 | - libffi=3.2.1=h0a44026_6 27 | - libgfortran=3.0.1=h93005f0_2 28 | - libiconv=1.16=h1de35cc_0 29 | - libpng=1.6.37=ha441bb4_0 30 | - libtiff=4.1.0=hcb84e12_0 31 | - libxml2=2.9.9=hf6e021a_1 32 | - llvm-openmp=4.0.1=hcfea43d_1 33 | - mkl=2019.4=233 34 | - mkl-service=2.3.0=py37hfbe908c_0 35 | - mkl_fft=1.0.15=py37h5e564d8_0 36 | - mkl_random=1.1.0=py37ha771720_0 37 | - ncurses=6.2=h0a44026_1 38 | - ninja=1.9.0=py37h04f5b5a_0 39 | - numpy=1.18.1=py37h7241aed_0 40 | - numpy-base=1.18.1=py37h6575580_1 41 | - olefile=0.46=py37_0 42 | - openssl=1.1.1g=h1de35cc_0 43 | - pandas=1.0.3=py37h6c726b0_0 44 | - pcre=8.43=h0a44026_0 45 | - pillow=7.0.0=py37h4655f20_0 46 | - pip=20.0.2=py37_1 47 | - pixman=0.38.0=h1de35cc_0 48 | - py-boost=1.67.0=py37h6440ff4_4 49 | - pycparser=2.20=py_0 50 | - python=3.7.7=hc70fcce_0_cpython 51 | - python-dateutil=2.8.1=py_0 52 | - pytorch=1.0.1=py3.7_2 53 | - pytz=2019.3=py_0 54 | - rdkit=2020.03.1.0=py37h65625ec_1 55 | - readline=8.0=h1de35cc_0 56 | - scikit-learn=0.21.3=py37h27c97d8_0 57 | - scipy=1.4.1=py37h9fa6033_0 58 | - setuptools=46.1.3=py37_0 59 | - six=1.14.0=py37_0 60 | - sqlite=3.31.1=h5c1f38d_1 61 | - tk=8.6.8=ha441bb4_0 62 | - tqdm=4.45.0=py_0 63 | - wheel=0.34.2=py37_0 64 | - xz=5.2.5=h1de35cc_0 65 | - zlib=1.2.11=h1de35cc_3 66 | - zstd=1.3.7=h5bba6e5_0 67 | - pip: 68 | - decorator==4.4.2 69 | - isodate==0.6.0 70 | - molvs==0.1.1 71 | - networkx==2.4 72 | - plyfile==0.7.2 73 | - protobuf==3.11.3 74 | - pyparsing==2.4.7 75 | - rdflib==5.0.0 76 | - tensorboardx==2.0 77 | - torch-cluster==1.2.4 78 | - torch-geometric==1.0.3 79 | - torch-scatter==1.1.2 80 | - torch-sparse==0.2.4 81 | - torch-spline-conv==1.0.6 82 | 83 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | from tqdm import tqdm 7 | from matplotlib import pyplot as plt 8 | import pickle 9 | from random import shuffle 10 | torch.manual_seed(4) 11 | np.random.seed(2) 12 | from module import GCN 13 | from chemutils import smiles2graph, vocabulary, smiles2feature 14 | from utils import Molecule_Dataset 15 | 16 | 17 | device = 'cpu' 18 | data_file = "data/zinc_clean.txt" 19 | with open(data_file, 'r') as fin: 20 | lines = fin.readlines() 21 | 22 | shuffle(lines) 23 | lines = [line.strip() for line in lines] 24 | N = int(len(lines) * 0.9) 25 | train_data = lines[:N] 26 | valid_data = lines[N:] 27 | 28 | 29 | 30 | training_set = Molecule_Dataset(train_data) 31 | valid_set = Molecule_Dataset(valid_data) 32 | params = {'batch_size': 1, 33 | 'shuffle': True, 34 | 'num_workers': 1} 35 | # exit() 36 | 37 | 38 | def collate_fn(batch_lst): 39 | return batch_lst 40 | 41 | train_generator = torch.utils.data.DataLoader(training_set, collate_fn = collate_fn, **params) 42 | valid_generator = torch.utils.data.DataLoader(valid_set, collate_fn = collate_fn, **params) 43 | 44 | gnn = GCN(nfeat = 50, nhid = 100, num_layer = 3).to(device) 45 | print('GNN is built!') 46 | # exit() 47 | 48 | cost_lst = [] 49 | valid_loss_lst = [] 50 | epoch = 5 51 | every_k_iters = 5000 52 | save_folder = "save_model/GNN_epoch_" 53 | for ep in tqdm(range(epoch)): 54 | for i, smiles in tqdm(enumerate(train_generator)): 55 | ### 1. training 56 | smiles = smiles[0] 57 | node_mat, adjacency_matrix, idx, label = smiles2feature(smiles) ### smiles2feature: only mask leaf node 58 | # idx_lst, node_mat, substructure_lst, atomidx_2substridx, adjacency_matrix, leaf_extend_idx_pair = smiles2graph(smiles) 59 | node_mat = torch.FloatTensor(node_mat).to(device) 60 | adjacency_matrix = torch.FloatTensor(adjacency_matrix).to(device) 61 | label = torch.LongTensor([label]).view(-1).to(device) 62 | # print('label', label) 63 | cost = gnn.learn(node_mat, adjacency_matrix, idx, label) 64 | cost_lst.append(cost) 65 | 66 | #### 2. validation 67 | if i % every_k_iters == 0: 68 | gnn.eval() 69 | valid_loss, valid_num = 0,0 70 | for smiles in valid_generator: 71 | smiles = smiles[0] 72 | node_mat, adjacency_matrix, idx, label = smiles2feature(smiles) 73 | node_mat = torch.FloatTensor(node_mat).to(device) 74 | adjacency_matrix = torch.FloatTensor(adjacency_matrix).to(device) 75 | label = torch.LongTensor([label]).view(-1).to(device) 76 | cost, _ = gnn.infer(node_mat, adjacency_matrix, idx, label) 77 | valid_loss += cost 78 | valid_num += 1 79 | valid_loss = valid_loss / valid_num 80 | valid_loss_lst.append(valid_loss) 81 | file_name = save_folder + str(ep) + "_validloss_" + str(valid_loss)[:7] + ".ckpt" 82 | torch.save(gnn, file_name) 83 | gnn.train() 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /src/evaluate.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | from time import time 4 | from tqdm import tqdm 5 | from matplotlib import pyplot as plt 6 | import pickle 7 | from random import shuffle 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from tdc import Oracle 12 | torch.manual_seed(1) 13 | np.random.seed(2) 14 | from tdc import Evaluator 15 | 16 | from chemutils import * 17 | ## 2. data and oracle 18 | # qed = Oracle(name = 'qed') 19 | # logp = Oracle(name = 'logp') 20 | # jnk = Oracle(name = 'JNK3') 21 | # gsk = Oracle(name = 'GSK3B') 22 | # def foracle(smiles): 23 | # return logp(smiles) 24 | 25 | oracle_name = sys.argv[1] 26 | # 'jnkgsk', 'qedsajnkgsk', 'qed', 'jnk', 'gsk' 27 | 28 | 29 | diversity = Evaluator(name = 'Diversity') 30 | novelty = Evaluator(name = 'Novelty') 31 | 32 | 33 | file = "data/zinc_clean.txt" 34 | with open(file, 'r') as fin: 35 | lines = fin.readlines() 36 | train_smiles_lst = [line.strip().split()[0] for line in lines][:1000] 37 | 38 | 39 | ## 5. run 40 | if __name__ == "__main__": 41 | 42 | # result_file = "result/denovo_from_" + start_smiles_lst[0] + "_generation_" + str(generations) + "_population_" + str(population_size) + ".pkl" 43 | # result_pkl = "result/ablation_dmg_topo_dmg_substr.pkl" 44 | # pkl_file = "result/denovo_qedlogpjnkgsk_start_ncncccn.pkl" 45 | pkl_file = "result/"+oracle_name+".pkl" 46 | idx_2_smiles2f, trace_dict = pickle.load(open(pkl_file, 'rb')) 47 | # bestvalue, best_smiles = 0, '' 48 | topk = 100 49 | whole_smiles2f = dict() 50 | for idx, (smiles2f,current_set) in tqdm(idx_2_smiles2f.items()): 51 | whole_smiles2f.update(smiles2f) 52 | # for smiles,f in smiles2f.items(): 53 | # if f > bestvalue: 54 | # bestvalue = f 55 | # print("best", f) 56 | # best_smiles = smiles 57 | 58 | smiles_f_lst = [(smiles,f) for smiles,f in whole_smiles2f.items()] 59 | smiles_f_lst.sort(key=lambda x:x[1], reverse=True) 60 | best_smiles_lst = [smiles for smiles,f in smiles_f_lst[:topk]] 61 | best_f_lst = [f for smiles,f in smiles_f_lst[:topk]] 62 | avg, std = np.mean(best_f_lst), np.std(best_f_lst) 63 | print('average of top-'+str(topk), str(avg)[:5], str(std)[:5]) 64 | #### evaluate novelty 65 | t1 = time() 66 | nov = novelty(best_smiles_lst, train_smiles_lst) 67 | t2 = time() 68 | print("novelty", nov, "takes", str(int(t2-t1)), 'seconds') 69 | 70 | ### evaluate diversity 71 | t1 = time() 72 | div = diversity(best_smiles_lst) 73 | t2 = time() 74 | print("diversity", div, 'takes', str(int(t2-t1)), 'seconds') 75 | 76 | 77 | # ### evaluate mean of property 78 | # for oracle_name in oracle_lst: 79 | # oracle = Oracle(name = oracle_name) 80 | # scores = oracle(best_smiles_lst) 81 | # avg = np.mean(scores) 82 | # std = np.std(scores) 83 | # print(oracle_name, str(avg)[:7], str(std)[:7]) 84 | 85 | # for ii,smiles in enumerate(best_smiles_lst[:20]): 86 | # print(smiles, str(gsk(smiles))) 87 | # draw_smiles(smiles, "figure/best_"+oracle_name+"_"+str(ii)+'.png') 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /src/gnn_layer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import numpy as np 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.nn.parameter import Parameter 7 | from torch.nn.modules.module import Module 8 | torch.manual_seed(3) 9 | np.random.seed(1) 10 | 11 | class GraphConvolution(Module): 12 | """ 13 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 14 | """ 15 | 16 | def __init__(self, in_features, out_features, bias=True, init='xavier'): 17 | super(GraphConvolution, self).__init__() 18 | self.in_features = in_features 19 | self.out_features = out_features 20 | self.weight = Parameter(torch.FloatTensor(in_features, out_features)) 21 | if bias: 22 | self.bias = Parameter(torch.FloatTensor(out_features)) 23 | else: 24 | self.register_parameter('bias', None) 25 | if init == 'uniform': 26 | print("| Uniform Initialization") 27 | self.reset_parameters_uniform() 28 | elif init == 'xavier': 29 | print("| Xavier Initialization") 30 | self.reset_parameters_xavier() 31 | elif init == 'kaiming': 32 | print("| Kaiming Initialization") 33 | self.reset_parameters_kaiming() 34 | else: 35 | raise NotImplementedError 36 | 37 | def reset_parameters_uniform(self): 38 | stdv = 1. / math.sqrt(self.weight.size(1)) 39 | self.weight.data.uniform_(-stdv, stdv) 40 | if self.bias is not None: 41 | self.bias.data.uniform_(-stdv, stdv) 42 | 43 | def reset_parameters_xavier(self): 44 | nn.init.xavier_normal_(self.weight.data, gain=0.02) # Implement Xavier Uniform 45 | if self.bias is not None: 46 | nn.init.constant_(self.bias.data, 0.0) 47 | 48 | def reset_parameters_kaiming(self): 49 | nn.init.kaiming_normal_(self.weight.data, a=0, mode='fan_in') 50 | if self.bias is not None: 51 | nn.init.constant_(self.bias.data, 0.0) 52 | 53 | def forward(self, input, adj): 54 | support = torch.mm(input, self.weight) 55 | # print("adj", adj.dtype, "support", support.dtype) 56 | output = torch.spmm(adj, support) 57 | if self.bias is not None: 58 | return output + self.bias 59 | else: 60 | return output 61 | 62 | def __repr__(self): 63 | return self.__class__.__name__ + ' (' \ 64 | + str(self.in_features) + ' -> ' \ 65 | + str(self.out_features) + ')' 66 | 67 | 68 | class GraphAttention(nn.Module): 69 | """ 70 | Simple GAT layer, similar to https://arxiv.org/abs/1710.10903 71 | """ 72 | 73 | def __init__(self, in_features, out_features, dropout, alpha, concat=True): 74 | super(GraphAttention, self).__init__() 75 | self.dropout = dropout 76 | self.in_features = in_features 77 | self.out_features = out_features 78 | self.alpha = alpha 79 | self.concat = concat 80 | 81 | self.W = nn.Parameter(nn.init.xavier_normal_(torch.Tensor(in_features, out_features).type(torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor), gain=np.sqrt(2.0)), requires_grad=True) 82 | self.a1 = nn.Parameter(nn.init.xavier_normal_(torch.Tensor(out_features, 1).type(torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor), gain=np.sqrt(2.0)), requires_grad=True) 83 | self.a2 = nn.Parameter(nn.init.xavier_normal_(torch.Tensor(out_features, 1).type(torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor), gain=np.sqrt(2.0)), requires_grad=True) 84 | 85 | self.leakyrelu = nn.LeakyReLU(self.alpha) 86 | 87 | def forward(self, input, adj): 88 | h = torch.mm(input, self.W) 89 | N = h.size()[0] 90 | 91 | f_1 = torch.matmul(h, self.a1) 92 | f_2 = torch.matmul(h, self.a2) 93 | e = self.leakyrelu(f_1 + f_2.transpose(0,1)) 94 | 95 | zero_vec = -9e15*torch.ones_like(e) 96 | attention = torch.where(adj > 0, e, zero_vec) 97 | attention = F.softmax(attention, dim=1) 98 | attention = F.dropout(attention, self.dropout, training=self.training) 99 | h_prime = torch.matmul(attention, h) 100 | 101 | if self.concat: 102 | return F.elu(h_prime) 103 | else: 104 | return h_prime 105 | 106 | def __repr__(self): 107 | return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')' 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /src/module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from copy import deepcopy 5 | from torch.autograd import Variable 6 | from torch.utils import data 7 | from torch.utils.data import SequentialSampler 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | sigmoid = torch.nn.Sigmoid() 11 | from tqdm import tqdm 12 | 13 | from gnn_layer import GraphConvolution, GraphAttention 14 | from chemutils import smiles2graph, vocabulary 15 | 16 | torch.manual_seed(4) 17 | np.random.seed(1) 18 | 19 | # def sigmoid(x): 20 | # return 1/(1+np.exp(-x)) 21 | # device = 'cuda' if torch.cuda.is_available() else 'cpu' 22 | device = 'cpu' 23 | 24 | class GCN(nn.Module): 25 | def __init__(self, nfeat, nhid, num_layer): 26 | super(GCN, self).__init__() 27 | self.gc1 = GraphConvolution(in_features = nfeat, out_features = nhid) 28 | self.gcs = [GraphConvolution(in_features = nhid, out_features = nhid) for i in range(num_layer)] 29 | # self.dropout = dropout 30 | from chemutils import vocabulary 31 | self.vocabulary_size = len(vocabulary) 32 | self.out_fc = nn.Linear(nhid, self.vocabulary_size) 33 | self.nfeat = nfeat 34 | self.nhid = nhid 35 | self.num_layer = num_layer 36 | # self.embedding = nn.Embedding(self.vocabulary_size, nfeat) 37 | self.embedding = nn.Linear(self.vocabulary_size + 1, nfeat) 38 | self.criteria = torch.nn.CrossEntropyLoss() 39 | self.opt = torch.optim.Adam(self.parameters(), lr=1e-3, betas=(0.9, 0.99)) 40 | self.device = device 41 | self = self.to(device) 42 | 43 | def switch_device(self, device): 44 | self.device = device 45 | self = self.to(device) 46 | 47 | def forward(self, node_mat, adj, idx): 48 | ''' N: # substructure & d: vocabulary size 49 | Input: 50 | node_mat: [N,d] row sum is 1. 51 | adj: [N,N] 52 | idx: integer 53 | 54 | Output: 55 | scalar prediction before sigmoid [-inf, inf] 56 | ''' 57 | node_mat, adj = node_mat.to(self.device), adj.to(self.device) 58 | x = self.embedding(node_mat) 59 | x = F.relu(self.gc1(x,adj)) 60 | for gc in self.gcs: 61 | x = F.relu(gc(x,adj)) 62 | x = x[idx].view(1,-1) 63 | logits = self.out_fc(x) 64 | return logits 65 | ## without sigmoid 66 | 67 | def smiles2embed(self, smiles): 68 | idx_lst, node_mat, substructure_lst, atomidx_2substridx, adj, leaf_extend_idx_pair = smiles2graph(smiles) 69 | idx_vec = torch.LongTensor(idx_lst).to(device) 70 | node_mat = torch.FloatTensor(node_mat).to(device) 71 | adj = torch.FloatTensor(adj).to(device) 72 | weight = torch.ones_like(idx_vec).to(device) 73 | 74 | ### forward 75 | node_mat, adj, weight = node_mat.to(self.device), adj.to(self.device), weight.to(self.device) 76 | x = self.embedding(node_mat) ## bug 77 | x = F.relu(self.gc1(x,adj)) 78 | for gc in self.gcs: 79 | x = F.relu(gc(x,adj)) 80 | return torch.mean(x, 0) 81 | 82 | 83 | def smiles2pred(self, smiles): 84 | idx_lst, node_mat, substructure_lst, atomidx_2substridx, adj, leaf_extend_idx_pair = smiles2graph(smiles) 85 | idx_vec = torch.LongTensor(idx_lst).to(device) 86 | node_mat = torch.FloatTensor(node_mat).to(device) 87 | adj = torch.FloatTensor(adj).to(device) 88 | weight = torch.ones_like(idx_vec).to(device) 89 | logits = self.forward(node_mat, adj, weight) 90 | pred = torch.sigmoid(logits) 91 | return pred.item() 92 | 93 | def learn(self, node_mat, adj, idx, label): 94 | pred_y = self.forward(node_mat, adj, idx) 95 | pred_y = pred_y.view(1,-1) 96 | # print(pred_y, pred_y.shape, label, label.shape) 97 | cost = self.criteria(pred_y, label) 98 | self.opt.zero_grad() 99 | cost.backward() 100 | self.opt.step() 101 | return cost.data.numpy(), pred_y.data.numpy() 102 | 103 | def infer(self, node_mat, adj, idx, target): 104 | pred_y = self.forward(node_mat, adj, idx) 105 | pred_y = pred_y.view(1,-1) 106 | cost = self.criteria(pred_y, target) 107 | return cost.data.numpy(), pred_y.data.numpy() 108 | 109 | 110 | if __name__ == "__main__": 111 | gnn = GCN(nfeat = 50, nhid = 100, num_layer = 2) 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /src/run.py: -------------------------------------------------------------------------------- 1 | import os, pickle, torch, random 2 | import numpy as np 3 | import argparse 4 | from time import time 5 | from tqdm import tqdm 6 | from matplotlib import pyplot as plt 7 | from random import shuffle 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from tdc import Oracle 11 | torch.manual_seed(1) 12 | np.random.seed(2) 13 | random.seed(1) 14 | from chemutils import * 15 | from inference_utils import * 16 | 17 | 18 | def optimization(start_smiles_lst, gnn, oracle, oracle_num, oracle_name, generations, population_size, lamb, topk, epsilon, result_pkl): 19 | smiles2score = dict() ### oracle_num 20 | def oracle_new(smiles): 21 | if smiles not in smiles2score: 22 | value = oracle(smiles) 23 | smiles2score[smiles] = value 24 | return smiles2score[smiles] 25 | trace_dict = dict() 26 | existing_set = set(start_smiles_lst) 27 | current_set = set(start_smiles_lst) 28 | average_f = np.mean([oracle_new(smiles) for smiles in current_set]) 29 | f_lst = [(average_f, 0.0)] 30 | idx_2_smiles2f = {} 31 | smiles2f_new = {smiles:oracle_new(smiles) for smiles in start_smiles_lst} 32 | idx_2_smiles2f[-1] = smiles2f_new, current_set 33 | for i_gen in tqdm(range(generations)): 34 | next_set = set() 35 | for smiles in current_set: 36 | smiles_set = optimize_single_molecule_one_iterate(smiles, gnn) 37 | 38 | for smi in smiles_set: 39 | if smi not in trace_dict: 40 | trace_dict[smi] = smiles ### ancestor -> offspring 41 | next_set = next_set.union(smiles_set) 42 | # next_set = next_set.difference(existing_set) ### if allow repeat molecule 43 | smiles_score_lst = oracle_screening(next_set, oracle_new) ### sorted smiles_score_lst 44 | print(smiles_score_lst[:5], "Oracle num", len(smiles2score)) 45 | 46 | # current_set = [i[0] for i in smiles_score_lst[:population_size]] # Option I: top-k 47 | current_set,_,_ = dpp(smiles_score_lst = smiles_score_lst, num_return = population_size, lamb = lamb) # Option II: DPP 48 | existing_set = existing_set.union(next_set) 49 | 50 | # save 51 | smiles2f_new = {smiles:score for smiles,score in smiles_score_lst} 52 | idx_2_smiles2f[i_gen] = smiles2f_new, current_set 53 | pickle.dump((idx_2_smiles2f, trace_dict), open(result_pkl, 'wb')) 54 | 55 | #### compute f-score 56 | score_lst = [smiles2f_new[smiles] for smiles in current_set] 57 | average_f = np.mean(score_lst) 58 | std_f = np.std(score_lst) 59 | f_lst.append((average_f, std_f)) 60 | str_f_lst = [str(i[0])[:5]+'\t'+str(i[1])[:5] for i in f_lst] 61 | with open("result/" + oracle_name + "_f_t.txt", 'w') as fout: 62 | fout.write('\n'.join(str_f_lst)) 63 | if len(smiles2score) > oracle_num: 64 | break 65 | 66 | def main(): 67 | parser = argparse.ArgumentParser() 68 | parser.add_argument('--oracle_num', type=int, default=1500) 69 | parser.add_argument('--oracle_name', type=str, default="qed", choices=['jnkgsk', 'qedsajnkgsk', 'qed', 'jnk', 'gsk']) 70 | parser.add_argument('--generations', type=int, default=50) 71 | parser.add_argument('--population_size', type=int, default=20) 72 | args = parser.parse_args() 73 | 74 | oracle_num = args.oracle_num 75 | oracle_name = args.oracle_name 76 | generations = args.generations 77 | population_size = args.population_size 78 | 79 | start_smiles_lst = ['C1(N)=NC=CC=N1'] ## 'C1=CC=CC=C1NC2=NC=CC=N2' 80 | qed = Oracle('qed') 81 | sa = Oracle('sa') 82 | jnk = Oracle('JNK3') 83 | gsk = Oracle('GSK3B') 84 | logp = Oracle('logp') 85 | mu = 2.230044 86 | sigma = 0.6526308 87 | def normalize_sa(smiles): 88 | sa_score = sa(smiles) 89 | mod_score = np.maximum(sa_score, mu) 90 | return np.exp(-0.5 * np.power((mod_score - mu) / sigma, 2.)) 91 | 92 | 93 | if oracle_name == 'jnkgsk': 94 | def oracle(smiles): 95 | return np.mean((jnk(smiles), gsk(smiles))) 96 | elif oracle_name == 'qedsajnkgsk': 97 | def oracle(smiles): 98 | return np.mean((qed(smiles), normalize_sa(smiles), jnk(smiles), gsk(smiles))) 99 | elif oracle_name == 'qed': 100 | def oracle(smiles): 101 | return qed(smiles) 102 | elif oracle_name == 'jnk': 103 | def oracle(smiles): 104 | return jnk(smiles) 105 | elif oracle_name == 'gsk': 106 | def oracle(smiles): 107 | return gsk(smiles) 108 | elif oracle_name == 'logp': 109 | def oracle(smiles): 110 | return logp(smiles) 111 | 112 | # device = 'cuda' if torch.cuda.is_available() else 'cpu' 113 | device = 'cpu' ## cpu is better 114 | model_ckpt = "save_model/GNN_epoch_0_validloss_1.61160.ckpt" 115 | gnn = torch.load(model_ckpt) 116 | gnn.switch_device(device) 117 | 118 | result_pkl = "result/" + oracle_name + ".pkl" 119 | optimization(start_smiles_lst, gnn, oracle, oracle_num, oracle_name, 120 | generations = generations, 121 | population_size = population_size, 122 | lamb=2, 123 | topk = 5, 124 | epsilon = 0.7, 125 | result_pkl = result_pkl) 126 | 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 💊 MIMOSA: Multi-constraint Molecule Sampling for Molecule Optimization 2 | 3 | [![License](https://img.shields.io/badge/License-BSD_2--Clause-orange.svg)](https://opensource.org/licenses/BSD-2-Clause) 4 | [![Python 3.7+](https://img.shields.io/badge/python-3.7+-blue.svg)](https://www.python.org/downloads/release/python-370/) 5 | [![GitHub Repo stars](https://img.shields.io/github/stars/futianfan/MIMOSA)](https://github.com/futianfan/MIMOSA/stargazers) 6 | [![GitHub Repo stars](https://img.shields.io/github/forks/futianfan/MIMOSA)](https://github.com/futianfan/MIMOSA/network/members) 7 | 8 | 9 | 10 | This repository hosts MIMOSA: Multi-constraint Molecule Sampling for Molecule Optimization (AAAI) 2021 (Tianfan Fu, Cao Xiao, Xinhao Li, Lucas Glass, Jimeng Sun), which used pretrained graph neural network (GNN) and MCMC for molecule optimization. 11 | 12 | ![pipeline](MIMOSA.png) 13 | 14 | 15 | ## Table Of Contents 16 | 17 | - [Installation](#installation) 18 | - [Data](#data) 19 | - [Pretraining](#pretrain) 20 | - [Run](#run) 21 | - [Contact](#contact) 22 | 23 | 24 | 25 | ## ⚙️ 1. Installation 26 | 27 | To install locally, we recommend to install from `pip` and `conda`. Please see `conda.yml` for the package dependency. 28 | ```bash 29 | conda create -n mimosa python=3.7 30 | conda activate mimosa 31 | pip install torch 32 | pip install PyTDC 33 | conda install -c rdkit rdkit 34 | ``` 35 | 36 | Activate conda environment. 37 | ```bash 38 | conda activate mimosa 39 | ``` 40 | 41 | make directory 42 | ```bash 43 | mkdir -p save_model result 44 | ``` 45 | 46 | 47 | ## 📊 2. Data 48 | In our setup, we restrict the number of oracle calls. In realistic discovery settings, the oracle acquisition cost is usually not negligible. 49 | 50 | ### Raw Data 51 | We use [`ZINC`](https://tdcommons.ai/generation_tasks/molgen/) database, which contains around 250K drug-like molecules and can be downloaded [`download ZINC`](https://tdcommons.ai/generation_tasks/molgen/). 52 | ```bash 53 | python src/download.py 54 | ``` 55 | - output 56 | - `data/zinc.tab`: all the smiles in ZINC, around 250K. 57 | 58 | ### Oracle 59 | Oracle is a property evaluator and is a function whose input is molecular structure, and output is the property. 60 | We consider following oracles: 61 | * `JNK3`: biological activity to JNK3, ranging from 0 to 1. 62 | * `GSK3B` biological activity to GSK3B, ranging from 0 to 1. 63 | * `QED`: Quantitative Estimate of Drug-likeness, ranging from 0 to 1. 64 | * `SA`: Synthetic Accessibility, we normalize SA to (0,1). 65 | * `LogP`: solubility and synthetic accessibility of a compound. It ranges from negative infinity to positive infinity. 66 | 67 | For all the property scores above, higher is more desirable. 68 | 69 | ### Optimization Task 70 | There are two kinds of optimization tasks: single-objective and multi-objective optimization. 71 | Multi-objective optimization contains `jnkgsk` (JNK3 + GSK3B), `qedsajnkgsk` (QED + SA + JNK3 + GSK3B). 72 | 73 | 74 | ### Generate Vocabulary 75 | In this project, the basic unit is `substructure`, which can be atoms or single rings. 76 | The vocabulary is the set of frequent `substructures`. 77 | ```bash 78 | python src/vocabulary.py 79 | ``` 80 | - input 81 | - `data/zinc.tab`: all the smiles in ZINC, around 250K. 82 | - output 83 | - `data/substructure.txt`: including all the substructures in ZINC. 84 | - `data/vocabulary.txt`: vocabulary, frequent substructures. 85 | 86 | ### data cleaning 87 | We remove the molecules that contains substructure that is not in vocabulary. 88 | 89 | ```bash 90 | python src/clean.py 91 | ``` 92 | 93 | - input 94 | - `data/vocabulary.txt`: vocabulary 95 | - `data/zinc.tab`: all the smiles in ZINC 96 | - output 97 | - `data/zinc_clean.txt` 98 | 99 | 100 | 101 | 102 | 103 | 104 | ## Pre-train graph neural network (GNN) 105 | ```bash 106 | python src/train.py 107 | ``` 108 | - input 109 | - `data/zinc_clean.txt` 110 | - output 111 | - `save_model/GNN.ckpt`: trained GNN model. 112 | - log 113 | - `gnn_loss.pkl`: the valid loss. 114 | 115 | 116 | ## 🤖 Run 117 | 118 | ### de novo molecule design 119 | 120 | ```bash 121 | python src/run.py 122 | ``` 123 | - input 124 | - `save_model/GNN.ckpt`: pretrained GNN model. 125 | - output 126 | - `result/{$prop}.pkl`: set of generated molecules. 127 | 128 | For example, 129 | ```bash 130 | python src/run.py 131 | ``` 132 | 133 | ### evaluate 134 | 135 | ```bash 136 | python src/evaluate.py $prop 137 | ``` 138 | - input 139 | - `result/{$prop}.pkl` 140 | - output 141 | - `diversity`, `novelty`, `average property` of top-100 molecules with highest property. 142 | 143 | For example, 144 | ```bash 145 | python src/evaluate.py jnkgsk 146 | ``` 147 | 148 | 149 | ## 📞 Contact 150 | Please contact futianfan@gmail.com for help or submit an issue. 151 | 152 | 153 | ## Cite Us 154 | If you found this package useful, please cite our paper: 155 | ``` 156 | @inproceedings{fu2021mimosa, 157 | title={MIMOSA: Multi-constraint Molecule Sampling for Molecule Optimization}, 158 | author={Fu, Tianfan and Xiao, Cao and Li, Xinhao and Glass, Lucas M and Sun, Jimeng}, 159 | booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, 160 | volume={35}, 161 | number={1}, 162 | pages={125--133}, 163 | year={2021} 164 | } 165 | ``` 166 | 167 | 168 | 169 | 170 | 171 | 172 | -------------------------------------------------------------------------------- /src/inference_utils.py: -------------------------------------------------------------------------------- 1 | 2 | ### 1. import 3 | import numpy as np 4 | from tqdm import tqdm 5 | from matplotlib import pyplot as plt 6 | import pickle 7 | from random import shuffle 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from tdc import Oracle 12 | torch.manual_seed(1) 13 | np.random.seed(2) 14 | import random 15 | random.seed(1) 16 | from chemutils import * 17 | ''' 18 | optimize_single_molecule_one_iterate 19 | gnn_prediction_of_single_smiles 20 | oracle_screening 21 | gnn_screening 22 | optimize_single_molecule_all_generations 23 | similarity_matrix(smiles_lst) 24 | ''' 25 | from dpp import DPPModel 26 | 27 | 28 | 29 | def gnn_prediction_of_single_smiles(smiles, gnn): 30 | if not is_valid(smiles): 31 | return 0 32 | return gnn.smiles2pred(smiles) 33 | # idx_lst, node_mat, substructure_lst, atomidx_2substridx, adjacency_matrix, leaf_extend_idx_pair = smiles2graph(smiles) 34 | # idx_vec = torch.LongTensor(idx_lst) 35 | # node_mat = torch.FloatTensor(node_mat) 36 | # adjacency_matrix = torch.FloatTensor(adjacency_matrix) 37 | # weight = torch.ones_like(idx_vec) 38 | # logits = gnn(node_mat = node_mat, adj = adjacency_matrix, weight = weight) 39 | # logits = logits.item() 40 | # print("gnn prediction", logits) 41 | # return logits 42 | 43 | 44 | def oracle_screening(smiles_set, oracle): 45 | smiles_score_lst = [] 46 | for smiles in smiles_set: 47 | score = oracle(smiles) 48 | smiles_score_lst.append((smiles, score)) 49 | smiles_score_lst.sort(key=lambda x:x[1], reverse=True) 50 | return smiles_score_lst 51 | 52 | def dpp(smiles_score_lst, num_return, lamb): 53 | smiles_lst = [i[0] for i in smiles_score_lst] 54 | if len(smiles_lst) <= num_return: 55 | return smiles_lst, None, None 56 | score_arr = np.array([i[1] for i in smiles_score_lst]) 57 | sim_mat = similarity_matrix(smiles_lst) 58 | dpp_model = DPPModel(smiles_lst = smiles_lst, sim_matrix = sim_mat, f_scores = score_arr, top_k = num_return, lamb = lamb) 59 | smiles_lst, log_det_V, log_det_S = dpp_model.dpp() 60 | return smiles_lst, log_det_V, log_det_S 61 | 62 | 63 | def gnn_screening(smiles_set, gnn): 64 | smiles_score_lst = [] 65 | for smiles in smiles_set: 66 | score = gnn_prediction_of_single_smiles(smiles, gnn) 67 | smiles_score_lst.append((smiles, score)) 68 | smiles_score_lst.sort(key=lambda x:x[1], reverse=True) 69 | return smiles_score_lst 70 | # smiles_lst = [i[0] for i in smiles_score_lst] 71 | # return smiles_lst 72 | 73 | def optimize_single_node(smiles): 74 | assert substr_num(smiles)==1 75 | vocabulary = load_vocabulary() 76 | atoms = ['N', 'C'] 77 | 78 | # bondtype_list = [rdkit.Chem.rdchem.BondType.SINGLE, rdkit.Chem.rdchem.BondType.DOUBLE] ### chemutils 79 | 80 | def optimize_single_molecule_one_iterate(smiles, gnn): 81 | target_ = torch.LongTensor([0]).view(-1) 82 | if smiles == None: 83 | return set() 84 | if not is_valid(smiles): 85 | return set() 86 | origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(smiles)) 87 | new_smiles_set = set() 88 | jj=-100 89 | 90 | origin_idx_lst, origin_node_mat, origin_substructure_lst, \ 91 | origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(smiles) 92 | 93 | feature_lst = smiles2expandfeature(smiles) 94 | for node_mat, adj_mat, mask_idx in feature_lst: 95 | node_mat = torch.FloatTensor(node_mat) 96 | adj_mat = torch.FloatTensor(adj_mat) 97 | N = adj_mat.shape[0] 98 | for jj in range(N): 99 | if adj_mat[jj,N-1]==1: 100 | break 101 | 102 | _, prediction = gnn.infer(node_mat, adj_mat, mask_idx, target_) 103 | top_idxs = prediction.reshape(-1).argsort().tolist()[::-1][:3] 104 | top_words = [vocabulary[ii] for ii in top_idxs] 105 | for substru_idx, word in zip(top_idxs, top_words): 106 | leaf_atom_idx_lst = origin_substructure_lst[jj] 107 | 108 | if type(leaf_atom_idx_lst)==int: ### int: single atom; else: list of integer 109 | leaf_atom_idx_lst = [leaf_atom_idx_lst] 110 | for leaf_atom_idx in leaf_atom_idx_lst: 111 | for new_bond in bondtype_list: 112 | if ith_substructure_is_atom(substru_idx): 113 | new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 114 | new_atom = word, new_bond = new_bond) 115 | new_smiles_set.add(new_smiles) 116 | else: 117 | new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 118 | fragment = word , new_bond = new_bond) 119 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 120 | 121 | new_smiles_set = set([new_smiles for new_smiles in new_smiles_set if new_smiles != None]) 122 | return new_smiles_set 123 | 124 | 125 | 126 | 127 | def optimize_single_molecule_all_generations(input_smiles, gnn, oracle, generations, population_size, lamb): 128 | smiles2f = dict() 129 | traceback_dict = dict() 130 | input_smiles = canonical(input_smiles) 131 | input_score = oracle(input_smiles) 132 | best_mol_score_list = [] 133 | existing_set = set([input_smiles]) 134 | current_mol_score_list = [(input_smiles, input_score)] 135 | for it in tqdm(range(generations)): 136 | new_smiles_set = set() 137 | #### optimize each single smiles 138 | for smiles,score in current_mol_score_list: 139 | # proposal_smiles_set = optimize_single_molecule_one_iterate(smiles, gnn) 140 | proposal_smiles_set = optimize_single_molecule_one_iterate_v2(smiles, gnn) 141 | proposal_smiles_set = proposal_smiles_set.difference(set([input_smiles])) 142 | for new_smiles in proposal_smiles_set: 143 | if new_smiles not in traceback_dict: 144 | traceback_dict[new_smiles] = smiles 145 | new_smiles_set = new_smiles_set.union(proposal_smiles_set) 146 | 147 | ### remove the repetition 148 | # new_smiles_set = new_smiles_set.difference(existing_set) 149 | 150 | ### add smiles into existing_set 151 | existing_set = existing_set.union(new_smiles_set) 152 | 153 | ### scoring new smiles 154 | ####### I:GNN & oracle scoring 155 | # gnn_smiles_lst = gnn_screening(new_smiles_set, gnn) 156 | # gnn_smiles_lst = gnn_smiles_lst[:population_size*3] 157 | # mol_score_list = oracle_screening(gnn_smiles_lst, oracle) 158 | ############ oracle call <= generations * population_size * 3 + 1 159 | 160 | ####### II: only oracle scoring 161 | mol_score_list = oracle_screening(new_smiles_set, oracle) 162 | ############ oracle call: unbounded, with better performance 163 | for smiles, score in mol_score_list: 164 | if score > 0.50: 165 | print('example', smiles, score) 166 | 167 | 168 | ### save results 169 | best_mol_score_list.extend(mol_score_list) 170 | 171 | 172 | ### only keep top-k 173 | # mol_score_list = mol_score_list[:population_size] 174 | ### dpp(smiles_score_lst, num_return, lamb) 175 | smiles_lst = dpp(mol_score_list, num_return = population_size, lamb = lamb) 176 | 177 | 178 | ### for next generation 179 | # current_mol_score_list = mol_score_list 180 | current_mol_score_list = [(smiles,0.0) for smiles in smiles_lst] 181 | 182 | ### endfor 183 | 184 | best_mol_score_list.sort(key=lambda x:x[1], reverse=True) 185 | return best_mol_score_list, input_score, traceback_dict 186 | 187 | 188 | 189 | def calculate_results(input_smiles, input_score, best_mol_score_list): 190 | if best_mol_score_list == []: 191 | with open(result_file, 'a') as fout: 192 | fout.write("fail to optimize" + input_smiles + '\n') 193 | return None 194 | output_scores = [i[1] for i in best_mol_score_list] 195 | smiles_lst = [i[0] for i in best_mol_score_list] 196 | with open(result_file, 'a') as fout: 197 | fout.write(str(input_score) + '\t' + str(output_scores[0]) + '\t' + str(np.mean(output_scores[:3])) 198 | + '\t' + input_smiles + '\t' + ' '.join(smiles_lst[:3]) + '\n') 199 | return input_score, output_scores[0] 200 | 201 | def inference_single_molecule(input_smiles, gnn, result_file, generations, population_size): 202 | best_mol_score_list, input_score, traceback_dict = optimize_single_molecule_all_generations(input_smiles, gnn, oracle, generations, population_size) 203 | return calculate_results(input_smiles, input_score, result_file, best_mol_score_list, oracle) 204 | 205 | 206 | 207 | 208 | def inference_molecule_set(input_smiles_lst, gnn, result_file, generations, population_size): 209 | score_lst = [] 210 | for input_smiles in tqdm(input_smiles_lst): 211 | if not is_valid(input_smiles): 212 | continue 213 | result = inference_single_molecule(input_smiles, gnn, result_file, generations, population_size) 214 | if result is None: 215 | score_lst.append(None) 216 | else: 217 | input_score, output_score = result 218 | score_lst.append((input_score, output_score)) 219 | return score_lst 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | -------------------------------------------------------------------------------- /data/substructure.txt: -------------------------------------------------------------------------------- 1 | C 1158545 2 | O 500212 3 | N 280451 4 | C1=CC=CC=C1 257945 5 | F 79430 6 | S 51103 7 | Cl 42872 8 | C1=CC=NC=C1 27852 9 | C1CCCCC1 20256 10 | C1=CNN=C1 18920 11 | C1=CSC=C1 17515 12 | C1CCNCC1 15912 13 | C1CC1 15462 14 | C1CCCC1 14328 15 | Br 12722 16 | C1=CSC=N1 12617 17 | C1COCCN1 11924 18 | C1CNCCN1 11701 19 | C1=COC=C1 11274 20 | C1CCCN1 9739 21 | C1=CN=CN=C1 7964 22 | C1CC[NH+]CC1 7948 23 | C1CCNC1 7634 24 | C1CCCNC1 7277 25 | C1=CCCC=C1 6243 26 | C1=NN=CN1 5748 27 | C1CNCC1 5513 28 | C1CCOC1 5310 29 | C1=CNC=N1 5201 30 | C1=NOC=N1 5141 31 | C1=CON=C1 4917 32 | C1C[NH+]CCN1 4893 33 | C1CC[NH+]C1 4027 34 | C1=CCCCC1 3985 35 | C1=NNC=N1 3958 36 | C1COCC1 3829 37 | C1=CCNC=C1 3752 38 | C1=C[NH]N=C1 3575 39 | C1=CNC=C1 3521 40 | C1CCC1 2924 41 | C1CCOCC1 2906 42 | C1CCC[NH+]C1 2784 43 | C1CNC=N1 2772 44 | C1CCCCN1 2771 45 | C1CSC=N1 2756 46 | C1COCO1 2643 47 | C1CSCC1 2614 48 | C1=CN=CNC1 2566 49 | C1CNCN1 2548 50 | C1CNCCC1 2540 51 | C1CSCN1 2360 52 | C1=C[NH]CC1 2336 53 | C1=CCCC1 2217 54 | C1COCCC1 2178 55 | C1COCCO1 2140 56 | C1=NN=NN1 2117 57 | C1=NN=CS1 2079 58 | C1=NN=CO1 1994 59 | C1=CCNCC1 1991 60 | C1=COC=N1 1936 61 | C1=CC=[NH+]C=C1 1876 62 | C1=C[NH+]=CN1 1716 63 | C1=CN=CC=N1 1706 64 | C1=CNCN=C1 1630 65 | C1=N[NH]C=N1 1598 66 | C1C[NH+]CC1 1477 67 | C1=CNN=N1 1444 68 | C1=CCOCC1 1434 69 | C1=COCC1 1406 70 | C1=C[NH]C=C1 1389 71 | C1CSCCN1 1385 72 | C1=CC=NCC1 1375 73 | C1C=CCCC1 1364 74 | C1=CCNN=C1 1329 75 | C1=CNCC1 1317 76 | C1=CN=C[NH]C1 1268 77 | C1=CNCNC1 1265 78 | C1C[NH]C=N1 1230 79 | C1CN=CNC1 1145 80 | C1CCCCCC1 1145 81 | C1COC=N1 1139 82 | C1CCCNCC1 1102 83 | C1=CSCN1 993 84 | C1C=CCN1 955 85 | C1CC[NH2+]CC1 940 86 | C1=CN=NC=C1 917 87 | I 888 88 | C1CNCC[NH+]C1 834 89 | C1=CC=[NH+]C1 816 90 | C1=C[NH]C=N1 766 91 | C1C=NNC1 759 92 | C1=CSCC1 746 93 | C1=NNCN1 735 94 | C1CCCCCN1 717 95 | C1=CC[NH]C=C1 694 96 | C1CCCO1 692 97 | C1CNCNC1 672 98 | C1C[NH+]CCC1 654 99 | C1CCSCC1 650 100 | C1C=C[NH]C1 625 101 | C1NCCN1 589 102 | C1CNCCNC1 575 103 | C1CCNCN1 566 104 | C1=NNCC1 563 105 | C1CNC1 538 106 | C1=CN=NC1 536 107 | C1NCCS1 523 108 | C1C=CSC1 505 109 | C1=CC[NH]CC1 500 110 | C1NCCCN1 494 111 | C1CNCS1 492 112 | C1CCC=CN1 489 113 | C1C[NH2+]CCN1 488 114 | C1C=N[NH]C1 486 115 | C1CC[NH2+]C1 482 116 | C1COCN1 481 117 | C1CCCS1 470 118 | C1CCNCCN1 459 119 | C1=NNCS1 459 120 | C1=C[NH+]=C[NH]1 444 121 | C1=CC=[N+]C=C1 443 122 | C1CCNN=C1 430 123 | C1CCC[NH+]CC1 430 124 | C1=N[NH]CN1 428 125 | C1=COCO1 426 126 | C1C[NH2+]CC1 415 127 | C1CCC[NH+]1 398 128 | C1=CC[NH+]CC1 397 129 | C1CCC[NH2+]C1 388 130 | C1NC=CCN1 388 131 | C1C=COCC1 386 132 | C1=CNCCC1 381 133 | C1=COCCO1 379 134 | C1COCCCO1 378 135 | C1C=NC=NC1 377 136 | C1NCCO1 370 137 | C1CCCSC1 356 138 | C1C=COC1 354 139 | C1CCCOC1 350 140 | C1=CNC[NH]C1 349 141 | C1CN=CC=N1 326 142 | C1=NNN=N1 321 143 | C1CN=C[NH]C1 320 144 | C1CCSC1 320 145 | C1=CNC=CC1 318 146 | C1=CSN=N1 311 147 | C1=COC=CC1 310 148 | C1CNN=N1 309 149 | C1=NSC=N1 308 150 | C1=[NH+]CCN1 307 151 | C1=COCCC1 296 152 | C1C=CNCC1 286 153 | C1C=CC=CC1 278 154 | C1=CCOC=C1 272 155 | C1=NON=C1 262 156 | C1CC=NC=N1 251 157 | C1C=CN=CC1 250 158 | C1=NC=CNC1 249 159 | C1=CC=CCC1 249 160 | C1=CNNC1 244 161 | C1=CSN=C1 242 162 | C1CNCN=C1 230 163 | C1CNC[NH]1 223 164 | C1=NC=NC=N1 216 165 | C1=NOCC1 215 166 | C1C=NOC1 213 167 | C1=CCN=C1 208 168 | C1COCC[NH+]1 203 169 | C1C=CNC1 202 170 | C1=CCCCCC1 198 171 | C1=[NH+]C[NH+]=C1 197 172 | C1COC1 195 173 | C1=NN=CC1 193 174 | C1C[NH2+]CCC1 192 175 | C1CCCCO1 190 176 | C1CSCCC1 190 177 | C1CCCCCCC1 189 178 | C1=NC=NN=C1 188 179 | C1CCOC=C1 184 180 | C1C[NH]C[NH]1 181 181 | C1=NSN=C1 181 182 | C1CSCC[NH+]1 181 183 | C1CCCCNC1 181 184 | C1CN=CN=C1 178 185 | C1=NCCNC1 176 186 | C1CNCCCN1 176 187 | C1CC[NH+]CCN1 174 188 | C1=C[NH]CNC1 172 189 | C1=NCCN=C1 171 190 | C1=CN=C[NH+]=C1 171 191 | C1CCSN1 169 192 | C1C=CCS1 168 193 | C1COCCNC1 164 194 | C1=NNN=C1 162 195 | C1C=CCC1 154 196 | C1C=C[NH]CC1 153 197 | C1C[NH]CNC1 152 198 | C1=C[NH+]=CN=C1 151 199 | C1COCC[NH2+]1 146 200 | C1CCC=NN1 145 201 | C1C=CNC=C1 142 202 | C1=NN=N[N-]1 141 203 | C1CSCCS1 140 204 | C1=CNCN1 139 205 | C1=NCCS1 139 206 | C1COCOC1 138 207 | C1=[N+]CCC1 137 208 | C1C[NH+]C1 137 209 | C1=CNCC=N1 136 210 | C1=CCOC1 135 211 | C1C=COC=C1 135 212 | C1CCOCCN1 135 213 | C1COC=C1 131 214 | C1=CN=CC1 131 215 | C1CO1 130 216 | C1CNN=C1 130 217 | C1=NNCO1 130 218 | C1CNSC1 129 219 | C1=NCNCC1 129 220 | C1=CCSCC1 123 221 | C1=CC=[NH+]CC1 123 222 | C1CSCCO1 121 223 | C1=C[NH]C[NH]C1 117 224 | C1CNCCSC1 116 225 | C1CCN1 116 226 | C1CCCC[NH+]C1 109 227 | C1C[NH2+]C1 109 228 | C1CC=CN1 108 229 | C1=NSCC1 108 230 | C1CNCCO1 108 231 | C1NN=CS1 106 232 | C1=CC=NN=C1 105 233 | C1CN=NNC1 104 234 | C1=[NH+]CCS1 102 235 | C1CC=CC=[NH+]1 102 236 | C1CCC[NH2+]CC1 100 237 | C1=NC=CCC1 100 238 | P 99 239 | C1=N[NH]C[NH]1 99 240 | C1COCCCC1 98 241 | C1C[NH+]=CN1 97 242 | C1=CSC[NH]1 97 243 | C1C=C[NH]C=C1 96 244 | C1CC[NH]N=C1 94 245 | C1=NCCN1 92 246 | C1CCNCCC1 92 247 | C1COCNC1 92 248 | C1=CNC=NC1 92 249 | C1=CN=CCC1 91 250 | C1CCC[NH2+]1 91 251 | C1=COCOC1 90 252 | C1CCCNCCC1 89 253 | C1=CC[NH2+]CC1 89 254 | C1=COCCCO1 88 255 | C1CNNC1 87 256 | C1C=CCO1 87 257 | C1=C[NH]C=NC1 87 258 | C1OCCO1 85 259 | C1=C[NH]CN1 82 260 | C1CCSCCN1 82 261 | C1CCSCN1 82 262 | C1COC=CN1 82 263 | C1=NC=NC1 80 264 | C1C=CNCN1 78 265 | C1=N[N-]C=N1 76 266 | C1NCCCO1 76 267 | C1=CN=[NH+]C=C1 75 268 | C1CNN=CN1 74 269 | C1COCCCN1 74 270 | C1=C[NH]NC1 73 271 | C1=NOCN1 73 272 | C1C=CCCN1 72 273 | C1CNC=C1 72 274 | C1C=CNN=C1 71 275 | C1=NSCCN1 71 276 | C1C=NCNC1 71 277 | C1=C[NH+]=CNC1 70 278 | C1C=CCC=C1 70 279 | C1=CNCN=N1 70 280 | C1CSCCCN1 70 281 | C1=CNCCNC1 68 282 | C1NCCCS1 68 283 | C1=CSC=[N+]1 68 284 | C1CC=NCN1 67 285 | C1CNCC[NH2+]C1 67 286 | C1=CC[NH]N=C1 67 287 | C1=CN[NH]C1 65 288 | C1=CNCCN1 64 289 | C1NCNN1 64 290 | C1N=CCN1 63 291 | C1CC=NN=C1 62 292 | C1C[NH+]CCCN1 62 293 | C1=NNCNC1 61 294 | C1CCNC=N1 59 295 | C1NNCCS1 59 296 | C1COC[NH]1 57 297 | C1CNCC=C1 57 298 | C1=NN=CNC1 55 299 | C1=CN[NH+]=C1 55 300 | C1CNCCCC1 55 301 | C1CN=CN1 55 302 | C1=C[NH]CN=C1 53 303 | C1C=CC=CN1 52 304 | C1=CSCCC1 52 305 | C1COCC[NH+]C1 51 306 | C1C[NH+]=C[NH]1 49 307 | C1CC[NH]C1 48 308 | C1NCN=[NH+]1 48 309 | C1=CCNC1 48 310 | C1CCOCO1 47 311 | C1=COCNC1 45 312 | C1CNCO1 45 313 | C1=CCCCCCC1 44 314 | C1CC=CC=N1 44 315 | C1CNC=CN1 43 316 | C1=CON=[N+]1 43 317 | C1CSCS1 43 318 | C1NN=NN1 42 319 | C1CCNN1 42 320 | C1=NCNC1 42 321 | C1CCOCNC1 41 322 | C1CC[N+]CC1 40 323 | C1N=NCN1 40 324 | C1C=NCS1 39 325 | C1CC=NN1 39 326 | C1N=NCS1 39 327 | C1=CCC1 39 328 | C1C=NCCC1 39 329 | C1CCC[NH+]CCC1 39 330 | C1=CCCNC1 39 331 | C1=N[NH]N=C1 38 332 | C1CCNC=C1 37 333 | C1C[N+]=CN1 36 334 | C1NCNCN1 36 335 | C1CCSNC1 36 336 | C1=C[NH]C[NH]1 36 337 | C1=NCN=C1 36 338 | C1C[N+]CCN1 36 339 | C1C=NCN1 35 340 | C1CCC=CO1 35 341 | C1CC=CCN1 34 342 | C1=CN=NNC1 34 343 | C1=[N+]CCN1 34 344 | C1=CONC1 34 345 | C1C=NSC1 32 346 | C1C[N-]C=N1 32 347 | C1CNCCOC1 32 348 | C1=COCCCC1 32 349 | C1=CN=[NH+]C1 31 350 | C1=NN=C[NH]1 31 351 | C1CSNCN1 31 352 | C1C[NH+]CCNC1 31 353 | C1N=CNN1 30 354 | C1=NC=CCSC1 30 355 | C1C=NC=CC1 30 356 | C1=NCSC1 29 357 | C1CSC[NH]1 29 358 | C1CCOCN1 28 359 | C1C[NH+]CCSC1 28 360 | C1=NCNCN1 28 361 | C1=NNCSC1 28 362 | C1C=CN=N1 28 363 | C1CNCOC1 27 364 | C1CSCC[NH2+]1 27 365 | C1CCCC=N1 27 366 | C1=NCCC1 27 367 | C1=C[NH+]=CCN1 26 368 | C1=NNC[NH]1 26 369 | C1CNCN=N1 26 370 | C1CC[NH+]CCC1 26 371 | C1=CNC[NH]1 26 372 | C1=NCCCN1 26 373 | C1CCCCCCN1 26 374 | C1CCC=N1 26 375 | C1=C[N-]C=N1 26 376 | C1=C[N+]=CN1 25 377 | C1CCN=CN1 25 378 | C1CSCC=CN1 25 379 | C1C[NH]N=N1 24 380 | C1CNOC1 24 381 | C1=CSCCO1 24 382 | C1C=CSN1 23 383 | C1=CCSC1 23 384 | C1C[NH2+]CS1 23 385 | C1=CCCNCC1 23 386 | C1CNCC[NH]1 22 387 | C1CSC=[N+]1 22 388 | C1=CC=[N+]CC1 22 389 | C1NNCS1 22 390 | C1CC=COC1 22 391 | C1CCON1 21 392 | C1CCCC=C1 21 393 | C1NCNCS1 21 394 | C1CN1 21 395 | C1=NCNC=N1 21 396 | C1C[NH]CC[NH]1 21 397 | C1C=NC=[NH+]C1 21 398 | C1NC=CS1 21 399 | C1=[NH+]CCCN1 21 400 | C1CNC=NN1 21 401 | C1=NCC[NH]C1 20 402 | C1C[NH]CN1 20 403 | C1=NCNN=C1 20 404 | C1CCCNN1 20 405 | C1C=CCCCN1 20 406 | C1=[NH+]CNCN1 20 407 | C1C=NC=N1 19 408 | C1CSCSC1 19 409 | C1=CNCSC1 19 410 | C1C[NH]NC1 19 411 | C1=[NH+]CCC1 19 412 | C1CN=COC1 19 413 | C1CC[NH]C=C1 19 414 | C1CNC[NH]C1 19 415 | C1CNSNC1 18 416 | C1=CCC=CC1 18 417 | C1CCC=CC1 18 418 | C1=C[N+]=CN=C1 18 419 | C1C=NNC=N1 18 420 | C1CSN1 17 421 | C1CC=CC=C1 17 422 | C1=[NH+]CCCCC1 17 423 | C1=NCNC=C1 17 424 | C1COC[NH+]C1 16 425 | C1CSCO1 16 426 | C1=CSCCN1 16 427 | C1C[NH+]=CC=[NH+]1 16 428 | C1C[NH2+]CCCN1 15 429 | C1NCCSN1 15 430 | C1CC=[NH+]N=C1 15 431 | C1=NN=CSC1 15 432 | C1=[NH+]CCCC1 15 433 | C1CC[N+]C1 15 434 | C1=CN=NC=N1 14 435 | C1CCN[NH]C1 14 436 | C1=NC[NH2+]CC1 14 437 | C1C=CNCCN1 14 438 | C1N=CCS1 14 439 | C1CN[NH]C1 14 440 | C1=COCCNC1 13 441 | C1=NN=C[NH]C1 13 442 | C1C[NH+]=CC=C1 13 443 | C1[N+]=CCS1 13 444 | C1CCC[N+]C1 13 445 | C1CSN=N1 13 446 | C1=C[NH]CC=N1 13 447 | C1=NC[NH]N=C1 13 448 | C1CSNCC1 12 449 | C1=C[N-]C=C1 12 450 | C1=CNSN=C1 12 451 | C1=CCC[NH2+]CC1 12 452 | C1CCCCOC1 12 453 | C1CSCCCC1 11 454 | C1COCC[N+]1 11 455 | C1=CC=COC1 11 456 | C1C[NH+]CCOC1 11 457 | C1=N[NH]C[NH]C1 11 458 | C1=CNOC1 11 459 | C1COCCC[NH2+]1 11 460 | C1COC=CC1 11 461 | C1CC=CCC1 11 462 | C1=NCCNCC1 11 463 | C1C=NCC1 11 464 | C1C=N[NH]C=N1 11 465 | C1C=C[NH+]=CC1 11 466 | C1=C[NH]N=N1 11 467 | C1C=CCCO1 11 468 | C1C[NH2+]CCNC1 11 469 | C1CCSC=N1 11 470 | C1CON=C1 10 471 | C1=CCSC=C1 10 472 | C1C=CC=N1 10 473 | C1CC=CN=N1 10 474 | C1NN=CCS1 10 475 | C1=N[NH+]=CSC1 10 476 | C1=COCN1 10 477 | C1CCN=CCN1 10 478 | C1=NCCO1 10 479 | C1=NCCCC1 10 480 | C1CN=NC1 10 481 | C1CN=NCN1 10 482 | C1=COC[NH]1 10 483 | C1CNC[NH+]C1 10 484 | C1=NCNN1 10 485 | C1CN=C[NH+]=N1 10 486 | C1=NC=C[NH]C1 10 487 | C1CCNNC1 10 488 | C1C[NH+]CC[NH2+]1 9 489 | C1C=CSCC1 9 490 | C1CCC=CCN1 9 491 | C1N=CC=CN1 9 492 | C1=NCCSC1 9 493 | C1CN=CCCN1 9 494 | C1NC=NS1 9 495 | C1CC[NH2+]CCC1 9 496 | C1=[N+]CCCS1 9 497 | C1=C[NH+]CCC1 9 498 | C1CN=C[N+]C1 9 499 | C1=NCCCS1 9 500 | C1=NCCOC1 9 501 | C1=CC[NH2+]C1 9 502 | C1=CC=CNC1 9 503 | C1=CNSCC1 9 504 | C1CCONC1 9 505 | C1=CSCNC1 9 506 | C1=CC=[O+]C=C1 9 507 | C1=[N+]CCCCC1 9 508 | C1N=[NH+]CS1 9 509 | C1COPOC1 9 510 | C1=CC[N+]CC1 9 511 | C1=[N+]CCCN1 9 512 | C1CC=NC=C1 9 513 | C1C[NH+]CN1 8 514 | C1=COCCN1 8 515 | C1=NC=NCN1 8 516 | C1CSNC=C1 8 517 | C1=CC=CCC=C1 8 518 | C1CSC=CN1 8 519 | C1=NCN=N1 8 520 | C1=CN=NCC1 8 521 | C1C=CON1 8 522 | C1CCCCCCCCCCC1 8 523 | C1CONC1 8 524 | C1CN=NN1 8 525 | C1CCCCS1 8 526 | C1=CCC=C1 8 527 | C1C=CN=CN1 8 528 | C1=COCC[NH+]C1 8 529 | C1CCOCCCN1 8 530 | C1CNCCS1 8 531 | C1CCNO1 8 532 | C1CC[NH+]CN1 8 533 | C1CNCC=N1 7 534 | C1CC=CO1 7 535 | C1=CSNCC1 7 536 | C1=C[NH]CN=N1 7 537 | C1=[N+]CC[N+]=C1 7 538 | C1CCC=C1 7 539 | C1NCON1 7 540 | C1=C[NH][NH+]=C1 7 541 | C1C[N+]CC[N+]1 7 542 | C1C=C[N+]=CC1 7 543 | C1=CCNNC1 7 544 | C1N=NN=N1 7 545 | C1SCCS1 7 546 | C1N=NNN1 7 547 | C1=NO[N+]=C1 7 548 | C1[NH+]=CC=[NH+]1 7 549 | C1CCCSCC1 7 550 | C1=[N+]CNC1 7 551 | C1=COC[NH+]C1 7 552 | C1C[N+]CCC1 7 553 | C1=CNC[NH+]C1 6 554 | C1=CSNC1 6 555 | C1C[NH+]CCO1 6 556 | C1CCNC=CN1 6 557 | C1=[NH+]CON1 6 558 | C1CCCNCN1 6 559 | C1NC[N+]CN1 6 560 | C1CNCC[NH2+]1 6 561 | C1C=NNCN1 6 562 | C1=CCN[NH]C1 6 563 | C1C=CC=C1 6 564 | C1=[N+]CCCC1 6 565 | C1=NCCOCC1 6 566 | C1C[NH2+]CCO1 6 567 | C1CCCC[NH2+]C1 6 568 | C1NC=NN1 6 569 | C1=NC[NH]C=N1 6 570 | C1CSSC1 6 571 | C1=NC=CCS1 6 572 | C1=CN=COC1 6 573 | C1CC[NH2+]CCN1 6 574 | C1=C[N+]CCN1 6 575 | C1=CCC[NH+]CC1 6 576 | C1=[NH+]CCCS1 6 577 | C1CCCNC=N1 6 578 | C1C[NH+]=CC=N1 6 579 | C1CC=CNC1 5 580 | C1=CCCOC=C1 5 581 | C1=[NH+]CNN1 5 582 | C1=C[N+]=CC=[N+]1 5 583 | C1C[N+]CC1 5 584 | C1=CC[NH+]C1 5 585 | C1N=NCO1 5 586 | C1=C[NH+]=NC1 5 587 | C1=C[NH][NH]C1 5 588 | C1CCCCNCC1 5 589 | C1CN=NC=N1 5 590 | C1=NNCCC1 5 591 | C1C[NH+]CCCC1 5 592 | C1CC[NH+]1 5 593 | C1C=NC=C1 5 594 | C1=C[NH+]=NN1 5 595 | C1=CCOCCC1 5 596 | C1CNC=CC1 5 597 | C1C=[N+]CCN1 5 598 | C1CC[NH]CC1 5 599 | C1C[NH]C[NH]C1 5 600 | C1C=CN=C1 5 601 | C1CC=CC1 5 602 | C1CC=[NH+]C=N1 5 603 | C1=NC=[NH+]CN1 5 604 | C1CNCC[NH+]1 5 605 | C1C=CSCN1 5 606 | C1=N[N-]N=C1 5 607 | C1N=CCCN1 5 608 | C1CCN=CC1 5 609 | C1=N[NH]C=[NH+]1 5 610 | C1=CSC=CC1 5 611 | C1=CNNCC1 4 612 | C1=CNCCCN1 4 613 | C1=NCOC1 4 614 | C1CCC=CCC1 4 615 | C1C[NH]N=CN1 4 616 | C1NNC[NH+]1 4 617 | C1C[NH+]1 4 618 | C1=CNCCO1 4 619 | C1CSCCSC1 4 620 | C1C[NH2+]CCS1 4 621 | C1=CCCSC1 4 622 | C1CSCC[NH2+]C1 4 623 | C1=NNSCC1 4 624 | C1=CNCCN=C1 4 625 | C1=CN=C[N+]=C1 4 626 | C1C=NCN=C1 4 627 | C1=NCC[N+]1 4 628 | C1C=CNC=[N+]1 4 629 | C1C=CCNCC1 4 630 | C1C[NH]CC=N1 4 631 | C1CCCCCCCCC1 4 632 | C1CCC[NH+]=N1 4 633 | C1NNCO1 4 634 | C1[N-]C=NN1 4 635 | C1C[NH2+]CCSC1 4 636 | C1=NC=CCN1 4 637 | C1=[NH+]CN=N1 4 638 | C1=CSCO1 4 639 | C1CCCOCC1 4 640 | C1CCOCCC1 4 641 | C1=NCCSCC1 4 642 | C1NCCNN1 4 643 | C1=CC[NH+]CCC1 4 644 | C1=CCC[NH+]C1 4 645 | C1CN=CC=CN1 4 646 | C1CC=CCCN1 4 647 | C1CN=CN=N1 4 648 | C1NCC=CO1 4 649 | C1=CC=[NH+]N=C1 4 650 | C1=C[NH+]=C[NH]C1 4 651 | C1=NCCCCC1 4 652 | C1CNCCC[NH2+]C1 4 653 | C1=NCCSN1 4 654 | C1=C[N-]CC1 4 655 | C1=NN[N+]=C1 4 656 | C1=NCCC=NN1 4 657 | C1COCC[NH2+]C1 4 658 | C1CNSCC1 4 659 | C1C[NH+]CO1 3 660 | C1=N[NH]CNC1 3 661 | C1COCCOCCOCCOCCO1 3 662 | C1CCCSN1 3 663 | C1=NN=NC1 3 664 | C1CN=CCC1 3 665 | C1CSC1 3 666 | C1CNS[NH+]C1 3 667 | C1NC[NH+]CN1 3 668 | C1=NOC[N-]1 3 669 | C1=NNCCS1 3 670 | C1CNN=C[NH]1 3 671 | C1CN=[NH+]C=N1 3 672 | C1C=CC1 3 673 | C1=COCCC[NH+]C1 3 674 | C1C=CCNN1 3 675 | C1=[NH+]CCCCN1 3 676 | C1=[NH+]CCO1 3 677 | C1C[NH+]=CSC1 3 678 | C1=NC=NNC1 3 679 | C1CN=C[N+]=C1 3 680 | C1=CCSN1 3 681 | C1=CSC=[NH+]1 3 682 | C1=CCCCOC1 3 683 | C1=CN=C[N-]C1 3 684 | C1C=CC[NH+]C1 3 685 | C1=NSCC[N-]1 3 686 | C1=NNC[NH]C1 3 687 | C1CC=[N+]CC1 3 688 | C1C=CNN1 3 689 | C1=CO[NH]C1 3 690 | C1=NC=NCC1 3 691 | C1C=NC[NH]C1 3 692 | C1=NC[NH+]CN1 3 693 | C1CC=NCC1 3 694 | C1COC[NH2+]1 3 695 | C1C[NH+]CC[NH+]1 3 696 | C1COCCOCCOCCN1 3 697 | C1CS1 3 698 | C1=COCCCNC1 3 699 | C1=NN=CN=N1 3 700 | C1C=CNNC1 3 701 | C1=NC[NH+]C1 3 702 | C1=CCNCCC1 3 703 | C1C=CCNC1 3 704 | C1CN=C[NH+]=C1 3 705 | C1N=C[NH]CN1 3 706 | C1NNCNN1 3 707 | C1C=NCCN1 3 708 | C1C=CC=[NH+]1 3 709 | C1CS[N-]CN1 3 710 | C1=NCN=CO1 3 711 | C1=[NH+]CCC=C1 3 712 | C1=CNC=[N+]C1 3 713 | C1CNSN1 3 714 | C1=NNNN1 3 715 | C1CCNCO1 3 716 | C1CCCC=CN1 3 717 | C1CSCNN1 3 718 | C1=CNCCCC1 3 719 | C1COC[NH+]1 3 720 | C1=NNCNN1 3 721 | C1=NSC=CN1 2 722 | C1=CNCC[NH2+]C1 2 723 | C1C[NH2+]CN1 2 724 | C1CCSS1 2 725 | C1C=CCCCC1 2 726 | C1=[N+]CCNC1 2 727 | C1CCCC[NH+]1 2 728 | C1=N[NH]CS1 2 729 | C1CNCC=CN1 2 730 | C1N=CNCN1 2 731 | C1=NSSC1 2 732 | C1CNNCN1 2 733 | C1=CCCOC1 2 734 | C1CCCCCCCC1 2 735 | C1COCCOCCOCCO1 2 736 | C1CCCCC=C1 2 737 | C1CCCNCCCN1 2 738 | C1=NCCOC=C1 2 739 | C1=CSSC1 2 740 | C1CCOC=NN1 2 741 | C1CCS1 2 742 | C1C=CCOC1 2 743 | C1=CNCCSC1 2 744 | C1=NC=[NH+]C=N1 2 745 | C1NN1 2 746 | C1=NCCC=[NH+]1 2 747 | C1CCC=[NH+]1 2 748 | C1C[N+]=CC=N1 2 749 | C1NCSS1 2 750 | C1=CN=CSC1 2 751 | C1SCSCS1 2 752 | C1C=NNP1 2 753 | C1C=COCO1 2 754 | C1=CNNN1 2 755 | C1CN=CCSC1 2 756 | C1=CNCCC=C1 2 757 | C1CC[NH][NH]C1 2 758 | C1C=NN=C1 2 759 | C1=C/CCCCCC/1 2 760 | C1CC[NH+]CSC1 2 761 | C1CNNC=N1 2 762 | C1COCC[NH+]CCOCCOCC[NH+]CCO1 2 763 | C1NC=CC=NN1 2 764 | C1CCCC[NH+]CC1 2 765 | C1CC=[N+]C=C1 2 766 | C1=[NH+]CCSC[CH-]1 2 767 | C1C[NH]COC1 2 768 | C1=N[N-]N=N1 2 769 | C1CC[N-]C=N1 2 770 | C1N=C[NH+]=C[NH]1 2 771 | C1CCN=N1 2 772 | C1=CNC=[NH+]C1 2 773 | C1NC=[NH+]CN1 2 774 | C1NCCC=[NH+]1 2 775 | C1=CN=N[NH]C1 2 776 | C1N=N1 2 777 | C1N=CS1 2 778 | C1=CN=C[N+]C1 2 779 | C1N[NH]CS1 2 780 | C1COCCC[NH+]1 2 781 | C1=[N+]CCS1 2 782 | C1=NCCCSC1 2 783 | C1C=NCO1 2 784 | C1=CS[N+]=C1 2 785 | C1=NCCC=C1 2 786 | C1=CSOC1 2 787 | C1=CC=[NH+]CN1 2 788 | C1NCC[NH+]1 2 789 | C1COCON1 2 790 | C1=NN[N-]N1 2 791 | C1CSCCNC1 2 792 | C1CNSCCN1 2 793 | C1=CN=NN=C1 2 794 | C1=CCC[NH2+]C1 2 795 | C1C=CC=CC=C1 2 796 | C1COCS1 2 797 | C1COCSN1 2 798 | C1C[NH+]CS1 2 799 | C1CCCC=[NH+]1 2 800 | C1CSC=CCN1 2 801 | C1=NC=C[N+]=C1 2 802 | C1=CNPN=C1 2 803 | C1CN[NH2+]N1 2 804 | C1=[N+]NCC1 2 805 | C1CCN[N+]CC1 2 806 | C1CSCOC1 2 807 | C1C[NH2+]CSC1 2 808 | C1COPO1 2 809 | C1=[NH+]CCNC1 2 810 | C1CNNCC1 2 811 | C1N=CN=CN1 2 812 | C1CCOCOC1 2 813 | C1CCCCCO1 2 814 | C1N=[NH+]CO1 2 815 | C1NCC[N+]1 2 816 | C1=CC=CC1 2 817 | C1C=CCN=C1 2 818 | C1=NOCCN1 2 819 | C1=CC[N+]C=C1 2 820 | C1C=NCCN=C1 2 821 | C1=CC1 2 822 | C1CCOCC[NH2+]C1 2 823 | C1CC[N+]=[N+]CC1 2 824 | C1CSC=C1 2 825 | C1COCPCO1 2 826 | C1NOSN1 2 827 | C1C[NH+]C[NH+]1 1 828 | C1=CSNCN1 1 829 | C1=NCCC=[N+]1 1 830 | C1C=NC[NH+]C1 1 831 | C1C=COCCO1 1 832 | C1CNC[NH2+]C1 1 833 | C1CSCNCCC1 1 834 | C1CC=CCCO1 1 835 | C1C[NH+]COC1 1 836 | C1[NH+]CCO1 1 837 | C1=COC=CN1 1 838 | C1NCCCCCN1 1 839 | C1C=CSNC1 1 840 | C1C=C[NH+]C=C1 1 841 | C1=NCN=C[N+]1 1 842 | C1CSCCOC1 1 843 | C1=C[N+]=CNC1 1 844 | C1=CCC=CCC1 1 845 | C1=CC=NCN1 1 846 | C1=CC=CCCC1 1 847 | C1=CCCCC/C=C/1 1 848 | C1=[NH+]NCN1 1 849 | C1CC=CS1 1 850 | C1=CC=COC=C1 1 851 | C1=NNCCP1 1 852 | C1=NC=CCNC1 1 853 | C1=NC=CSC1 1 854 | C1COCCOCCOCCOCCOCCOCCOCCO1 1 855 | C1NCNO1 1 856 | C1C[N+]CN1 1 857 | C1CCCN[P+]N1 1 858 | C1CCC/C=C/C=CCOCCCC1 1 859 | C1N=C[N-]N1 1 860 | C1C=CNCN=C1 1 861 | C1CC[S+]C1 1 862 | C1=CSC=[N+]C1 1 863 | C1=CC=[NH+]C=NN1 1 864 | C1C=[NH+]C=N1 1 865 | C1NCN1 1 866 | C1OCCCO1 1 867 | C1C[NH+]CCC[NH2+]C1 1 868 | C1SC=CS1 1 869 | C1C=CNO1 1 870 | C1C=CC=NN1 1 871 | C1CC=CCC=C1 1 872 | C1CCC[NH]CC1 1 873 | C1=CNCNN1 1 874 | C1C[N+]=CC=C1 1 875 | C1C=[NH+]CC1 1 876 | C1CCNCCCCC1 1 877 | C1N=CSS1 1 878 | C1N=N[NH]N1 1 879 | C1CSCNCCN1 1 880 | C1=CNCCS1 1 881 | C1=CCCSCC1 1 882 | C1CNC[NH+]CN1 1 883 | C1C=CNSC1 1 884 | C1C[N+]=CC=[N+]1 1 885 | C1=C\CCCC/C=C/CC/1 1 886 | C1CCCC[N+]C1 1 887 | C1CCNCCOC1 1 888 | C1=NSCN1 1 889 | C1=CC=C[NH]C1 1 890 | C1[NH]C=[NH+]CN1 1 891 | C1C=[N+]CC1 1 892 | C1COCCOCC[NH+]CCOCCOCC[NH+]1 1 893 | C1=NCSN1 1 894 | C1=[NH+]CCC[NH2+]C1 1 895 | C1C=[NH+]C=C[NH]1 1 896 | C1=CC=CCCC=CC=CCC1 1 897 | C1C=CCCCC=CC=CCC1 1 898 | C1C=CCCCCC=CCCC1 1 899 | C1=CC=CCCCC=CCCC1 1 900 | C1=NNC=[NH+]1 1 901 | C1=C/CCCCCC\C=C/CC\1 1 902 | C1=NCN=CN1 1 903 | C1CCSCCC1 1 904 | C1=CNCC[NH]1 1 905 | C1C[NH+]=CN=N1 1 906 | C1CCPN1 1 907 | C1N=CC=N1 1 908 | C1C=CNSN1 1 909 | C1[N+]CCO1 1 910 | C1COCC=C1 1 911 | C1=NCC=C[N-]1 1 912 | C1CCC[N+]1 1 913 | C1=C[NH2+]C=CC1 1 914 | C1CCC=CC=C1 1 915 | C1C=CCC=N1 1 916 | C1C=NNS1 1 917 | C1=CNN=CN1 1 918 | C1=C[NH]CSC1 1 919 | C1CNCCNN1 1 920 | C1CSOC1 1 921 | C1C/C=C\C=CCN1 1 922 | C1CN=CC1 1 923 | C1=COCCOCCOCCOCCO1 1 924 | C1=CC=NC1 1 925 | C1CCCSCNC1 1 926 | C1=NN=C[N-]1 1 927 | C1COCCOCCOCCSCCOCCO1 1 928 | C1=CN[N+]=C1 1 929 | C1CC\C=C/CCC1 1 930 | C1CN[N+]=C1 1 931 | C1CNC[NH+]1 1 932 | C1CCCCSC1 1 933 | C1C=[NH+]C=NC1 1 934 | C1C[NH+]CCNN1 1 935 | C1=C[N+]=C[NH]1 1 936 | C1=CC=[NH+]CCN1 1 937 | C1C=N[N+]=C1 1 938 | C1COC[N-]1 1 939 | C1=C[NH+]CC1 1 940 | C1=CNC[NH2+]C1 1 941 | C1=C[S+]=CS1 1 942 | C1=NNCSCC1 1 943 | C1=C[N-]CN=C1 1 944 | C1=NN=N[NH]1 1 945 | C1CCCCNCCC1 1 946 | C1=C[NH2+]CCN=C1 1 947 | C1=C[NH2+]NC1 1 948 | C1CC[N+]N1 1 949 | C1=C/CCCC/C=C/CC/1 1 950 | C1C=NCC=C1 1 951 | C1=NCC=C1 1 952 | C1=CSC=CS1 1 953 | C1C[NH2+]CCOC1 1 954 | C1NNCCNN1 1 955 | C1CN=CO1 1 956 | C1=C[NH2+]CCC1 1 957 | C1=COC[N+]1 1 958 | C1=CSCC=C1 1 959 | C1CN=[NH+]C1 1 960 | C1CC=CCCC1 1 961 | C1NN=NS1 1 962 | C1=N\CC/N=C\CC/1 1 963 | C1NCCC=[N+]1 1 964 | C1=CCC=CC=C1 1 965 | C1CCSCC[NH2+]1 1 966 | C1CC[N+]C=N1 1 967 | C1NCC=[N+]1 1 968 | C1C[NH]C=C[NH]1 1 969 | C1CC[NH+]=N1 1 970 | C1=NC=[O+]C=N1 1 971 | C1=NCC=CN1 1 972 | C1CN=CSC1 1 973 | C1NNC=[NH+]1 1 974 | C1=C[NH+]=CSC1 1 975 | C1C=CNC=CN1 1 976 | C1C[NH+]=NC1 1 977 | C1CSSCCSS1 1 978 | C1CCCOCCCCO1 1 979 | C1=CNN=CC1 1 980 | C1C=NSN=C1 1 981 | C1C[NH+]CC[NH+]C1 1 982 | C1=CC[NH]C1 1 983 | C1C=NCCCN1 1 984 | C1CNC=NC1 1 985 | C1CSCCSCCS1 1 986 | C1C=NNCS1 1 987 | C1NC=CC=[NH+]1 1 988 | C1C=CC[N+]CC1 1 989 | C1CSC=NN1 1 990 | C1=[NH+]NCCN1 1 991 | C1=CN=P[NH+]=C1 1 992 | C1COPN1 1 993 | C1=NNSC1 1 994 | C1C[NH+]=NCN1 1 995 | C1N=CSN1 1 996 | C1=NNC=[N+]1 1 997 | C1CCNC=NN1 1 998 | C1CSNS1 1 999 | C1COSO1 1 1000 | C1=CC=CCSCC=NCCSC1 1 1001 | C1C=CCCSCC=NCCSC1 1 1002 | N1NNN1 1 1003 | C1CSSCCNN1 1 1004 | C1=C[NH]COC1 1 1005 | C1=CN=SCC1 1 1006 | C1CCC=NCC1 1 1007 | C1=CC[NH2+]CCC1 1 1008 | C1CCNNCC1 1 1009 | C1=CCNC=CC1 1 1010 | C1=CCCOCC1 1 1011 | C1=[NH+]CCNCC1 1 1012 | C1CN[PH]O1 1 1013 | C1CO[PH]O1 1 1014 | C1C=[N+]CCCC1 1 1015 | C1CCOCCCCCCOC1 1 1016 | C1\C=C/CNCCC1 1 1017 | C1=CSCNN1 1 1018 | C1=NNC[NH2+]C1 1 1019 | C1CC[NH]NC1 1 1020 | C1C=CC=NC1 1 1021 | C1CSNN1 1 1022 | C1=N[NH+]=CNC1 1 1023 | C1=NC=CC1 1 1024 | C1=NCSCC1 1 1025 | C1C[NH+]CC[N+]1 1 1026 | C1=[N+]C[N+]=C1 1 1027 | C1CSCN[NH]1 1 1028 | C1COC=CCN1 1 1029 | C1=C[N+]=CC=N1 1 1030 | C1=NCCCCO1 1 1031 | C1=CNC[NH+]=C1 1 1032 | C1COCC[NH2+]CCOCCNCCCCCN1 1 1033 | C1CC[N+]NC1 1 1034 | C1=CNCC[NH+]=C1 1 1035 | C1CSCCSCCCSCCSC1 1 1036 | C1NN[NH2+]N1 1 1037 | C1=NNCN=N1 1 1038 | C1NC[NH+]CS1 1 1039 | C1=[NH+]C=CCN1 1 1040 | C1CCOPN1 1 1041 | C1=CSC[N-]1 1 1042 | C1CCC=CS1 1 1043 | C1C=[N+]C=CN1 1 1044 | C1=CC=COCC1 1 1045 | C1OCCS1 1 1046 | C1C=CC[N+]1 1 1047 | C1=NC[NH+]=C1 1 1048 | C1=NC=N[N+]=C1 1 1049 | C1CNC[N+]=C1 1 1050 | C1=NCC[N+]=C1 1 1051 | C1=NCCCO1 1 1052 | C1CNC=[NH+]1 1 1053 | C1C[NH+]SN1 1 1054 | C1NCOCN1 1 1055 | C1CC=CC=CN1 1 1056 | C1CC[N+]C=CN1 1 1057 | C1=NCCN=CC1 1 1058 | C1COCNN1 1 1059 | C1CNCNN1 1 1060 | C1=CN=[N+]C=C1 1 1061 | C1CCC=C[N-]1 1 1062 | C1CN=NC=C1 1 1063 | C1=[N+]C=[NH+]CC1 1 1064 | C1CNC[N+]C1 1 1065 | C1=C[N+]=NC=N1 1 1066 | C1=[NH+]C[NH][NH]1 1 1067 | C1C=CC=CCN1 1 1068 | C1COCCOCCOCC[NH+]1 1 1069 | C1=NC=[NH+]C1 1 1070 | C1=C[NH+]=COC1 1 1071 | C1C[NH2+]CC[NH2+]1 1 1072 | C1=CCCSCCC1 1 1073 | C1NOCNO1 1 1074 | C1=C\CNC/C=C\CNC/1 1 1075 | C1COCCOCC[NH2+]CCOCCOCCN1 1 1076 | C1=C[N+]CN=C1 1 1077 | C1=NCOCS1 1 1078 | C1CNCC[N-]1 1 1079 | C1=NCC=[NH+]C1 1 1080 | C1C=COCCN1 1 1081 | C1COCCOCCOCCOCC[NH2+]CCO1 1 1082 | C1CCC=NC=N1 1 1083 | C1C[N-]SCCSO1 1 1084 | C1C=CNC=N1 1 1085 | C1=C[N+]=CCC1 1 1086 | C1CCNCCCN1 1 1087 | C1CC=NNCN1 1 1088 | C1NNN=[NH+]1 1 1089 | C1=NC=CC[N-]1 1 1090 | C1C[O+]=CC=N1 1 1091 | C1[N-]CCS1 1 1092 | C1COC=[NH+]1 1 1093 | C1=CC=NCC[NH2+]1 1 1094 | C1=C[NH2+]CCCN1 1 1095 | C1C=NCC[NH]C1 1 1096 | C1=NCCP1 1 1097 | C1CC[NH+]NC1 1 1098 | C1N[NH+]1 1 1099 | C1NNC=CS1 1 1100 | C1=CC[NH+]=C1 1 1101 | C1=CCCC=CC1 1 1102 | C1C=NNCC1 1 1103 | C1C[N-]NC1 1 1104 | C1CC/C=C/CCCCCCCCO1 1 1105 | C1NCN[NH]1 1 1106 | C1CCCCOCCCCCC1 1 1107 | C1N=[NH+]CN1 1 1108 | C1CN=CC=C1 1 1109 | C1=NC=CPN1 1 1110 | C1=CCC\C=C/CCCC1 1 1111 | C1=NCSS1 1 1112 | C1COCCOCCOCCOCCOCCO1 1 1113 | C1=CSC1 1 1114 | C1NSC=[N+]1 1 1115 | C1SCCCS1 1 1116 | C1=NN=COC1 1 1117 | C1=NCC=CO1 1 1118 | C1C[N-]CC1 1 1119 | C1[NH]CN[NH]1 1 1120 | C1C=C[NH+]=N1 1 1121 | C1C[NH+]CCNCC[NH+]CCNCC[NH+]CCN1 1 1122 | C1N=CCC=N1 1 1123 | C1C=CCNCN1 1 1124 | C1=C[N-]N=C1 1 1125 | C1CCC[NH+]CCC=CC1 1 1126 | C1C[NH]C=[N+]C1 1 1127 | C1=CNPC=C1 1 1128 | C1=NCNPN1 1 1129 | C1=CSC=NC1 1 1130 | C1C[NH+]=CCCN1 1 1131 | C1=CSCC=[NH+]1 1 1132 | C1COCCOCCOCCOCCN1 1 1133 | C1CCNS1 1 1134 | C1=CC[N+]C1 1 1135 | C1C[NH2+]CC[NH+]1 1 1136 | C1=CCCC[NH2+]C1 1 1137 | C1CSNC1 1 1138 | C1=C[NH+]CCN1 1 1139 | C1CN=[NH+]CC1 1 1140 | C1=CC=[O+]C1 1 1141 | C1N=NN[N+]1 1 1142 | C1CCOPO1 1 1143 | C1=NCCNN1 1 1144 | C1COCOO1 1 1145 | C1CCOOCC1 1 1146 | C1CCC[N+]C=N1 1 1147 | C1C[NH+]=COC1 1 1148 | C1NC=CSCCO1 1 1149 | -------------------------------------------------------------------------------- /src/chemutils.py: -------------------------------------------------------------------------------- 1 | import rdkit 2 | from rdkit import Chem, DataStructs 3 | from rdkit.Chem import AllChem 4 | from rdkit.Chem import Draw 5 | from functools import reduce 6 | from tqdm import tqdm 7 | from copy import deepcopy 8 | import numpy as np 9 | import torch 10 | from torch.autograd import Variable 11 | torch.manual_seed(4) 12 | np.random.seed(1) 13 | import random 14 | random.seed(1) 15 | 16 | ''' 17 | 1. vocabulary: find frequent words (atom and ring) 18 | 2. graph2tree 19 | 3. generate smiles set 20 | 4. chemical utility 21 | tanimot similarity 22 | canonicalize smiles 23 | is valid 24 | 5. score modifier 25 | logp_modifier [-inf, inf] -> [0,1] 26 | 27 | qed_logp_jnk_gsk_fusion 28 | qed, logp, jsn, gsk -> [0,1] 29 | 30 | 31 | ''' 32 | def sigmoid(float_x): 33 | return 1.0 / (1 + np.exp(-float_x)) 34 | 35 | from scipy.stats import gmean 36 | 37 | def logp_modifier(logp_score): 38 | return max(0.0,min(1.0,1/14*(logp_score+10))) 39 | ''' 40 | [-inf, inf] -> [0,1] 41 | ''' 42 | 43 | def docking_modifier(docking_score): 44 | ''' 45 | [-12,-4] -> [0,1] 46 | -12 -----> 1 47 | -4 -----> 0 48 | ''' 49 | docking_score = 1/(12-4)*(-docking_score - 4) 50 | docking_score = max(docking_score, 0.0) 51 | docking_score = min(docking_score, 1.0) 52 | return docking_score 53 | 54 | def qed_logp_fusion(qed_score, logp_score, jnk_score, gsk_score): 55 | logp_score = logp_modifier(logp_score) 56 | gmean_score = gmean([qed_score, logp_score]) 57 | modified_score = min(1.0,gmean_score) 58 | return modified_score 59 | 60 | def logp_jnk_gsk_fusion(logp_score, jnk_score, gsk_score): 61 | logp_score = logp_modifier(logp_score) 62 | return np.mean([logp_score, jnk_score, gsk_score]) 63 | 64 | 65 | def qed_logp_jnk_gsk_fusion(qed_score, logp_score, jnk_score, gsk_score): 66 | logp_score = logp_modifier(logp_score) 67 | gmean_score = gmean([qed_score, logp_score, jnk_score, gsk_score]) 68 | modified_score = min(1.0,gmean_score) 69 | return modified_score 70 | 71 | def qed_logp_jnk_gsk_fusion2(qed_score, logp_score, jnk_score, gsk_score): 72 | logp_score = logp_modifier(logp_score) 73 | return np.mean([qed_score, logp_score, jnk_score, gsk_score]) 74 | 75 | def qed_logp_fusion(qed_score, logp_score): 76 | logp_score = logp_modifier(logp_score) 77 | gmean_score = gmean([qed_score, logp_score]) 78 | modified_score = min(1.0, gmean_score) 79 | return modified_score 80 | 81 | def jnk_gsk_fusion(jnk_score, gsk_score): 82 | gmean_score = gmean([jnk_score, gsk_score]) 83 | modified_score = min(1.0,gmean_score) 84 | return modified_score 85 | 86 | 87 | def load_vocabulary(): 88 | datafile = "data/vocabulary.txt" 89 | with open(datafile, 'r') as fin: 90 | lines = fin.readlines() 91 | vocabulary = [line.split()[0] for line in lines] 92 | return vocabulary 93 | 94 | vocabulary = load_vocabulary() 95 | bondtype_list = [rdkit.Chem.rdchem.BondType.SINGLE, rdkit.Chem.rdchem.BondType.DOUBLE] 96 | 97 | 98 | def ith_substructure_is_atom(i): 99 | substructure = vocabulary[i] 100 | return True if len(substructure)==1 else False 101 | 102 | def word2idx(word): 103 | return vocabulary.index(word) 104 | 105 | 106 | # def smiles2fingerprint(smiles): 107 | # mol = Chem.MolFromSmiles(smiles) 108 | # fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048, useChirality=False) 109 | # return np.array(fp) 110 | # ### shape: (2048,) 111 | 112 | def smiles2fingerprint(smiles): 113 | mol = Chem.MolFromSmiles(smiles) 114 | fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024, useChirality=False) 115 | return np.array(fp) 116 | ### shape: (1024,) 117 | 118 | 119 | ## similarity of two SMILES 120 | def similarity(a, b): 121 | if a is None or b is None: 122 | return 0.0 123 | amol = Chem.MolFromSmiles(a) 124 | bmol = Chem.MolFromSmiles(b) 125 | if amol is None or bmol is None: 126 | return 0.0 127 | fp1 = AllChem.GetMorganFingerprintAsBitVect(amol, 2, nBits=2048, useChirality=False) 128 | fp2 = AllChem.GetMorganFingerprintAsBitVect(bmol, 2, nBits=2048, useChirality=False) 129 | return DataStructs.TanimotoSimilarity(fp1, fp2) 130 | 131 | 132 | def similarity_matrix(smiles_lst): 133 | n = len(smiles_lst) 134 | sim_matrix = np.eye(n) 135 | mol_lst = [Chem.MolFromSmiles(smiles) for smiles in smiles_lst] 136 | fingerprint_lst = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048, useChirality=False) for mol in mol_lst] 137 | for i in range(n): 138 | fp1 = fingerprint_lst[i] 139 | for j in range(i+1,n): 140 | fp2 = fingerprint_lst[j] 141 | sim = DataStructs.TanimotoSimilarity(fp1, fp2) 142 | sim_matrix[i,j] = sim_matrix[j,i] = sim 143 | return sim_matrix 144 | 145 | 146 | def canonical(smiles): 147 | try: 148 | mol = Chem.MolFromSmiles(smiles) 149 | except: 150 | return None 151 | if mol is not None: 152 | return Chem.MolToSmiles(mol, isomericSmiles=True) ### todo double check 153 | else: 154 | return None 155 | 156 | 157 | def smiles2mol(smiles): 158 | try: 159 | mol = Chem.MolFromSmiles(smiles) 160 | except: 161 | return None 162 | if mol is None: 163 | return None 164 | Chem.Kekulize(mol) 165 | return mol 166 | 167 | ## input: smiles, output: word lst; 168 | def smiles2word(smiles): 169 | mol = smiles2mol(smiles) 170 | if mol is None: 171 | return None 172 | word_lst = [] 173 | 174 | cliques = [list(x) for x in Chem.GetSymmSSSR(mol)] 175 | cliques_smiles = [] 176 | for clique in cliques: 177 | clique_smiles = Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True) 178 | cliques_smiles.append(clique_smiles) 179 | atom_not_in_rings_list = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()] 180 | return cliques_smiles + atom_not_in_rings_list 181 | 182 | ## is_valid_smiles 183 | def is_valid(smiles): 184 | word_lst = smiles2word(smiles) 185 | word_set = set(word_lst) 186 | return word_set.issubset(vocabulary) 187 | 188 | 189 | def is_valid_mol(mol): 190 | try: 191 | smiles = Chem.MolToSmiles(mol) 192 | except: 193 | return False 194 | if smiles.strip() == '': 195 | return False 196 | mol = Chem.MolFromSmiles(smiles) 197 | if mol is None or mol.GetNumAtoms() == 0: 198 | return False 199 | return True 200 | 201 | def substr_num(smiles): 202 | mol = smiles2mol(smiles) 203 | clique_lst = [list(x) for x in Chem.GetSymmSSSR(mol)] 204 | return len(clique_lst) 205 | 206 | 207 | def smiles2substrs(smiles): 208 | if not is_valid(smiles): 209 | return None 210 | mol = smiles2mol(smiles) 211 | if mol is None: 212 | return None 213 | idx_lst = [] 214 | 215 | clique_lst = [list(x) for x in Chem.GetSymmSSSR(mol)] 216 | # print(clique_lst) ## [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15]] 217 | for clique in clique_lst: 218 | clique_smiles = Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True) 219 | # print("clique_smiles", clique_smiles) ## C1=CC=CC=C1, C1=COCC1, C1=CC=CC=C1 220 | idx_lst.append(word2idx(clique_smiles)) 221 | atom_symbol_not_in_rings_list = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()] 222 | atom_idx_not_in_rings_list = [atom.GetIdx() for atom in mol.GetAtoms() if not atom.IsInRing()] 223 | # print(atom_idx_not_in_rings_list) ## [0, 1, 2, 3, 11, 12, 13, 14, 21] nonring atom's index in molecule 224 | for atom in atom_symbol_not_in_rings_list: 225 | idx_lst.append(word2idx(atom)) 226 | 227 | return idx_lst 228 | 229 | 230 | 231 | def smiles2graph(smiles): 232 | ''' N is # of substructures in the molecule 233 | 234 | Output: 235 | 1. 236 | idx_lst [N] list of substructure's index 237 | node_mat [N,d] 238 | 2. 239 | substructure_lst 240 | atomidx_2substridx dict 241 | 3. 242 | adjacency_matrix [N,N] 0/1 np.zeros((4,4)) 243 | 4. 244 | leaf_extend_idx_pair [(x1,y1), (x2,y2), ...] 245 | ''' 246 | 247 | ### 0. smiles -> mol 248 | if not is_valid(smiles): 249 | return None 250 | mol = smiles2mol(smiles) 251 | if mol is None: 252 | return None 253 | 254 | ### 1. idx_lst & node_mat 255 | idx_lst = [] 256 | clique_lst = [list(x) for x in Chem.GetSymmSSSR(mol)] 257 | # print(clique_lst) ## [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15]] 258 | for clique in clique_lst: 259 | clique_smiles = Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True) 260 | # print("clique_smiles", clique_smiles) ## C1=CC=CC=C1, C1=COCC1, C1=CC=CC=C1 261 | idx_lst.append(word2idx(clique_smiles)) 262 | 263 | atom_symbol_not_in_rings_list = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()] 264 | atom_idx_not_in_rings_list = [atom.GetIdx() for atom in mol.GetAtoms() if not atom.IsInRing()] 265 | # print(atom_idx_not_in_rings_list) ## [0, 1, 2, 3, 11, 12, 13, 14, 21] nonring atom's index in molecule 266 | for atom in atom_symbol_not_in_rings_list: 267 | idx_lst.append(word2idx(atom)) 268 | # print(idx_lst) ## [3, 68, 3, 0, 0, 0, 0, 0, 0, 1, 2, 4] 269 | d = len(vocabulary) 270 | N = len(idx_lst) 271 | node_mat = np.zeros((N, d)) 272 | for i,v in enumerate(idx_lst): 273 | node_mat[i,v]=1 274 | 275 | 276 | ### 2. substructure_lst & atomidx_2substridx 277 | ### map from atom index to substructure index 278 | atomidx_2substridx = dict() 279 | substructure_lst = clique_lst + atom_idx_not_in_rings_list 280 | ### [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15], 0, 1, 2, 3, 11, 12, 13, 14, 21] 281 | ### 4:0 23:0, 22:0, ... 8:1, 7:1, 22:1, ... 16:2, 17:2, 18:2, ... 0:3, 1:4, 282 | for idx, substructure in enumerate(substructure_lst): 283 | if type(substructure)==list: 284 | for atom in substructure: 285 | atomidx_2substridx[atom] = idx 286 | else: 287 | atomidx_2substridx[substructure] = idx 288 | 289 | 290 | ### 3. adjacency_matrix 291 | adjacency_matrix = np.zeros((N,N),dtype=np.int32) 292 | 293 | ####### 3.1 atom-atom bonds and atom-ring bonds 294 | for bond in mol.GetBonds(): 295 | if not bond.IsInRing(): 296 | a1 = bond.GetBeginAtom().GetIdx() 297 | a2 = bond.GetEndAtom().GetIdx() 298 | idx1 = atomidx_2substridx[a1] 299 | idx2 = atomidx_2substridx[a2] 300 | adjacency_matrix[idx1,idx2] = adjacency_matrix[idx2,idx1] = 1 301 | ####### 3.2 ring-ring connection 302 | for i1,c1 in enumerate(clique_lst): 303 | for i2,c2 in enumerate(clique_lst): 304 | if i1>=i2: 305 | continue 306 | if len(set(c1).intersection(set(c2))) > 0: 307 | adjacency_matrix[i1,i2] = adjacency_matrix[i2,i1] = 1 308 | assert np.sum(adjacency_matrix)>=2*(N-1) 309 | 310 | leaf_idx_lst = list(np.where(np.sum(adjacency_matrix,1)==1)[0]) 311 | M = len(leaf_idx_lst) 312 | extend_idx_lst = list(range(N,N+M)) 313 | leaf_extend_idx_pair = list(zip(leaf_idx_lst, extend_idx_lst)) 314 | ####### [(3, 12), (5, 13), (6, 14), (9, 15), (11, 16)] 315 | 316 | return idx_lst, node_mat, substructure_lst, atomidx_2substridx, adjacency_matrix, leaf_extend_idx_pair 317 | 318 | 319 | def smiles2feature(smiles): 320 | """ 321 | (1) molecule2tree 322 | (2) mask leaf node 323 | """ 324 | ### 0. smiles -> mol 325 | if not is_valid(smiles): 326 | return None 327 | mol = smiles2mol(smiles) 328 | if mol is None: 329 | return None 330 | 331 | ### 1. idx_lst 332 | idx_lst = [] 333 | clique_lst = [list(x) for x in Chem.GetSymmSSSR(mol)] 334 | # print(clique_lst) ## [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15]] 335 | for clique in clique_lst: 336 | clique_smiles = Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True) 337 | # print("clique_smiles", clique_smiles) ## C1=CC=CC=C1, C1=COCC1, C1=CC=CC=C1 338 | idx_lst.append(word2idx(clique_smiles)) 339 | 340 | atom_symbol_not_in_rings_list = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()] 341 | atom_idx_not_in_rings_list = [atom.GetIdx() for atom in mol.GetAtoms() if not atom.IsInRing()] 342 | # print(atom_idx_not_in_rings_list) ## [0, 1, 2, 3, 11, 12, 13, 14, 21] nonring atom's index in molecule 343 | for atom in atom_symbol_not_in_rings_list: 344 | idx_lst.append(word2idx(atom)) 345 | # print(idx_lst) ## [3, 68, 3, 0, 0, 0, 0, 0, 0, 1, 2, 4] 346 | d = len(vocabulary) 347 | N = len(idx_lst) 348 | 349 | ### 2. substructure_lst & atomidx_2substridx 350 | ### map from atom index to substructure index 351 | atomidx_2substridx = dict() 352 | substructure_lst = clique_lst + atom_idx_not_in_rings_list 353 | ### [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15], 0, 1, 2, 3, 11, 12, 13, 14, 21] 354 | ### 4:0 23:0, 22:0, ... 8:1, 7:1, 22:1, ... 16:2, 17:2, 18:2, ... 0:3, 1:4, 355 | for idx, substructure in enumerate(substructure_lst): 356 | if type(substructure)==list: 357 | for atom in substructure: 358 | atomidx_2substridx[atom] = idx 359 | else: 360 | atomidx_2substridx[substructure] = idx 361 | 362 | ### 3. adjacency_matrix 363 | adjacency_matrix = np.zeros((N,N),dtype=np.int32) 364 | ####### 3.1 atom-atom bonds and atom-ring bonds 365 | for bond in mol.GetBonds(): 366 | if not bond.IsInRing(): 367 | a1 = bond.GetBeginAtom().GetIdx() 368 | a2 = bond.GetEndAtom().GetIdx() 369 | idx1 = atomidx_2substridx[a1] 370 | idx2 = atomidx_2substridx[a2] 371 | adjacency_matrix[idx1,idx2] = adjacency_matrix[idx2,idx1] = 1 372 | ####### 3.2 ring-ring connection 373 | for i1,c1 in enumerate(clique_lst): 374 | for i2,c2 in enumerate(clique_lst): 375 | if i1>=i2: 376 | continue 377 | if len(set(c1).intersection(set(c2))) > 0: 378 | adjacency_matrix[i1,i2] = adjacency_matrix[i2,i1] = 1 379 | assert np.sum(adjacency_matrix)>=2*(N-1) 380 | 381 | # print(adjacency_matrix, smiles) 382 | leaf_idx_lst = list(np.where(np.sum(adjacency_matrix,1)==1)[0]) 383 | mask_idx = random.choice(leaf_idx_lst) 384 | label = idx_lst[mask_idx] 385 | 386 | node_mat = np.zeros((N, d + 1)) 387 | for i,v in enumerate(idx_lst): 388 | if i==mask_idx: 389 | node_mat[i,d] = 1 390 | else: 391 | node_mat[i,v] = 1 392 | 393 | return node_mat, adjacency_matrix, mask_idx, label 394 | 395 | 396 | 397 | def smiles2expandfeature(smiles): 398 | """ 399 | (1) molecule2tree 400 | (2) mask leaf node 401 | """ 402 | ### 0. smiles -> mol 403 | if not is_valid(smiles): 404 | return None 405 | mol = smiles2mol(smiles) 406 | if mol is None: 407 | return None 408 | 409 | ### 1. idx_lst 410 | idx_lst = [] 411 | clique_lst = [list(x) for x in Chem.GetSymmSSSR(mol)] 412 | # print(clique_lst) ## [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15]] 413 | for clique in clique_lst: 414 | clique_smiles = Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True) 415 | # print("clique_smiles", clique_smiles) ## C1=CC=CC=C1, C1=COCC1, C1=CC=CC=C1 416 | idx_lst.append(word2idx(clique_smiles)) 417 | 418 | atom_symbol_not_in_rings_list = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()] 419 | atom_idx_not_in_rings_list = [atom.GetIdx() for atom in mol.GetAtoms() if not atom.IsInRing()] 420 | # print(atom_idx_not_in_rings_list) ## [0, 1, 2, 3, 11, 12, 13, 14, 21] nonring atom's index in molecule 421 | for atom in atom_symbol_not_in_rings_list: 422 | idx_lst.append(word2idx(atom)) 423 | # print(idx_lst) ## [3, 68, 3, 0, 0, 0, 0, 0, 0, 1, 2, 4] 424 | d = len(vocabulary) 425 | N = len(idx_lst) 426 | 427 | ### 2. substructure_lst & atomidx_2substridx 428 | ### map from atom index to substructure index 429 | atomidx_2substridx = dict() 430 | substructure_lst = clique_lst + atom_idx_not_in_rings_list 431 | ### [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15], 0, 1, 2, 3, 11, 12, 13, 14, 21] 432 | ### 4:0 23:0, 22:0, ... 8:1, 7:1, 22:1, ... 16:2, 17:2, 18:2, ... 0:3, 1:4, 433 | for idx, substructure in enumerate(substructure_lst): 434 | if type(substructure)==list: 435 | for atom in substructure: 436 | atomidx_2substridx[atom] = idx 437 | else: 438 | atomidx_2substridx[substructure] = idx 439 | 440 | ### 3. adjacency_matrix 441 | adjacency_matrix = np.zeros((N+1,N+1),dtype=np.int32) 442 | ####### 3.1 atom-atom bonds and atom-ring bonds 443 | for bond in mol.GetBonds(): 444 | if not bond.IsInRing(): 445 | a1 = bond.GetBeginAtom().GetIdx() 446 | a2 = bond.GetEndAtom().GetIdx() 447 | idx1 = atomidx_2substridx[a1] 448 | idx2 = atomidx_2substridx[a2] 449 | adjacency_matrix[idx1,idx2] = adjacency_matrix[idx2,idx1] = 1 450 | ####### 3.2 ring-ring connection 451 | for i1,c1 in enumerate(clique_lst): 452 | for i2,c2 in enumerate(clique_lst): 453 | if i1>=i2: 454 | continue 455 | if len(set(c1).intersection(set(c2))) > 0: 456 | adjacency_matrix[i1,i2] = adjacency_matrix[i2,i1] = 1 457 | # assert np.sum(adjacency_matrix)>=2*(N-1) 458 | 459 | # print(adjacency_matrix, smiles) 460 | leaf_idx_lst = list(np.where(np.sum(adjacency_matrix,1)==1)[0]) 461 | mask_idx = random.choice(leaf_idx_lst) 462 | label = idx_lst[mask_idx] 463 | 464 | 465 | node_mat = np.zeros((N + 1, d + 1)) 466 | for i,v in enumerate(idx_lst): 467 | node_mat[i,v] = 1 468 | 469 | feature_lst = [] 470 | for idx in range(N): 471 | new_node_mat = deepcopy(node_mat) 472 | new_adj_mat = deepcopy(adjacency_matrix) 473 | new_node_mat[-1,d] = 1 474 | new_adj_mat[idx,N] = 1 475 | new_adj_mat[N,idx] = 1 476 | feature_lst.append((new_node_mat, new_adj_mat, N)) 477 | 478 | 479 | return feature_lst 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | def copy_atom(atom): 488 | new_atom = Chem.Atom(atom.GetSymbol()) 489 | new_atom.SetFormalCharge(atom.GetFormalCharge()) 490 | new_atom.SetAtomMapNum(atom.GetAtomMapNum()) 491 | return new_atom 492 | 493 | def add_atom_at_position(editmol, position_idx, new_atom, new_bond): 494 | ''' 495 | position_idx: index of edited atom in editmol 496 | new_atom: 'C', 'N', 'O', ... 497 | new_bond: SINGLE, DOUBLE 498 | ''' 499 | ###### 1 edit mol 500 | new_atom = Chem.rdchem.Atom(new_atom) 501 | rwmol = deepcopy(editmol) 502 | new_atom_idx = rwmol.AddAtom(new_atom) 503 | rwmol.AddBond(position_idx, new_atom_idx, order = new_bond) 504 | ###### 2 check valid of new mol 505 | if not is_valid_mol(rwmol): 506 | return None 507 | try: 508 | rwmol.UpdatePropertyCache() 509 | except: 510 | return None 511 | smiles = Chem.MolToSmiles(rwmol) 512 | assert '.' not in smiles 513 | return canonical(smiles) 514 | 515 | 516 | def add_fragment_at_position(editmol, position_idx, fragment, new_bond): 517 | ''' 518 | position_idx: index of edited atom in editmol 519 | fragment: e.g., "C1=CC=CC=C1", "C1=CC=NC=C1", ... 520 | new_bond: {SINGLE, DOUBLE} 521 | 522 | Return: 523 | list of SMILES 524 | ''' 525 | new_smiles_set = set() 526 | fragment_mol = Chem.MolFromSmiles(fragment) 527 | current_atom = editmol.GetAtomWithIdx(position_idx) 528 | neighbor_atom_set = set() ## index of neighbor of current atom in new_mol 529 | 530 | 531 | ## (A) add a bond between atom and ring 532 | #### 1. initialize empty new_mol 533 | new_mol = Chem.RWMol(Chem.MolFromSmiles('')) 534 | 535 | #### 2. add editmol into new_mol 536 | old_idx2new_idx = dict() 537 | for atom in editmol.GetAtoms(): 538 | old_idx = atom.GetIdx() 539 | new_atom = copy_atom(atom) 540 | new_idx = new_mol.AddAtom(new_atom) 541 | old_idx2new_idx[old_idx] = new_idx 542 | assert old_idx == new_idx 543 | for bond in editmol.GetBonds(): 544 | a1 = bond.GetBeginAtom() 545 | a2 = bond.GetEndAtom() 546 | i1 = a1.GetIdx() 547 | i2 = a2.GetIdx() 548 | i1_new = old_idx2new_idx[i1] 549 | i2_new = old_idx2new_idx[i2] 550 | bt = bond.GetBondType() 551 | new_mol.AddBond(i1_new, i2_new, bt) 552 | ### collect the neighbor atoms of current atom, both are in ring. 553 | if (i1==position_idx or i2==position_idx) and (a1.IsInRing() and a2.IsInRing()): 554 | neighbor_atom_set.add(i1_new) 555 | neighbor_atom_set.add(i2_new) 556 | if neighbor_atom_set != set(): 557 | neighbor_atom_set.remove(old_idx2new_idx[position_idx]) 558 | 559 | #### 3. combine two components 560 | #### 3.1 add fragment into new_mol 561 | new_atom_idx_lst = [] 562 | old_idx2new_idx2 = dict() ### fragment idx -> new mol idx 563 | for atom in fragment_mol.GetAtoms(): 564 | old_atom_idx = atom.GetIdx() 565 | new_atom = copy_atom(atom) 566 | new_atom_idx = new_mol.AddAtom(new_atom) 567 | new_atom_idx_lst.append(new_atom_idx) 568 | old_idx2new_idx2[old_atom_idx] = new_atom_idx 569 | for bond in fragment_mol.GetBonds(): 570 | a1 = bond.GetBeginAtom().GetIdx() 571 | a2 = bond.GetEndAtom().GetIdx() 572 | i1 = old_idx2new_idx2[a1] 573 | i2 = old_idx2new_idx2[a2] 574 | bt = bond.GetBondType() 575 | new_mol.AddBond(i1, i2, bt) 576 | 577 | #### 3.2 enumerate possible binding atoms and generate new smiles 578 | for i in new_atom_idx_lst: ### enumeration 579 | copy_mol = deepcopy(new_mol) 580 | copy_mol.AddBond(old_idx2new_idx[position_idx], i, new_bond) 581 | if is_valid_mol(copy_mol): 582 | try: 583 | copy_mol.UpdatePropertyCache() 584 | new_smiles = Chem.MolToSmiles(copy_mol) 585 | new_smiles = canonical(new_smiles) 586 | if new_smiles is not None: 587 | assert '.' not in new_smiles 588 | new_smiles_set.add(new_smiles) 589 | except: 590 | pass 591 | 592 | 593 | # if not current_atom.IsInRing() or new_bond != rdkit.Chem.rdchem.BondType.SINGLE: 594 | if not current_atom.IsInRing(): 595 | return new_smiles_set 596 | 597 | 598 | # print(new_smiles_set) 599 | ## (B) share bond between rings 600 | #### 1. initialize empty new_mol 601 | new_mol = Chem.RWMol(Chem.MolFromSmiles('')) 602 | 603 | #### 2. add editmol into new_mol 604 | old_idx2new_idx = dict() 605 | for atom in editmol.GetAtoms(): 606 | old_idx = atom.GetIdx() 607 | new_atom = copy_atom(atom) 608 | new_idx = new_mol.AddAtom(new_atom) 609 | old_idx2new_idx[old_idx] = new_idx 610 | assert old_idx == new_idx 611 | for bond in editmol.GetBonds(): 612 | a1 = bond.GetBeginAtom().GetIdx() 613 | a2 = bond.GetEndAtom().GetIdx() 614 | i1 = old_idx2new_idx[a1] 615 | i2 = old_idx2new_idx[a2] 616 | bt = bond.GetBondType() 617 | new_mol.AddBond(i1, i2, bt) 618 | 619 | # print(Chem.MolToSmiles(new_mol)) 620 | #### 3. fragment mol 621 | ####### 3.1 find 2 common atoms and 1 bond 622 | current_atom = editmol.GetAtomWithIdx(old_idx2new_idx[position_idx]) 623 | current_atom_symbol = current_atom.GetSymbol() 624 | 625 | atom_lst = list(fragment_mol.GetAtoms()) 626 | for neighbor_atom in neighbor_atom_set: 627 | neighbor_atom_symbol = editmol.GetAtomWithIdx(neighbor_atom).GetSymbol() 628 | bondtype_edit = new_mol.GetBondBetweenAtoms(neighbor_atom, old_idx2new_idx[position_idx]).GetBondType() 629 | for i,v in enumerate(atom_lst): 630 | v_idx = v.GetIdx() 631 | ### v1 is neighbor of v 632 | for v1 in [atom_lst[i-1], atom_lst[i+1-len(atom_lst)]]: 633 | v1_idx = v1.GetIdx() 634 | bondtype_frag = fragment_mol.GetBondBetweenAtoms(v_idx, v1_idx).GetBondType() 635 | # print("current:", current_atom_symbol, "neighbor:", neighbor_atom_symbol, bondtype_edit) 636 | # print(v.GetSymbol(), v1.GetSymbol(), bondtype_frag) 637 | if v.GetSymbol()==current_atom_symbol and v1.GetSymbol()==neighbor_atom_symbol and bondtype_edit==bondtype_frag: 638 | ####### 3.1 find 2 common atoms and 1 bond 639 | # print("2 common atoms and 1 bond ") 640 | ############################################ 641 | ####### 3.2 add other atoms and bonds 642 | new_mol2 = deepcopy(new_mol) 643 | old_idx2new_idx2 = dict() 644 | old_idx2new_idx2[v_idx] = current_atom.GetIdx() 645 | old_idx2new_idx2[v1_idx] = neighbor_atom 646 | for atom in fragment_mol.GetAtoms(): 647 | old_idx = atom.GetIdx() 648 | if not (old_idx==v_idx or old_idx==v1_idx): 649 | new_atom = copy_atom(atom) 650 | new_idx = new_mol2.AddAtom(new_atom) 651 | old_idx2new_idx2[old_idx] = new_idx 652 | for bond in fragment_mol.GetBonds(): 653 | a1 = bond.GetBeginAtom() 654 | a2 = bond.GetEndAtom() 655 | i1 = a1.GetIdx() 656 | i2 = a2.GetIdx() 657 | i1_new = old_idx2new_idx2[i1] 658 | i2_new = old_idx2new_idx2[i2] 659 | bt = bond.GetBondType() 660 | if not (set([i1,i2]) == set([v1.GetIdx(), v.GetIdx()])): 661 | new_mol2.AddBond(i1_new, i2_new, bt) 662 | ####### 3.2 add other atoms and bonds 663 | ####### 3.3 check validity and canonicalize 664 | if not is_valid_mol(new_mol2): 665 | continue 666 | try: 667 | new_mol2.UpdatePropertyCache() 668 | # print("success") 669 | except: 670 | continue 671 | new_smiles = Chem.MolToSmiles(new_mol2) 672 | new_smiles = canonical(new_smiles) 673 | if new_smiles is not None: 674 | assert '.' not in new_smiles 675 | new_smiles_set.add(new_smiles) 676 | # print(new_smiles) 677 | # print(new_smiles_set) 678 | return new_smiles_set 679 | 680 | 681 | 682 | def delete_substructure_at_idx(editmol, atom_idx_lst): 683 | edit_smiles = Chem.MolToSmiles(editmol) 684 | #### 1. initialize with empty mol 685 | new_mol = Chem.RWMol(Chem.MolFromSmiles('')) 686 | 687 | #### 2. add editmol into new_mol 688 | old_idx2new_idx = dict() 689 | for atom in editmol.GetAtoms(): 690 | old_idx = atom.GetIdx() 691 | if old_idx in atom_idx_lst: 692 | continue 693 | new_atom = copy_atom(atom) 694 | new_idx = new_mol.AddAtom(new_atom) 695 | old_idx2new_idx[old_idx] = new_idx 696 | for bond in editmol.GetBonds(): 697 | a1 = bond.GetBeginAtom().GetIdx() 698 | a2 = bond.GetEndAtom().GetIdx() 699 | if a1 in atom_idx_lst or a2 in atom_idx_lst: 700 | continue 701 | a1_new = old_idx2new_idx[a1] 702 | a2_new = old_idx2new_idx[a2] 703 | bt = bond.GetBondType() 704 | new_mol.AddBond(a1_new, a2_new, bt) 705 | 706 | if not is_valid_mol(new_mol): 707 | return None 708 | try: 709 | new_mol.UpdatePropertyCache() 710 | except: 711 | return None 712 | return new_mol, old_idx2new_idx 713 | 714 | 715 | 716 | 717 | 718 | 719 | def differentiable_graph2smiles_lgp(origin_smiles, differentiable_graph, 720 | leaf_extend_idx_pair, leaf_nonleaf_lst, 721 | max_num_offspring = 100, topk = 3): 722 | ''' 723 | origin_smiles: 724 | origin_idx_lst [N] 0,1,...,d-1 725 | origin_node_mat [N,d] 726 | origin_substructure_lst 727 | origin_atomidx_2substridx 728 | origin_adjacency_matrix [N,N] 0/1 729 | differentiable_graph: returned results 730 | node_indicator [N+M,d] 731 | adjacency_weight [N+M,N+M] 732 | N is # of substructures in the molecule 733 | M is # of leaf node, also number of extended node. 734 | main utility 735 | add_atom_at_position 736 | add_fragment_at_position 737 | delete_substructure_at_idx 738 | REPLACE = delete + add 739 | Output: 740 | new_smiles_set 741 | ''' 742 | new_smiles_set = set() 743 | #### 1. data preparation 744 | origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles)) 745 | origin_idx_lst, origin_node_mat, origin_substructure_lst, \ 746 | origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles) 747 | node_indicator, adjacency_weight = differentiable_graph 748 | N = len(origin_idx_lst) 749 | M = len(leaf_extend_idx_pair) 750 | d = len(vocabulary) 751 | 752 | ####### 2.3 add todo: use adjacency_weight to further narrow scope 753 | for leaf_idx, extend_idx in leaf_extend_idx_pair: 754 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 755 | if type(leaf_atom_idx_lst)==int: ### int: single atom; else: list of integer 756 | leaf_atom_idx_lst = [leaf_atom_idx_lst] 757 | for leaf_atom_idx in leaf_atom_idx_lst: 758 | added_substructure_lst = list(np.argsort(-node_indicator[extend_idx]))[:topk] 759 | for substructure_idx in added_substructure_lst: 760 | new_substructure = vocabulary[substructure_idx] 761 | for new_bond in bondtype_list: 762 | if ith_substructure_is_atom(substructure_idx): 763 | new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 764 | new_atom = new_substructure, new_bond = new_bond) 765 | new_smiles_set.add(new_smiles) 766 | else: 767 | new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 768 | fragment = new_substructure , new_bond = new_bond) 769 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 770 | 771 | return new_smiles_set.difference(set([None])) 772 | 773 | 774 | 775 | 776 | 777 | def differentiable_graph2smiles_v0(origin_smiles, differentiable_graph, 778 | leaf_extend_idx_pair, leaf_nonleaf_lst, 779 | max_num_offspring = 100, topk = 3): 780 | ''' 781 | origin_smiles: 782 | origin_idx_lst [N] 0,1,...,d-1 783 | origin_node_mat [N,d] 784 | origin_substructure_lst 785 | origin_atomidx_2substridx 786 | origin_adjacency_matrix [N,N] 0/1 787 | differentiable_graph: returned results 788 | node_indicator [N+M,d] 789 | adjacency_weight [N+M,N+M] 790 | N is # of substructures in the molecule 791 | M is # of leaf node, also number of extended node. 792 | main utility 793 | add_atom_at_position 794 | add_fragment_at_position 795 | delete_substructure_at_idx 796 | REPLACE = delete + add 797 | Output: 798 | new_smiles_set 799 | ''' 800 | new_smiles_set = set() 801 | #### 1. data preparation 802 | origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles)) 803 | origin_idx_lst, origin_node_mat, origin_substructure_lst, \ 804 | origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles) 805 | node_indicator, adjacency_weight = differentiable_graph 806 | N = len(origin_idx_lst) 807 | M = len(leaf_extend_idx_pair) 808 | d = len(vocabulary) 809 | 810 | #### 2. edit the original molecule 811 | ####### 2.1 delete & 2.2 replace 812 | for leaf_idx, _ in leaf_extend_idx_pair: 813 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 814 | if type(leaf_atom_idx_lst)==int: ### single atom 815 | new_leaf_atom_idx_lst = [leaf_atom_idx_lst] 816 | else: #### ring 817 | ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 818 | new_leaf_atom_idx_lst = [] 819 | remaining_atoms_idx_lst = [] 820 | for i,v in enumerate(origin_substructure_lst): 821 | if i==leaf_idx: 822 | continue 823 | if type(v)==int: 824 | remaining_atoms_idx_lst.append(v) 825 | else: #### list 826 | remaining_atoms_idx_lst.extend(v) 827 | new_leaf_atom_idx_lst = [leaf_atom_idx for leaf_atom_idx in leaf_atom_idx_lst if leaf_atom_idx not in remaining_atoms_idx_lst] 828 | ### leaf_atom_idx_lst v.s. new_leaf_atom_idx_lst 829 | ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 830 | result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = new_leaf_atom_idx_lst) 831 | if result is None: 832 | continue 833 | delete_mol, old_idx2new_idx = result 834 | delete_smiles = Chem.MolToSmiles(delete_mol) 835 | if delete_smiles is None or '.' in delete_smiles: 836 | continue 837 | delete_smiles = canonical(delete_smiles) 838 | new_smiles_set.add(delete_smiles) #### 2.1 delete done 839 | #### 2.2 replace a & b 840 | ######### (a) get neighbor substr 841 | neighbor_substructures_idx = [idx for idx,value in enumerate(origin_adjacency_matrix[leaf_idx]) if value==1] 842 | assert len(neighbor_substructures_idx)==1 843 | neighbor_substructures_idx = neighbor_substructures_idx[0] 844 | neighbor_atom_idx_lst = origin_substructure_lst[neighbor_substructures_idx] 845 | if type(neighbor_atom_idx_lst)==int: 846 | neighbor_atom_idx_lst = [neighbor_atom_idx_lst] 847 | ######### (b) add new substructure todo, enumerate several possibility 848 | added_substructure_lst = list(np.argsort(-node_indicator[leaf_idx]))[:topk] ### topk 849 | for substructure_idx in added_substructure_lst: 850 | new_substructure = vocabulary[substructure_idx] 851 | for new_bond in bondtype_list: 852 | for leaf_atom_idx in neighbor_atom_idx_lst: 853 | new_leaf_atom_idx = old_idx2new_idx[leaf_atom_idx] 854 | if ith_substructure_is_atom(substructure_idx): 855 | new_smiles = add_atom_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 856 | new_atom = new_substructure, new_bond = new_bond) 857 | new_smiles_set.add(new_smiles) 858 | else: 859 | new_smiles_batch = add_fragment_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 860 | fragment = new_substructure, new_bond = new_bond) 861 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 862 | 863 | 864 | 865 | ####### 2.3 add todo: use adjacency_weight to further narrow scope 866 | for leaf_idx, extend_idx in leaf_extend_idx_pair: 867 | expand_prob = (adjacency_weight[leaf_idx][extend_idx] + adjacency_weight[extend_idx][leaf_idx])/2 ### [-inf, inf] 868 | # print("expand prob", expand_prob) 869 | if expand_prob < -3: 870 | continue 871 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 872 | if type(leaf_atom_idx_lst)==int: ### int: single atom; else: list of integer 873 | leaf_atom_idx_lst = [leaf_atom_idx_lst] 874 | for leaf_atom_idx in leaf_atom_idx_lst: 875 | added_substructure_lst = list(np.argsort(-node_indicator[extend_idx]))[:topk] 876 | for substructure_idx in added_substructure_lst: 877 | new_substructure = vocabulary[substructure_idx] 878 | for new_bond in bondtype_list: 879 | if ith_substructure_is_atom(substructure_idx): 880 | new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 881 | new_atom = new_substructure, new_bond = new_bond) 882 | new_smiles_set.add(new_smiles) 883 | else: 884 | new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 885 | fragment = new_substructure , new_bond = new_bond) 886 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 887 | 888 | 889 | 890 | return new_smiles_set.difference(set([None])) 891 | 892 | 893 | 894 | 895 | def differentiable_graph2smiles(origin_smiles, differentiable_graph, 896 | leaf_extend_idx_pair, leaf_nonleaf_lst, 897 | max_num_offspring = 100, topk = 3): 898 | ''' 899 | origin_smiles: 900 | origin_idx_lst [N] 0,1,...,d-1 901 | origin_node_mat [N,d] 902 | origin_substructure_lst 903 | origin_atomidx_2substridx 904 | origin_adjacency_matrix [N,N] 0/1 905 | 906 | differentiable_graph: returned results 907 | node_indicator [N+M,d] 908 | adjacency_weight [N+M,N+M] 909 | 910 | N is # of substructures in the molecule 911 | M is # of leaf node, also number of extended node. 912 | 913 | 914 | main utility 915 | add_atom_at_position 916 | add_fragment_at_position 917 | delete_substructure_at_idx 918 | REPLACE = delete + add 919 | 920 | Output: 921 | new_smiles_set 922 | ''' 923 | leaf2nonleaf = {leaf:nonleaf for leaf,nonleaf in leaf_nonleaf_lst} 924 | leaf2extend = {leaf:extend for leaf,extend in leaf_extend_idx_pair} 925 | new_smiles_set = set() 926 | #### 1. data preparation 927 | origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles)) 928 | origin_idx_lst, origin_node_mat, origin_substructure_lst, \ 929 | origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles) 930 | node_indicator, adjacency_weight = differentiable_graph 931 | N = len(origin_idx_lst) 932 | M = len(leaf_extend_idx_pair) 933 | d = len(vocabulary) 934 | 935 | 936 | #### 2. edit the original molecule 937 | ####### 2.1 delete & 2.2 replace 938 | for leaf_idx, extend_idx in leaf_extend_idx_pair: 939 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 940 | if type(leaf_atom_idx_lst)==int: ### single atom 941 | new_leaf_atom_idx_lst = [leaf_atom_idx_lst] 942 | else: #### ring 943 | ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 944 | new_leaf_atom_idx_lst = [] 945 | remaining_atoms_idx_lst = [] 946 | for i,v in enumerate(origin_substructure_lst): 947 | if i==leaf_idx: 948 | continue 949 | if type(v)==int: 950 | remaining_atoms_idx_lst.append(v) 951 | else: #### list 952 | remaining_atoms_idx_lst.extend(v) 953 | new_leaf_atom_idx_lst = [leaf_atom_idx for leaf_atom_idx in leaf_atom_idx_lst if leaf_atom_idx not in remaining_atoms_idx_lst] 954 | ### leaf_atom_idx_lst v.s. new_leaf_atom_idx_lst 955 | ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 956 | result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = new_leaf_atom_idx_lst) 957 | if result is None: 958 | continue 959 | delete_mol, old_idx2new_idx = result 960 | delete_smiles = Chem.MolToSmiles(delete_mol) 961 | if delete_smiles is None or '.' in delete_smiles: 962 | continue 963 | delete_smiles = canonical(delete_smiles) 964 | nonleaf_idx = leaf2nonleaf[leaf_idx] 965 | shrink_prob = (adjacency_weight[leaf_idx,nonleaf_idx] + adjacency_weight[nonleaf_idx,leaf_idx])/2 966 | if shrink_prob > -3: ### sigmoid(-3)=0.1 967 | new_smiles_set.add(delete_smiles) 968 | #### 2.1 delete done 969 | #### 2.2 replace a & b 970 | ######### (a) get neighbor substr 971 | neighbor_substructures_idx = [idx for idx,value in enumerate(origin_adjacency_matrix[leaf_idx]) if value==1] 972 | assert len(neighbor_substructures_idx)==1 973 | neighbor_substructures_idx = neighbor_substructures_idx[0] 974 | neighbor_atom_idx_lst = origin_substructure_lst[neighbor_substructures_idx] 975 | if type(neighbor_atom_idx_lst)==int: 976 | neighbor_atom_idx_lst = [neighbor_atom_idx_lst] 977 | ######### (b) add new substructure todo, enumerate several possibility 978 | added_substructure_lst = list(np.argsort(-node_indicator[leaf_idx]))[:topk] 979 | for substructure_idx in added_substructure_lst: 980 | new_substructure = vocabulary[substructure_idx] 981 | for new_bond in bondtype_list: 982 | for leaf_atom_idx in neighbor_atom_idx_lst: 983 | new_leaf_atom_idx = old_idx2new_idx[leaf_atom_idx] 984 | if ith_substructure_is_atom(substructure_idx): 985 | new_smiles = add_atom_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 986 | new_atom = new_substructure, new_bond = new_bond) 987 | new_smiles_set.add(new_smiles) 988 | else: 989 | new_smiles_batch = add_fragment_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 990 | fragment = new_substructure, new_bond = new_bond) 991 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 992 | expand_prob = (adjacency_weight[leaf_idx,extend_idx] + adjacency_weight[extend_idx,leaf_idx])/2 993 | if expand_prob < -3: 994 | return new_smiles_set.difference(set([None])) 995 | 996 | 997 | ####### 2.3 add todo: use adjacency_weight to further narrow scope 998 | for leaf_idx, extend_idx in leaf_extend_idx_pair: 999 | expand_prob = (adjacency_weight[leaf_idx][extend_idx] + adjacency_weight[extend_idx][leaf_idx])/2 ### [-inf, inf] 1000 | # print("expand prob", expand_prob) 1001 | if expand_prob < -3: 1002 | continue 1003 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 1004 | if type(leaf_atom_idx_lst)==int: ### int: single atom; else: list of integer 1005 | leaf_atom_idx_lst = [leaf_atom_idx_lst] 1006 | for leaf_atom_idx in leaf_atom_idx_lst: 1007 | added_substructure_lst = list(np.argsort(-node_indicator[extend_idx]))[:topk] 1008 | for substructure_idx in added_substructure_lst: 1009 | new_substructure = vocabulary[substructure_idx] 1010 | for new_bond in bondtype_list: 1011 | if ith_substructure_is_atom(substructure_idx): 1012 | new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 1013 | new_atom = new_substructure, new_bond = new_bond) 1014 | new_smiles_set.add(new_smiles) 1015 | else: 1016 | new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 1017 | fragment = new_substructure , new_bond = new_bond) 1018 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 1019 | 1020 | return new_smiles_set.difference(set([None])) 1021 | 1022 | 1023 | 1024 | 1025 | def differentiable_graph2smiles_sample(origin_smiles, differentiable_graph, 1026 | leaf_extend_idx_pair, leaf_nonleaf_lst, 1027 | topk, epsilon): 1028 | ''' 1029 | origin_smiles: 1030 | origin_idx_lst [N] 0,1,...,d-1 1031 | origin_node_mat [N,d] 1032 | origin_substructure_lst 1033 | origin_atomidx_2substridx 1034 | origin_adjacency_matrix [N,N] 0/1 1035 | 1036 | differentiable_graph: returned results 1037 | node_indicator [N+M,d] 1038 | adjacency_weight [N+M,N+M] 1039 | 1040 | N is # of substructures in the molecule 1041 | M is # of leaf node, also number of extended node. 1042 | 1043 | 1044 | main utility 1045 | add_atom_at_position 1046 | add_fragment_at_position 1047 | delete_substructure_at_idx 1048 | REPLACE = delete + add 1049 | 1050 | Output: 1051 | new_smiles_set 1052 | ''' 1053 | leaf2nonleaf = {leaf:nonleaf for leaf,nonleaf in leaf_nonleaf_lst} 1054 | leaf2extend = {leaf:extend for leaf,extend in leaf_extend_idx_pair} 1055 | new_smiles_set = set() 1056 | #### 1. data preparation 1057 | origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles)) 1058 | origin_idx_lst, origin_node_mat, origin_substructure_lst, \ 1059 | origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles) 1060 | node_indicator, adjacency_weight = differentiable_graph 1061 | N = len(origin_idx_lst) 1062 | M = len(leaf_extend_idx_pair) 1063 | d = len(vocabulary) 1064 | 1065 | 1066 | #### 2. edit the original molecule 1067 | ####### 2.1 delete & 2.2 replace 1068 | for leaf_idx, extend_idx in leaf_extend_idx_pair: 1069 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 1070 | if type(leaf_atom_idx_lst)==int: ### single atom 1071 | new_leaf_atom_idx_lst = [leaf_atom_idx_lst] 1072 | else: #### ring 1073 | ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 1074 | new_leaf_atom_idx_lst = [] 1075 | remaining_atoms_idx_lst = [] 1076 | for i,v in enumerate(origin_substructure_lst): 1077 | if i==leaf_idx: 1078 | continue 1079 | if type(v)==int: 1080 | remaining_atoms_idx_lst.append(v) 1081 | else: #### list 1082 | remaining_atoms_idx_lst.extend(v) 1083 | new_leaf_atom_idx_lst = [leaf_atom_idx for leaf_atom_idx in leaf_atom_idx_lst if leaf_atom_idx not in remaining_atoms_idx_lst] 1084 | ### leaf_atom_idx_lst v.s. new_leaf_atom_idx_lst 1085 | ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 1086 | result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = new_leaf_atom_idx_lst) 1087 | if result is None: 1088 | continue 1089 | delete_mol, old_idx2new_idx = result 1090 | delete_smiles = Chem.MolToSmiles(delete_mol) 1091 | if delete_smiles is None or '.' in delete_smiles: 1092 | continue 1093 | delete_smiles = canonical(delete_smiles) 1094 | nonleaf_idx = leaf2nonleaf[leaf_idx] 1095 | shrink_prob = (adjacency_weight[leaf_idx,nonleaf_idx] + adjacency_weight[nonleaf_idx,leaf_idx])/2 1096 | if shrink_prob > -3: ### sigmoid(-3)=0.1 1097 | new_smiles_set.add(delete_smiles) 1098 | #### 2.1 delete done 1099 | #### 2.2 replace a & b 1100 | ######### (a) get neighbor substr 1101 | neighbor_substructures_idx = [idx for idx,value in enumerate(origin_adjacency_matrix[leaf_idx]) if value==1] 1102 | assert len(neighbor_substructures_idx)==1 1103 | neighbor_substructures_idx = neighbor_substructures_idx[0] 1104 | neighbor_atom_idx_lst = origin_substructure_lst[neighbor_substructures_idx] 1105 | if type(neighbor_atom_idx_lst)==int: 1106 | neighbor_atom_idx_lst = [neighbor_atom_idx_lst] 1107 | ######### (b) add new substructure todo, enumerate several possibility 1108 | u = random.random() 1109 | if u < epsilon: 1110 | added_substructure_lst = list(np.argsort(-node_indicator[leaf_idx]))[:topk] ### topk (greedy) 1111 | else: 1112 | added_substructure_lst = random.choices(population=list(range(len(vocabulary))), weights = node_indicator[leaf_idx], k=topk + 3) 1113 | added_substructure_lst = list(set(added_substructure_lst))[:topk] ### avoid repetition 1114 | for substructure_idx in added_substructure_lst: 1115 | new_substructure = vocabulary[substructure_idx] 1116 | for new_bond in bondtype_list: 1117 | for leaf_atom_idx in neighbor_atom_idx_lst: 1118 | new_leaf_atom_idx = old_idx2new_idx[leaf_atom_idx] 1119 | if ith_substructure_is_atom(substructure_idx): 1120 | new_smiles = add_atom_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 1121 | new_atom = new_substructure, new_bond = new_bond) 1122 | new_smiles_set.add(new_smiles) 1123 | else: 1124 | new_smiles_batch = add_fragment_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 1125 | fragment = new_substructure, new_bond = new_bond) 1126 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 1127 | expand_prob = (adjacency_weight[leaf_idx,extend_idx] + adjacency_weight[extend_idx,leaf_idx])/2 1128 | if expand_prob < -3: 1129 | return new_smiles_set.difference(set([None])) 1130 | 1131 | 1132 | ####### 2.3 add todo: use adjacency_weight to further narrow scope 1133 | for leaf_idx, extend_idx in leaf_extend_idx_pair: 1134 | expand_prob = (adjacency_weight[leaf_idx][extend_idx] + adjacency_weight[extend_idx][leaf_idx])/2 ### [-inf, inf] 1135 | # print("expand prob", expand_prob) 1136 | if expand_prob < -3: 1137 | continue 1138 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 1139 | if type(leaf_atom_idx_lst)==int: ### int: single atom; else: list of integer 1140 | leaf_atom_idx_lst = [leaf_atom_idx_lst] 1141 | for leaf_atom_idx in leaf_atom_idx_lst: 1142 | u = random.random() 1143 | if u < epsilon: 1144 | added_substructure_lst = list(np.argsort(-node_indicator[extend_idx]))[:topk] 1145 | else: 1146 | added_substructure_lst = random.choices(population=list(range(len(vocabulary))), weights = node_indicator[extend_idx], k=topk + 3) 1147 | added_substructure_lst = list(set(added_substructure_lst))[:topk] ### avoid repetition 1148 | for substructure_idx in added_substructure_lst: 1149 | new_substructure = vocabulary[substructure_idx] 1150 | for new_bond in bondtype_list: 1151 | if ith_substructure_is_atom(substructure_idx): 1152 | new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 1153 | new_atom = new_substructure, new_bond = new_bond) 1154 | new_smiles_set.add(new_smiles) 1155 | else: 1156 | new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 1157 | fragment = new_substructure , new_bond = new_bond) 1158 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 1159 | 1160 | return new_smiles_set.difference(set([None])) 1161 | 1162 | 1163 | 1164 | def differentiable_graph2smiles_sample_v2(origin_smiles, differentiable_graph, 1165 | leaf_extend_idx_pair, leaf_nonleaf_lst, 1166 | topk, epsilon): 1167 | ''' 1168 | origin_smiles: 1169 | origin_idx_lst [N] 0,1,...,d-1 1170 | origin_node_mat [N,d] 1171 | origin_substructure_lst 1172 | origin_atomidx_2substridx 1173 | origin_adjacency_matrix [N,N] 0/1 1174 | 1175 | differentiable_graph: returned results 1176 | node_indicator [N+M,d] 1177 | adjacency_weight [N+M,N+M] 1178 | 1179 | N is # of substructures in the molecule 1180 | M is # of leaf node, also number of extended node. 1181 | 1182 | main utility 1183 | add_atom_at_position 1184 | add_fragment_at_position 1185 | delete_substructure_at_idx 1186 | REPLACE = delete + add 1187 | 1188 | Output: 1189 | new_smiles_set 1190 | ''' 1191 | leaf2nonleaf = {leaf:nonleaf for leaf,nonleaf in leaf_nonleaf_lst} 1192 | leaf2extend = {leaf:extend for leaf,extend in leaf_extend_idx_pair} 1193 | new_smiles_set = set() 1194 | #### 1. data preparation 1195 | origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles)) 1196 | origin_idx_lst, origin_node_mat, origin_substructure_lst, \ 1197 | origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles) 1198 | node_indicator, adjacency_weight = differentiable_graph #### both are np.array 1199 | N = len(origin_idx_lst) 1200 | M = len(leaf_extend_idx_pair) 1201 | d = len(vocabulary) 1202 | 1203 | 1204 | #### 2. edit the original molecule 1205 | ####### 2.1 delete & 2.2 replace 1206 | for leaf_idx, extend_idx in leaf_extend_idx_pair: 1207 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 1208 | if type(leaf_atom_idx_lst)==int: ### single atom 1209 | new_leaf_atom_idx_lst = [leaf_atom_idx_lst] 1210 | else: #### ring 1211 | ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 1212 | new_leaf_atom_idx_lst = [] 1213 | remaining_atoms_idx_lst = [] 1214 | for i,v in enumerate(origin_substructure_lst): 1215 | if i==leaf_idx: 1216 | continue 1217 | if type(v)==int: 1218 | remaining_atoms_idx_lst.append(v) 1219 | else: #### list 1220 | remaining_atoms_idx_lst.extend(v) 1221 | new_leaf_atom_idx_lst = [leaf_atom_idx for leaf_atom_idx in leaf_atom_idx_lst if leaf_atom_idx not in remaining_atoms_idx_lst] 1222 | ### leaf_atom_idx_lst v.s. new_leaf_atom_idx_lst 1223 | ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 1224 | result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = new_leaf_atom_idx_lst) 1225 | if result is None: 1226 | continue 1227 | delete_mol, old_idx2new_idx = result 1228 | delete_smiles = Chem.MolToSmiles(delete_mol) 1229 | if delete_smiles is None or '.' in delete_smiles: 1230 | continue 1231 | delete_smiles = canonical(delete_smiles) 1232 | nonleaf_idx = leaf2nonleaf[leaf_idx] 1233 | u = random.random() 1234 | shrink_prob = sigmoid(adjacency_weight[leaf_idx,nonleaf_idx]) + sigmoid(adjacency_weight[nonleaf_idx,leaf_idx]) 1235 | if u < shrink_prob: 1236 | new_smiles_set.add(delete_smiles) 1237 | # if shrink_prob < 0: ### sigmoid(-3)=0.1 1238 | # new_smiles_set.add(delete_smiles) 1239 | #### 2.1 delete done 1240 | #### 2.2 replace a & b 1241 | ######### (a) get neighbor substr 1242 | neighbor_substructures_idx = [idx for idx,value in enumerate(origin_adjacency_matrix[leaf_idx]) if value==1] 1243 | assert len(neighbor_substructures_idx)==1 1244 | neighbor_substructures_idx = neighbor_substructures_idx[0] 1245 | neighbor_atom_idx_lst = origin_substructure_lst[neighbor_substructures_idx] 1246 | if type(neighbor_atom_idx_lst)==int: 1247 | neighbor_atom_idx_lst = [neighbor_atom_idx_lst] 1248 | ######### (b) add new substructure todo, enumerate several possibility 1249 | u = random.random() 1250 | 1251 | node_indicator_leaf = node_indicator[leaf_idx] ### before softmax 1252 | node_indicator_leaf[12:] -= 5 1253 | node_indicator_leaf = np.exp(node_indicator_leaf) 1254 | node_indicator_leaf = node_indicator_leaf / np.sum(node_indicator_leaf) 1255 | if u < epsilon: 1256 | added_substructure_lst = list(np.argsort(-node_indicator_leaf))[:topk] ### topk (greedy) 1257 | else: 1258 | added_substructure_lst = random.choices(population=list(range(len(vocabulary))), weights = node_indicator_leaf, k=topk + 3) 1259 | added_substructure_lst = list(set(added_substructure_lst))[:topk] ### avoid repetition 1260 | for substructure_idx in added_substructure_lst: 1261 | new_substructure = vocabulary[substructure_idx] 1262 | for new_bond in bondtype_list: 1263 | for leaf_atom_idx in neighbor_atom_idx_lst: 1264 | new_leaf_atom_idx = old_idx2new_idx[leaf_atom_idx] 1265 | if ith_substructure_is_atom(substructure_idx): 1266 | new_smiles = add_atom_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 1267 | new_atom = new_substructure, new_bond = new_bond) 1268 | new_smiles_set.add(new_smiles) 1269 | else: 1270 | new_smiles_batch = add_fragment_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 1271 | fragment = new_substructure, new_bond = new_bond) 1272 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 1273 | 1274 | expand_prob = sigmoid(adjacency_weight[leaf_idx,extend_idx]) + sigmoid(adjacency_weight[extend_idx,leaf_idx])/2 1275 | u = random.random() 1276 | if u > expand_prob: 1277 | return new_smiles_set.difference(set([None])) 1278 | 1279 | 1280 | ####### 2.3 add todo: use adjacency_weight to further narrow scope 1281 | for leaf_idx, extend_idx in leaf_extend_idx_pair: 1282 | expand_prob = (adjacency_weight[leaf_idx][extend_idx] + adjacency_weight[extend_idx][leaf_idx])/2 ### [-inf, inf] 1283 | # print("expand prob", expand_prob) 1284 | if expand_prob < -3: 1285 | continue 1286 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 1287 | if type(leaf_atom_idx_lst)==int: ### int: single atom; else: list of integer 1288 | leaf_atom_idx_lst = [leaf_atom_idx_lst] 1289 | for leaf_atom_idx in leaf_atom_idx_lst: 1290 | u = random.random() 1291 | node_indicator_leaf = node_indicator[extend_idx] 1292 | node_indicator_leaf[12:]-=5 1293 | node_indicator_leaf = np.exp(node_indicator_leaf) 1294 | node_indicator_leaf = node_indicator_leaf / np.sum(node_indicator_leaf) 1295 | if u < epsilon: 1296 | added_substructure_lst = list(np.argsort(-node_indicator_leaf))[:topk] 1297 | else: 1298 | added_substructure_lst = random.choices(population=list(range(len(vocabulary))), weights = node_indicator_leaf, k=topk + 3) 1299 | added_substructure_lst = list(set(added_substructure_lst))[:topk] ### avoid repetition 1300 | for substructure_idx in added_substructure_lst: 1301 | new_substructure = vocabulary[substructure_idx] 1302 | for new_bond in bondtype_list: 1303 | if ith_substructure_is_atom(substructure_idx): 1304 | new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 1305 | new_atom = new_substructure, new_bond = new_bond) 1306 | new_smiles_set.add(new_smiles) 1307 | else: 1308 | new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 1309 | fragment = new_substructure , new_bond = new_bond) 1310 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 1311 | 1312 | return new_smiles_set.difference(set([None])) 1313 | 1314 | 1315 | def differentiable_graph_to_smiles_purely_randomwalk(origin_smiles, differentiable_graph, 1316 | leaf_extend_idx_pair, leaf_nonleaf_lst, 1317 | topk = 3, epsilon = 0.7,): 1318 | # print(origin_smiles) 1319 | leaf2nonleaf = {leaf:nonleaf for leaf,nonleaf in leaf_nonleaf_lst} 1320 | leaf2extend = {leaf:extend for leaf,extend in leaf_extend_idx_pair} 1321 | new_smiles_set = set() 1322 | #### 1. data preparation 1323 | origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles)) 1324 | origin_idx_lst, origin_node_mat, origin_substructure_lst, \ 1325 | origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles) 1326 | node_indicator, adjacency_weight = differentiable_graph 1327 | N = len(origin_idx_lst) 1328 | M = len(leaf_extend_idx_pair) 1329 | d = len(vocabulary) 1330 | 1331 | 1332 | 1333 | #### 2. edit the original molecule 1334 | ####### 2.1 delete & 2.2 replace 1335 | for leaf_idx, extend_idx in leaf_extend_idx_pair: 1336 | u_shrink = random.random() 1337 | shrink, unchange, expand = False, False, False 1338 | if u_shrink < 0.7 and substr_num(origin_smiles) > 1: 1339 | shrink = True 1340 | else: 1341 | u_expand = random.random() 1342 | if u_expand < 0.3: 1343 | expand = True 1344 | else: 1345 | unchange = True 1346 | 1347 | if shrink or unchange: 1348 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 1349 | if type(leaf_atom_idx_lst)==int: ### single atom 1350 | new_leaf_atom_idx_lst = [leaf_atom_idx_lst] 1351 | else: #### ring 1352 | ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 1353 | new_leaf_atom_idx_lst = [] 1354 | remaining_atoms_idx_lst = [] 1355 | for i,v in enumerate(origin_substructure_lst): 1356 | if i==leaf_idx: 1357 | continue 1358 | if type(v)==int: 1359 | remaining_atoms_idx_lst.append(v) 1360 | else: #### list 1361 | remaining_atoms_idx_lst.extend(v) 1362 | new_leaf_atom_idx_lst = [leaf_atom_idx for leaf_atom_idx in leaf_atom_idx_lst if leaf_atom_idx not in remaining_atoms_idx_lst] 1363 | ### leaf_atom_idx_lst v.s. new_leaf_atom_idx_lst 1364 | ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 1365 | result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = new_leaf_atom_idx_lst) 1366 | if result is None: 1367 | continue 1368 | delete_mol, old_idx2new_idx = result 1369 | delete_smiles = Chem.MolToSmiles(delete_mol) 1370 | if delete_smiles is None or '.' in delete_smiles: 1371 | continue 1372 | delete_smiles = canonical(delete_smiles) 1373 | nonleaf_idx = leaf2nonleaf[leaf_idx] 1374 | 1375 | if shrink: 1376 | new_smiles_set.add(delete_smiles) 1377 | continue 1378 | #### 2.1 delete done 1379 | #### 2.2 replace a & b 1380 | ######### (a) get neighbor substr 1381 | neighbor_substructures_idx = [idx for idx,value in enumerate(origin_adjacency_matrix[leaf_idx]) if value==1] 1382 | assert len(neighbor_substructures_idx)==1 1383 | neighbor_substructures_idx = neighbor_substructures_idx[0] 1384 | neighbor_atom_idx_lst = origin_substructure_lst[neighbor_substructures_idx] 1385 | if type(neighbor_atom_idx_lst)==int: 1386 | neighbor_atom_idx_lst = [neighbor_atom_idx_lst] 1387 | ######### (b) add new substructure todo, enumerate several possibility 1388 | # added_substructure_lst = list(np.argsort(-node_indicator[leaf_idx]))[:topk] 1389 | added_substructure_lst = [random.choice(list(range(len(vocabulary)))) for i in range(topk)] 1390 | for substructure_idx in added_substructure_lst: 1391 | new_substructure = vocabulary[substructure_idx] 1392 | for new_bond in bondtype_list: 1393 | for leaf_atom_idx in neighbor_atom_idx_lst: 1394 | new_leaf_atom_idx = old_idx2new_idx[leaf_atom_idx] 1395 | if ith_substructure_is_atom(substructure_idx): 1396 | new_smiles = add_atom_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 1397 | new_atom = new_substructure, new_bond = new_bond) 1398 | new_smiles_set.add(new_smiles) 1399 | else: 1400 | new_smiles_batch = add_fragment_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 1401 | fragment = new_substructure, new_bond = new_bond) 1402 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 1403 | continue ### end of shrink or unchange 1404 | 1405 | ####### 2.3 add todo: use adjacency_weight to further narrow scope 1406 | for leaf_idx, extend_idx in leaf_extend_idx_pair: 1407 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 1408 | if type(leaf_atom_idx_lst)==int: ### int: single atom; else: list of integer 1409 | leaf_atom_idx_lst = [leaf_atom_idx_lst] 1410 | for leaf_atom_idx in leaf_atom_idx_lst: 1411 | added_substructure_lst = [random.choice(list(range(len(vocabulary)))) for i in range(topk)] 1412 | for substructure_idx in added_substructure_lst: 1413 | new_substructure = vocabulary[substructure_idx] 1414 | for new_bond in bondtype_list: 1415 | if ith_substructure_is_atom(substructure_idx): 1416 | new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 1417 | new_atom = new_substructure, new_bond = new_bond) 1418 | new_smiles_set.add(new_smiles) 1419 | else: 1420 | new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 1421 | fragment = new_substructure , new_bond = new_bond) 1422 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 1423 | 1424 | return new_smiles_set.difference(set([None])) 1425 | 1426 | 1427 | 1428 | 1429 | def differentiable_graph2smiles_plus_random(origin_smiles, differentiable_graph, 1430 | leaf_extend_idx_pair, leaf_nonleaf_lst, 1431 | max_num_offspring = 100, topk = 3, epsilon = 0.7, 1432 | random_topology = False, random_substr = False): 1433 | ''' 1434 | origin_smiles: 1435 | origin_idx_lst [N] 0,1,...,d-1 1436 | origin_node_mat [N,d] 1437 | origin_substructure_lst 1438 | origin_atomidx_2substridx 1439 | origin_adjacency_matrix [N,N] 0/1 1440 | 1441 | differentiable_graph: returned results 1442 | node_indicator [N+M,d] 1443 | adjacency_weight [N+M,N+M] 1444 | 1445 | N is # of substructures in the molecule 1446 | M is # of leaf node, also number of extended node. 1447 | 1448 | 1449 | main utility 1450 | add_atom_at_position 1451 | add_fragment_at_position 1452 | delete_substructure_at_idx 1453 | REPLACE = delete + add 1454 | 1455 | Output: 1456 | new_smiles_set 1457 | ''' 1458 | leaf2nonleaf = {leaf:nonleaf for leaf,nonleaf in leaf_nonleaf_lst} 1459 | leaf2extend = {leaf:extend for leaf,extend in leaf_extend_idx_pair} 1460 | new_smiles_set = set() 1461 | #### 1. data preparation 1462 | origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles)) 1463 | origin_idx_lst, origin_node_mat, origin_substructure_lst, \ 1464 | origin_atomidx_2substridx, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles) 1465 | node_indicator, adjacency_weight = differentiable_graph 1466 | N = len(origin_idx_lst) 1467 | M = len(leaf_extend_idx_pair) 1468 | d = len(vocabulary) 1469 | 1470 | u_topology = random.random() 1471 | ### shrink, unchange, expand prob = 0.2, 0.3, 0.5 1472 | shrink, unchange, expand = False, False, False 1473 | for leaf_idx, extend_idx in leaf_extend_idx_pair: 1474 | u_topology = random.random() 1475 | #### 1. topology 1476 | if random_topology: 1477 | # if u_topology < 0.1: 1478 | # shrink = True 1479 | # elif 0.4 > u_topology >= 0.2: 1480 | # unchange = True 1481 | if u_topology < 0.2: 1482 | unchange = True 1483 | else: 1484 | expand = True 1485 | else: ## dmg topology 1486 | nonleaf_idx = leaf2nonleaf[leaf_idx] 1487 | shrink_prob = sigmoid((adjacency_weight[leaf_idx,nonleaf_idx] + adjacency_weight[nonleaf_idx,leaf_idx])/2) 1488 | # if u_topology < shrink_prob: 1489 | if False: 1490 | shrink = True 1491 | else: 1492 | u_topology2 = random.random() 1493 | expand_prob = (adjacency_weight[leaf_idx,extend_idx] + adjacency_weight[extend_idx,leaf_idx])/2 1494 | if u_topology2 < expand_prob: 1495 | expand_prob = True 1496 | else: 1497 | unchange = True 1498 | 1499 | if shrink or unchange: 1500 | 1501 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 1502 | if type(leaf_atom_idx_lst)==int: ### single atom 1503 | new_leaf_atom_idx_lst = [leaf_atom_idx_lst] 1504 | else: #### ring 1505 | ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 1506 | new_leaf_atom_idx_lst = [] 1507 | remaining_atoms_idx_lst = [] 1508 | for i,v in enumerate(origin_substructure_lst): 1509 | if i==leaf_idx: 1510 | continue 1511 | if type(v)==int: 1512 | remaining_atoms_idx_lst.append(v) 1513 | else: #### list 1514 | remaining_atoms_idx_lst.extend(v) 1515 | new_leaf_atom_idx_lst = [leaf_atom_idx for leaf_atom_idx in leaf_atom_idx_lst if leaf_atom_idx not in remaining_atoms_idx_lst] 1516 | ### leaf_atom_idx_lst v.s. new_leaf_atom_idx_lst 1517 | ### consider the case that ring1 and ring2 share 2 atoms and 1 bond. 1518 | result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = new_leaf_atom_idx_lst) 1519 | if result is None: 1520 | continue 1521 | delete_mol, old_idx2new_idx = result 1522 | delete_smiles = Chem.MolToSmiles(delete_mol) 1523 | if delete_smiles is None or '.' in delete_smiles: 1524 | continue 1525 | delete_smiles = canonical(delete_smiles) 1526 | if shrink: 1527 | new_smiles_set.add(delete_smiles) 1528 | if unchange: 1529 | ######### (a) get neighbor substr 1530 | neighbor_substructures_idx = [idx for idx,value in enumerate(origin_adjacency_matrix[leaf_idx]) if value==1] 1531 | assert len(neighbor_substructures_idx)==1 1532 | neighbor_substructures_idx = neighbor_substructures_idx[0] 1533 | neighbor_atom_idx_lst = origin_substructure_lst[neighbor_substructures_idx] 1534 | if type(neighbor_atom_idx_lst)==int: 1535 | neighbor_atom_idx_lst = [neighbor_atom_idx_lst] 1536 | ######### (b) add new substructure todo, enumerate several possibility 1537 | if random_substr: ## random sample 1538 | added_substructure_lst = random.choices(list(range(len(vocabulary))), k=topk) 1539 | else: ## dmg sampling 1540 | u = random.random() 1541 | if u < epsilon: 1542 | added_substructure_lst = list(np.argsort(-node_indicator[leaf_idx]))[:topk] ### topk (greedy) 1543 | else: 1544 | added_substructure_lst = random.choices(population=list(range(len(vocabulary))), weights = node_indicator[leaf_idx], k=topk + 3) 1545 | added_substructure_lst = list(set(added_substructure_lst))[:topk] ### avoid repetition 1546 | for substructure_idx in added_substructure_lst: 1547 | new_substructure = vocabulary[substructure_idx] 1548 | for new_bond in bondtype_list: 1549 | for leaf_atom_idx in neighbor_atom_idx_lst: 1550 | new_leaf_atom_idx = old_idx2new_idx[leaf_atom_idx] 1551 | if ith_substructure_is_atom(substructure_idx): 1552 | new_smiles = add_atom_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 1553 | new_atom = new_substructure, new_bond = new_bond) 1554 | new_smiles_set.add(new_smiles) 1555 | else: 1556 | new_smiles_batch = add_fragment_at_position(editmol = delete_mol, position_idx = new_leaf_atom_idx, 1557 | fragment = new_substructure, new_bond = new_bond) 1558 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 1559 | else: ## expand 1560 | 1561 | leaf_atom_idx_lst = origin_substructure_lst[leaf_idx] 1562 | if type(leaf_atom_idx_lst)==int: ### int: single atom; else: list of integer 1563 | leaf_atom_idx_lst = [leaf_atom_idx_lst] 1564 | if random_substr: 1565 | added_substructure_lst = random.choices(list(range(len(vocabulary))), k=topk) 1566 | else: 1567 | for leaf_atom_idx in leaf_atom_idx_lst: 1568 | u = random.random() 1569 | if u < epsilon: 1570 | added_substructure_lst = list(np.argsort(-node_indicator[extend_idx]))[:topk] 1571 | else: 1572 | added_substructure_lst = random.choices(population=list(range(len(vocabulary))), weights = node_indicator[extend_idx], k=topk + 3) 1573 | added_substructure_lst = list(set(added_substructure_lst))[:topk] ### avoid repetition 1574 | for substructure_idx in added_substructure_lst: 1575 | new_substructure = vocabulary[substructure_idx] 1576 | for new_bond in bondtype_list: 1577 | if ith_substructure_is_atom(substructure_idx): 1578 | new_smiles = add_atom_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 1579 | new_atom = new_substructure, new_bond = new_bond) 1580 | new_smiles_set.add(new_smiles) 1581 | else: 1582 | new_smiles_batch = add_fragment_at_position(editmol = origin_mol, position_idx = leaf_atom_idx, 1583 | fragment = new_substructure , new_bond = new_bond) 1584 | new_smiles_set = new_smiles_set.union(new_smiles_batch) 1585 | 1586 | 1587 | 1588 | return new_smiles_set.difference(set([None])) 1589 | 1590 | 1591 | def draw_smiles(smiles, figfile_name): 1592 | mol = Chem.MolFromSmiles(smiles) 1593 | Draw.MolToImageFile(mol, figfile_name, size = (300,180)) 1594 | return 1595 | 1596 | 1597 | 1598 | 1599 | 1600 | if __name__ == "__main__": 1601 | 1602 | # s = 'FC1NCC(-C1=CC=CC(Br)=C1)C1' 1603 | s = 'C1=CC=CC=C1NC2=NC=CC(F)=N2' 1604 | draw_smiles(s, "figure/tmp.png") 1605 | # rawdata_file = "raw_data/zinc.tab" 1606 | # with open(rawdata_file) as fin: 1607 | # lines = fin.readlines()[1:] 1608 | # smiles_lst = [line.strip().strip('"') for line in lines] 1609 | 1610 | # from random import shuffle 1611 | # # shuffle(smiles_lst) 1612 | # fragment_lst = ['C1NCC1', 'C1CNCCN1', 'C1=CC=CC=C1', 'C1CNNC1'] 1613 | 1614 | 1615 | # smiles = smiles_lst[0] 1616 | # differentiable_graph = smiles2differentiable_graph(smiles) 1617 | # ### optimize differentiable_graph using GNN 1618 | # smiles_set = differentiable_graph2smiles(origin_smiles = smiles, differentiable_graph = differentiable_graph, max_num_offspring = 100) 1619 | # print(len(smiles_set)) 1620 | 1621 | # s = "CCc1ccc(Nc2nc(-c3ccccc3)cs2)cc1" 1622 | # s = 'Oc1ccc(Nc2nc(-c3ccc(Cl)cc3)cs2)cc1' 1623 | # draw_smiles(s, "figure/tmp.png") 1624 | # from tdc import Oracle 1625 | # qed = Oracle('qed') 1626 | # logp = Oracle('logp') 1627 | # jnk = Oracle('jnk3') 1628 | # gsk = Oracle('gsk3b') 1629 | # print(qed(s), logp(s), jnk(s), gsk(s)) 1630 | 1631 | 1632 | # smiles_lst = ['NO', 'ONO', 'CNO', 'CS'] 1633 | # print(similarity_matrix(smiles_lst)) 1634 | 1635 | 1636 | 1637 | ##### test over zinc 1638 | # for smiles in tqdm(smiles_lst): 1639 | # mol = Chem.MolFromSmiles(smiles) 1640 | # print(smiles) 1641 | # new_smiles_lst = [] 1642 | # for idx in range(mol.GetNumAtoms()): 1643 | # for fragment in fragment_lst: 1644 | # smiles_set = add_fragment_at_position(editmol = mol, position_idx = idx, fragment = fragment, new_bond = bondtype_list[0]) 1645 | # new_smiles_lst.extend(list(smiles_set)) 1646 | # new_smiles_lst = list(set(new_smiles_lst)) 1647 | # print("length of smiles set is", len(new_smiles_lst)) 1648 | 1649 | 1650 | 1651 | ### single test 1652 | # smiles = 'CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1' 1653 | # draw_smiles(smiles, "figure/origin.png") 1654 | # fragment = 'C1CCNCN1' 1655 | # mol = Chem.MolFromSmiles(smiles) 1656 | # for idx in range(mol.GetNumAtoms()): 1657 | # smiles_set = add_fragment_at_position(editmol = mol, position_idx = idx, fragment = fragment, new_bond = bondtype_list[0]) 1658 | # print("length of smiles set is", len(smiles_set), smiles_set) 1659 | # for i,smiles in enumerate(smiles_set): 1660 | # name = "figure/" + str(idx) + '_' + str(i) + '.png' 1661 | # draw_smiles(smiles, name) 1662 | 1663 | 1664 | 1665 | 1666 | ''' 1667 | 1668 | "CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1", 1669 | "C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1", 1670 | "N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)cc2)cc1", 1671 | "CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c2CCCCC3)C1", 1672 | "N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#N)C12CCCCC2", 1673 | "CC[NH+](CC)[C@](C)(CC)[C@H](O)c1cscc1Br" 1674 | 1675 | CCc1ccc(Nc2nc(-c3ccccc3)cs2)cc1 1676 | 1677 | 1678 | 1679 | rawdata_file = "raw_data/zinc.tab" 1680 | with open(rawdata_file) as fin: 1681 | lines = fin.readlines()[1:] 1682 | smiles_lst = [line.strip().strip('"') for line in lines] 1683 | 1684 | 1685 | 1686 | test case: 1687 | 1688 | smiles fragment 1689 | C1CCCC1 C1NCC1 1690 | C1=CC=CC=C1 C1CNCCN1 1691 | C1=CC=CC=C1 C1CCNCN1 1692 | CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1 1693 | ''' 1694 | 1695 | 1696 | --------------------------------------------------------------------------------