├── MIMOSA.png
├── result
│   ├── jnk.pkl
│   ├── qed.pkl
│   ├── jnkgsk.pkl
│   ├── qed_f_t.txt
│   ├── jnk_f_t.txt
│   └── jnkgsk_f_t.txt
├── src
│   ├── __pycache__
│   │   ├── dpp.cpython-37.pyc
│   │   ├── module.cpython-37.pyc
│   │   ├── utils.cpython-37.pyc
│   │   ├── chemutils.cpython-37.pyc
│   │   ├── gnn_layer.cpython-37.pyc
│   │   └── inference_utils.cpython-37.pyc
│   ├── utils.py
│   ├── download.py
│   ├── clean.py
│   ├── dpp.py
│   ├── vocabulary.py
│   ├── train.py
│   ├── evaluate.py
│   ├── gnn_layer.py
│   ├── module.py
│   ├── run.py
│   ├── inference_utils.py
│   └── chemutils.py
├── save_model
│   └── GNN_epoch_0_validloss_1.61160.ckpt
├── data
│   ├── vocabulary.txt
│   └── substructure.txt
├── conda.yml
├── mimosa.yml
└── README.md
/MIMOSA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/MIMOSA.png
--------------------------------------------------------------------------------
/result/jnk.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/result/jnk.pkl
--------------------------------------------------------------------------------
/result/qed.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/result/qed.pkl
--------------------------------------------------------------------------------
/result/jnkgsk.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/result/jnkgsk.pkl
--------------------------------------------------------------------------------
/src/__pycache__/dpp.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/dpp.cpython-37.pyc
--------------------------------------------------------------------------------
/src/__pycache__/module.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/module.cpython-37.pyc
--------------------------------------------------------------------------------
/src/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/utils.cpython-37.pyc
--------------------------------------------------------------------------------
/src/__pycache__/chemutils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/chemutils.cpython-37.pyc
--------------------------------------------------------------------------------
/src/__pycache__/gnn_layer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/gnn_layer.cpython-37.pyc
--------------------------------------------------------------------------------
/save_model/GNN_epoch_0_validloss_1.61160.ckpt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/save_model/GNN_epoch_0_validloss_1.61160.ckpt
--------------------------------------------------------------------------------
/src/__pycache__/inference_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/futianfan/MIMOSA/HEAD/src/__pycache__/inference_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/result/qed_f_t.txt:
--------------------------------------------------------------------------------
1 | 0.495 0.0
2 | 0.514 0.025
3 | 0.612 0.064
4 | 0.731 0.065
5 | 0.815 0.041
6 | 0.856 0.033
7 | 0.889 0.022
8 | 0.912 0.018
9 | 0.921 0.019
--------------------------------------------------------------------------------
/result/jnk_f_t.txt:
--------------------------------------------------------------------------------
1 | 0.0 0.0
2 | 0.008 0.011
3 | 0.021 0.023
4 | 0.044 0.034
5 | 0.060 0.035
6 | 0.070 0.042
7 | 0.085 0.051
8 | 0.101 0.057
9 | 0.115 0.057
10 | 0.122 0.059
11 | 0.136 0.047
--------------------------------------------------------------------------------
/result/jnkgsk_f_t.txt:
--------------------------------------------------------------------------------
1 | 0.015 0.0
2 | 0.022 0.010
3 | 0.038 0.031
4 | 0.065 0.040
5 | 0.086 0.043
6 | 0.103 0.039
7 | 0.131 0.039
8 | 0.144 0.046
9 | 0.151 0.037
10 | 0.170 0.038
11 | 0.182 0.036
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 |
class Molecule_Dataset(torch.utils.data.Dataset):
    """Map-style dataset that serves stored SMILES entries by position.

    No featurisation happens here: each item is handed back exactly as it was
    stored in ``smiles_lst``.
    """

    def __init__(self, smiles_lst):
        # Keep a reference to the caller's sequence; no copy is made.
        self.smiles_lst = smiles_lst

    def __getitem__(self, idx):
        """Return the entry stored at position ``idx``."""
        entries = self.smiles_lst
        return entries[idx]

    def __len__(self):
        """Return how many entries the dataset holds."""
        entries = self.smiles_lst
        return len(entries)
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/src/download.py:
--------------------------------------------------------------------------------
1 | from tdc.generation import MolGen
# Build the TDC molecule-generation dataset object for ZINC.
# NOTE(review): presumably constructing it downloads/caches the raw data as a
# side effect -- confirm against the PyTDC documentation.
data = MolGen(name = 'ZINC')
# Disabled export of a random sub-sample. It cannot simply be re-enabled:
# `data_size` is not defined anywhere in this script.
# from random import shuffle
# smiles_lst = data.get_data()['smiles'].to_list()
# shuffle(smiles_lst)
# smiles_lst = smiles_lst[:data_size]
# with open("data/zinc_" + str(data_size) + ".txt", 'w') as fout:
#     for smiles in smiles_lst:
#         fout.write(smiles + '\n')


"""
python src/download.py
"""
15 |
16 |
17 |
--------------------------------------------------------------------------------
/data/vocabulary.txt:
--------------------------------------------------------------------------------
1 | C 1158545
2 | O 500212
3 | N 280451
4 | C1=CC=CC=C1 257945
5 | F 79430
6 | S 51103
7 | Cl 42872
8 | C1=CC=NC=C1 27852
9 | C1CCCCC1 20256
10 | C1=CNN=C1 18920
11 | C1=CSC=C1 17515
12 | C1CCNCC1 15912
13 | C1CC1 15462
14 | C1CCCC1 14328
15 | Br 12722
16 | C1=CSC=N1 12617
17 | C1COCCN1 11924
18 | C1CNCCN1 11701
19 | C1=COC=C1 11274
20 | C1CCCN1 9739
21 | C1=CN=CN=C1 7964
22 | C1CC[NH+]CC1 7948
23 | C1CCNC1 7634
24 | C1CCCNC1 7277
25 | C1=CCCC=C1 6243
26 | C1=NN=CN1 5748
27 | C1CNCC1 5513
28 | C1CCOC1 5310
29 | C1=CNC=N1 5201
30 | C1=NOC=N1 5141
31 |
--------------------------------------------------------------------------------
/src/clean.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import os
3 | # from chemutils import vocabulary, smiles2word
4 | from chemutils import is_valid, logp_modifier
smiles_database = "data/zinc.tab"
clean_smiles_database = "data/zinc_clean.txt"


# Read the raw table, dropping the header row; each remaining row is one
# SMILES string that may be wrapped in double quotes.
with open(smiles_database, 'r') as fin:
    lines = fin.readlines()[1:]
smiles_lst = [i.strip().strip('"') for i in lines]

# Keep only the molecules accepted by `is_valid`.
clean_smiles_lst = [smiles for smiles in tqdm(smiles_lst) if is_valid(smiles)]
clean_smiles_set = set(clean_smiles_lst)

# De-duplicate in first-seen order. Iterating the set directly would write the
# file in a different order on every run (string hashing is randomised per
# process), which makes the cleaned data file non-reproducible.
unique_smiles_lst = list(dict.fromkeys(clean_smiles_lst))
with open(clean_smiles_database, 'w') as fout:
    fout.writelines(smiles + '\n' for smiles in unique_smiles_lst)
21 |
22 |
23 |
--------------------------------------------------------------------------------
/src/dpp.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import math
3 | np.random.seed(1)
4 |
class DPPModel(object):
    """Greedy subset selection driven by a score-weighted similarity kernel.

    With ``q = exp(f_scores) * lamb`` the kernel is
    ``kernel_matrix[i, j] = q[i] * sim_matrix[i, j] * q[j]``.

    Args:
        smiles_lst: the ``n`` candidate items; selected entries are returned
            from this sequence.
        sim_matrix: ``(n, n)`` array of pairwise similarities.
        f_scores: ``(n,)`` array-like of per-item scores.
        top_k: number of items to select.
        lamb: multiplier applied to ``exp(f_scores)`` and to ``log_det_V``.

    Attributes:
        log_det_V: ``sum(f_scores) * lamb``.
        log_det_S: log-determinant of ``kernel_matrix`` (``-inf`` when the
            kernel is singular, ``nan`` when its determinant is negative).

    NOTE(review): ``log_det_V`` scales the raw scores by ``lamb`` while the
    kernel scales ``exp(score)`` by ``lamb`` -- confirm this is intended.
    """

    def __init__(self, smiles_lst, sim_matrix, f_scores, top_k, lamb):
        self.smiles_lst = smiles_lst
        self.sim_matrix = sim_matrix  # (n, n)
        self.lamb = lamb
        self.f_scores = np.exp(f_scores) * self.lamb  # (n,)
        self.max_iter = top_k
        self.n = len(smiles_lst)
        self.kernel_matrix = self.f_scores.reshape((self.n, 1)) \
            * sim_matrix * self.f_scores.reshape((1, self.n))
        self.log_det_V = np.sum(f_scores) * self.lamb
        self.log_det_S = self._log_det(self.kernel_matrix)

    @staticmethod
    def _log_det(matrix):
        """Return ``log(det(matrix))`` without forming the determinant itself.

        ``np.log(np.linalg.det(...))`` underflows to ``-inf`` once the
        determinant drops below the float64 range, which happens quickly for
        kernels with many small entries. ``slogdet`` works in log space, and
        it also avoids ``np.mat``, which newer NumPy releases no longer ship.
        """
        sign, logabsdet = np.linalg.slogdet(np.asarray(matrix))
        if sign < 0:
            # Same outcome as taking the log of a negative determinant.
            return float('nan')
        return logabsdet  # already -inf when the matrix is singular

    def dpp(self):
        """Greedily pick items, always taking the largest remaining gain.

        Returns:
            ``(selected, log_det_V, log_det_S)`` where ``selected`` lists the
            chosen entries of ``smiles_lst`` in selection order.
        """
        # Never ask for more items than exist: once every index is taken the
        # gains are all zero and argmax would keep re-appending index 0.
        n_select = min(self.max_iter, self.n)
        c = np.zeros((self.max_iter, self.n))
        d = np.copy(np.diag(self.kernel_matrix))  # per-item marginal gain
        j = np.argmax(d)
        Yg = [j]
        remaining = set(range(self.n))
        remaining.discard(j)
        _iter = 0
        while len(Yg) < n_select:
            for i in remaining:
                # For _iter == 0 the dot product runs over empty slices and
                # is 0.0, so one formula covers the first step as well.
                ei = (self.kernel_matrix[j, i]
                      - np.dot(c[:_iter, j], c[:_iter, i])) / np.sqrt(d[j])
                c[_iter, i] = ei
                d[i] = d[i] - ei * ei
            d[j] = 0
            j = np.argmax(d)
            Yg.append(j)
            remaining.discard(j)
            _iter += 1

        return [self.smiles_lst[i] for i in Yg], self.log_det_V, self.log_det_S
40 |
41 |
42 |
if __name__ == "__main__":
    # Smoke test on random data: 100 items with unit-norm 5-d embeddings.
    # NOTE: the similarity matrix therefore has rank <= 5, so the kernel is
    # singular and log_det_S is not meaningful here; only the selection is
    # printed.
    rank_score = np.random.random(size=(100))
    item_embedding = np.random.randn(100, 5)
    item_embedding = item_embedding / np.linalg.norm(item_embedding, axis=1, keepdims=True)
    sim_matrix = np.dot(item_embedding, item_embedding.T)

    # `lamb` is a required constructor argument; omitting it raised TypeError.
    dpp = DPPModel(smiles_lst=list(range(100)), sim_matrix=sim_matrix,
                   f_scores=rank_score, top_k=10, lamb=1.0)
    # dpp() returns (selected items, log_det_V, log_det_S).
    Yg, log_det_V, log_det_S = dpp.dpp()
    print(Yg)
52 |
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/src/vocabulary.py:
--------------------------------------------------------------------------------
1 | # from chemutils import smiles2word
2 |
3 | import os
4 | from collections import defaultdict
5 | from tqdm import tqdm
6 | from rdkit import Chem, DataStructs
7 |
8 |
def smiles2mol(smiles):
    """Parse ``smiles`` with RDKit and kekulize the molecule in place.

    Returns the molecule, or ``None`` when ``Chem.MolFromSmiles`` returns
    ``None`` for the input.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        Chem.Kekulize(mol)
    return mol
15 |
16 | ## input: smiles, output: word lst;
def smiles2word(smiles):
    """Split a molecule into substructure "words".

    The result lists one kekulized fragment SMILES per ring reported by
    ``Chem.GetSymmSSSR`` first, followed by the element symbol of every atom
    that is not in a ring. Returns ``None`` when ``smiles2mol`` cannot build
    a molecule from the input.
    """
    mol = smiles2mol(smiles)
    if mol is None:
        return None

    ring_atom_idx = [list(ring) for ring in Chem.GetSymmSSSR(mol)]
    ring_words = [
        Chem.MolFragmentToSmiles(mol, atom_idx, kekuleSmiles=True)
        for atom_idx in ring_atom_idx
    ]
    chain_words = [atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()]
    return ring_words + chain_words
30 |
31 |
32 |
all_vocabulary_file = "data/substructure.txt"
rawdata_file = "data/zinc.tab"
select_vocabulary_file = "data/vocabulary.txt"

if not os.path.exists(all_vocabulary_file):
    # First run: count every substructure word over the raw table (header row
    # skipped) and cache the full count table, most frequent first.
    with open(rawdata_file) as fin:
        lines = fin.readlines()[1:]
    smiles_lst = [line.strip().strip('"') for line in lines]
    word2cnt = defaultdict(int)
    for smiles in tqdm(smiles_lst):
        word_lst = smiles2word(smiles)
        if word_lst is None:
            # smiles2word returns None for molecules it cannot parse;
            # iterating over that would raise TypeError and abort the run.
            continue
        for word in word_lst:
            word2cnt[word] += 1
    word_cnt_lst = sorted(word2cnt.items(), key=lambda x: x[1], reverse=True)

    with open(all_vocabulary_file, 'w') as fout:
        for word, cnt in word_cnt_lst:
            fout.write(word + '\t' + str(cnt) + '\n')
else:
    # Later runs: reuse the cached "<word>\t<count>" table.
    with open(all_vocabulary_file, 'r') as fin:
        lines = fin.readlines()
    word_cnt_lst = []
    for line in lines:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            # Tolerate blank or malformed lines (e.g. a trailing empty line).
            continue
        word_cnt_lst.append((fields[0], int(fields[1])))


# The selected vocabulary keeps only words seen more than 5000 times.
word_cnt_lst = [(word, cnt) for word, cnt in word_cnt_lst if cnt > 5000]
print(len(word_cnt_lst))

with open(select_vocabulary_file, 'w') as fout:
    for word, cnt in word_cnt_lst:
        fout.write(word + '\t' + str(cnt) + '\n')
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
/conda.yml:
--------------------------------------------------------------------------------
1 | name: mimosa
2 | channels:
3 | - rdkit
4 | - soumith
5 | - defaults
6 | dependencies:
7 | - _libgcc_mutex=0.1=main
8 | - blas=1.0=mkl
9 | - bzip2=1.0.8=h7b6447c_0
10 | - ca-certificates=2021.1.19=h06a4308_0
11 | - cairo=1.14.12=h8948797_3
12 | - certifi=2020.12.5=py37h06a4308_0
13 | - fontconfig=2.13.0=h9420a91_0
14 | - freetype=2.10.4=h5ab3b9f_0
15 | - glib=2.66.1=h92f7085_0
16 | - icu=58.2=he6710b0_3
17 | - intel-openmp=2020.2=254
18 | - jpeg=9b=h024ee3a_2
19 | - lcms2=2.11=h396b838_0
20 | - ld_impl_linux-64=2.33.1=h53a641e_7
21 | - libboost=1.73.0=h3ff78a5_11
22 | - libedit=3.1.20191231=h14c3975_1
23 | - libffi=3.3=he6710b0_2
24 | - libgcc-ng=9.1.0=hdf63c60_0
25 | - libpng=1.6.37=hbc83047_0
26 | - libstdcxx-ng=9.1.0=hdf63c60_0
27 | - libtiff=4.1.0=h2733197_1
28 | - libuuid=1.0.3=h1bed415_2
29 | - libxcb=1.14=h7b6447c_0
30 | - libxml2=2.9.10=hb55368b_3
31 | - lz4-c=1.9.3=h2531618_0
32 | - mkl=2020.2=256
33 | - mkl-service=2.3.0=py37he8ac12f_0
34 | - mkl_fft=1.2.0=py37h23d657b_0
35 | - mkl_random=1.1.1=py37h0573a6f_0
36 | - ncurses=6.2=he6710b0_1
37 | - numpy=1.19.2=py37h54aff64_0
38 | - numpy-base=1.19.2=py37hfa32c7d_0
39 | - olefile=0.46=py_0
40 | - openssl=1.1.1i=h27cfd23_0
41 | - pandas=1.2.1=py37ha9443f7_0
42 | - pcre=8.44=he6710b0_0
43 | - pillow=8.1.0=py37he98fc37_0
44 | - pip=20.3.3=py37h06a4308_0
45 | - pixman=0.40.0=h7b6447c_0
46 | - py-boost=1.73.0=py37ha9443f7_11
47 | - python=3.7.9=h7579374_0
48 | - python-dateutil=2.8.1=py_0
49 | - pytz=2020.5=pyhd3eb1b0_0
50 | - rdkit=2020.09.1.0=py37hd50e099_1
51 | - readline=8.0=h7b6447c_0
52 | - setuptools=52.0.0=py37h06a4308_0
53 | - six=1.15.0=pyhd3eb1b0_0
54 | - sqlite=3.33.0=h62c20be_0
55 | - tk=8.6.10=hbc83047_0
56 | - wheel=0.36.2=pyhd3eb1b0_0
57 | - xz=5.2.5=h7b6447c_0
58 | - zlib=1.2.11=h7b6447c_3
59 | - zstd=1.4.5=h9ceee32_0
60 | - pip:
61 | - chardet==4.0.0
62 | - cycler==0.10.0
63 | - decorator==4.4.2
64 | - fuzzywuzzy==0.18.0
65 | - idna==2.10
66 | - joblib==1.0.0
67 | - kiwisolver==1.3.1
68 | - matplotlib==3.3.4
69 | - networkx==2.5
70 | - pyparsing==2.4.7
71 | - pytdc==0.1.5
72 | - requests==2.25.1
73 | - scikit-learn==0.23.2
74 | - scipy==1.6.0
75 | - threadpoolctl==2.1.0
76 | - torch==1.7.1
77 | - torchvision==0.8.2
78 | - tqdm==4.56.0
79 | - typing-extensions==3.7.4.3
80 | - urllib3==1.26.3
81 |
82 |
--------------------------------------------------------------------------------
/mimosa.yml:
--------------------------------------------------------------------------------
1 | name: mimosa
2 | channels:
3 | - rdkit
4 | - pytorch
5 | - anaconda
6 | - defaults
7 | dependencies:
8 | - blas=1.0=mkl
9 | - bzip2=1.0.8=h1de35cc_0
10 | - ca-certificates=2020.1.1=0
11 | - cairo=1.14.12=hc4e6be7_4
12 | - certifi=2020.4.5.1=py37_0
13 | - cffi=1.14.0=py37hb5b8e2f_0
14 | - fontconfig=2.13.0=h5d5b041_1
15 | - freetype=2.9.1=hb4e5f40_0
16 | - gettext=0.19.8.1=h15daf44_3
17 | - glib=2.63.1=hd977a24_0
18 | - icu=58.2=h0a44026_3
19 | - intel-openmp=2019.4=233
20 | - joblib=0.14.1=py_0
21 | - jpeg=9b=he5867d9_2
22 | - libboost=1.67.0=hebc422b_4
23 | - libcxx=4.0.1=hcfea43d_1
24 | - libcxxabi=4.0.1=hcfea43d_1
25 | - libedit=3.1.20181209=hb402a30_0
26 | - libffi=3.2.1=h0a44026_6
27 | - libgfortran=3.0.1=h93005f0_2
28 | - libiconv=1.16=h1de35cc_0
29 | - libpng=1.6.37=ha441bb4_0
30 | - libtiff=4.1.0=hcb84e12_0
31 | - libxml2=2.9.9=hf6e021a_1
32 | - llvm-openmp=4.0.1=hcfea43d_1
33 | - mkl=2019.4=233
34 | - mkl-service=2.3.0=py37hfbe908c_0
35 | - mkl_fft=1.0.15=py37h5e564d8_0
36 | - mkl_random=1.1.0=py37ha771720_0
37 | - ncurses=6.2=h0a44026_1
38 | - ninja=1.9.0=py37h04f5b5a_0
39 | - numpy=1.18.1=py37h7241aed_0
40 | - numpy-base=1.18.1=py37h6575580_1
41 | - olefile=0.46=py37_0
42 | - openssl=1.1.1g=h1de35cc_0
43 | - pandas=1.0.3=py37h6c726b0_0
44 | - pcre=8.43=h0a44026_0
45 | - pillow=7.0.0=py37h4655f20_0
46 | - pip=20.0.2=py37_1
47 | - pixman=0.38.0=h1de35cc_0
48 | - py-boost=1.67.0=py37h6440ff4_4
49 | - pycparser=2.20=py_0
50 | - python=3.7.7=hc70fcce_0_cpython
51 | - python-dateutil=2.8.1=py_0
52 | - pytorch=1.0.1=py3.7_2
53 | - pytz=2019.3=py_0
54 | - rdkit=2020.03.1.0=py37h65625ec_1
55 | - readline=8.0=h1de35cc_0
56 | - scikit-learn=0.21.3=py37h27c97d8_0
57 | - scipy=1.4.1=py37h9fa6033_0
58 | - setuptools=46.1.3=py37_0
59 | - six=1.14.0=py37_0
60 | - sqlite=3.31.1=h5c1f38d_1
61 | - tk=8.6.8=ha441bb4_0
62 | - tqdm=4.45.0=py_0
63 | - wheel=0.34.2=py37_0
64 | - xz=5.2.5=h1de35cc_0
65 | - zlib=1.2.11=h1de35cc_3
66 | - zstd=1.3.7=h5bba6e5_0
67 | - pip:
68 | - decorator==4.4.2
69 | - isodate==0.6.0
70 | - molvs==0.1.1
71 | - networkx==2.4
72 | - plyfile==0.7.2
73 | - protobuf==3.11.3
74 | - pyparsing==2.4.7
75 | - rdflib==5.0.0
76 | - tensorboardx==2.0
77 | - torch-cluster==1.2.4
78 | - torch-geometric==1.0.3
79 | - torch-scatter==1.1.2
80 | - torch-sparse==0.2.4
81 | - torch-spline-conv==1.0.6
82 |
83 |
--------------------------------------------------------------------------------
/src/train.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import numpy as np
6 | from tqdm import tqdm
7 | from matplotlib import pyplot as plt
8 | import pickle
9 | from random import shuffle
10 | torch.manual_seed(4)
11 | np.random.seed(2)
12 | from module import GCN
13 | from chemutils import smiles2graph, vocabulary, smiles2feature
14 | from utils import Molecule_Dataset
15 |
16 |
import os
import random

device = 'cpu'
data_file = "data/zinc_clean.txt"
with open(data_file, 'r') as fin:
    lines = fin.readlines()

# torch and numpy are seeded above, but the train/valid split is drawn from
# the stdlib RNG; seed it too so the split is the same on every run.
random.seed(1)
shuffle(lines)
lines = [line.strip() for line in lines]
N = int(len(lines) * 0.9)  # 90% train / 10% validation
train_data = lines[:N]
valid_data = lines[N:]


training_set = Molecule_Dataset(train_data)
valid_set = Molecule_Dataset(valid_data)
params = {'batch_size': 1,
          'shuffle': True,
          'num_workers': 1}


def collate_fn(batch_lst):
    """Hand the batch back as-is so each batch stays a plain list of items."""
    return batch_lst


train_generator = torch.utils.data.DataLoader(training_set, collate_fn = collate_fn, **params)
valid_generator = torch.utils.data.DataLoader(valid_set, collate_fn = collate_fn, **params)

gnn = GCN(nfeat = 50, nhid = 100, num_layer = 3).to(device)
print('GNN is built!')

cost_lst = []
valid_loss_lst = []
epoch = 5
every_k_iters = 5000
save_folder = "save_model/GNN_epoch_"
# torch.save does not create missing directories.
os.makedirs(os.path.dirname(save_folder), exist_ok=True)
for ep in tqdm(range(epoch)):
    for i, smiles in tqdm(enumerate(train_generator)):
        ### 1. training
        smiles = smiles[0]  # batch_size is 1, so unwrap the single item
        node_mat, adjacency_matrix, idx, label = smiles2feature(smiles)  ### smiles2feature: only mask leaf node
        node_mat = torch.FloatTensor(node_mat).to(device)
        adjacency_matrix = torch.FloatTensor(adjacency_matrix).to(device)
        label = torch.LongTensor([label]).view(-1).to(device)
        # learn() returns (cost, prediction); record only the cost.
        cost, _ = gnn.learn(node_mat, adjacency_matrix, idx, label)
        cost_lst.append(cost)

        #### 2. validation (also runs at i == 0, i.e. at the start of each epoch)
        if i % every_k_iters == 0:
            gnn.eval()
            valid_loss, valid_num = 0, 0
            # No parameter update happens here, so skip autograd bookkeeping.
            with torch.no_grad():
                for smiles in valid_generator:
                    smiles = smiles[0]
                    node_mat, adjacency_matrix, idx, label = smiles2feature(smiles)
                    node_mat = torch.FloatTensor(node_mat).to(device)
                    adjacency_matrix = torch.FloatTensor(adjacency_matrix).to(device)
                    label = torch.LongTensor([label]).view(-1).to(device)
                    cost, _ = gnn.infer(node_mat, adjacency_matrix, idx, label)
                    valid_loss += cost
                    valid_num += 1
            valid_loss = valid_loss / valid_num
            valid_loss_lst.append(valid_loss)
            file_name = save_folder + str(ep) + "_validloss_" + str(valid_loss)[:7] + ".ckpt"
            torch.save(gnn, file_name)
            gnn.train()
84 |
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/src/evaluate.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import numpy as np
3 | from time import time
4 | from tqdm import tqdm
5 | from matplotlib import pyplot as plt
6 | import pickle
7 | from random import shuffle
8 | import torch
9 | import torch.nn as nn
10 | import torch.nn.functional as F
11 | from tdc import Oracle
12 | torch.manual_seed(1)
13 | np.random.seed(2)
14 | from tdc import Evaluator
15 |
16 | from chemutils import *
17 | ## 2. data and oracle
18 | # qed = Oracle(name = 'qed')
19 | # logp = Oracle(name = 'logp')
20 | # jnk = Oracle(name = 'JNK3')
21 | # gsk = Oracle(name = 'GSK3B')
22 | # def foracle(smiles):
23 | # return logp(smiles)
24 |
# Name of the oracle whose result pickle is evaluated; first CLI argument.
# 'jnkgsk', 'qedsajnkgsk', 'qed', 'jnk', 'gsk'
oracle_name = sys.argv[1]

# TDC evaluators used by the report below.
novelty = Evaluator(name = 'Novelty')
diversity = Evaluator(name = 'Diversity')

# Reference molecules for the novelty score: the first whitespace-separated
# token of each line, truncated to the first 1000 lines of the cleaned file.
file = "data/zinc_clean.txt"
with open(file, 'r') as fin:
    lines = fin.readlines()
train_smiles_lst = [line.strip().split()[0] for line in lines]
train_smiles_lst = train_smiles_lst[:1000]
37 |
38 |
39 | ## 5. run
if __name__ == "__main__":

    pkl_file = "result/"+oracle_name+".pkl"
    # NOTE: unpickling executes arbitrary code; only load result files that
    # were produced by your own runs.
    # Use a context manager so the file handle is closed after loading.
    with open(pkl_file, 'rb') as pkl_in:
        idx_2_smiles2f, trace_dict = pickle.load(pkl_in)
    topk = 100

    # Merge the per-generation {smiles: score} dicts into one table; a SMILES
    # seen in several generations keeps the score of the last one iterated.
    whole_smiles2f = dict()
    for idx, (smiles2f, current_set) in tqdm(idx_2_smiles2f.items()):
        whole_smiles2f.update(smiles2f)

    # Rank every scored molecule and keep the top-k.
    smiles_f_lst = sorted(whole_smiles2f.items(), key=lambda x: x[1], reverse=True)
    best_smiles_lst = [smiles for smiles, f in smiles_f_lst[:topk]]
    best_f_lst = [f for smiles, f in smiles_f_lst[:topk]]
    avg, std = np.mean(best_f_lst), np.std(best_f_lst)
    print('average of top-'+str(topk), str(avg)[:5], str(std)[:5])
    #### evaluate novelty
    t1 = time()
    nov = novelty(best_smiles_lst, train_smiles_lst)
    t2 = time()
    print("novelty", nov, "takes", str(int(t2-t1)), 'seconds')

    ### evaluate diversity
    t1 = time()
    div = diversity(best_smiles_lst)
    t2 = time()
    print("diversity", div, 'takes', str(int(t2-t1)), 'seconds')
75 |
76 |
77 | # ### evaluate mean of property
78 | # for oracle_name in oracle_lst:
79 | # oracle = Oracle(name = oracle_name)
80 | # scores = oracle(best_smiles_lst)
81 | # avg = np.mean(scores)
82 | # std = np.std(scores)
83 | # print(oracle_name, str(avg)[:7], str(std)[:7])
84 |
85 | # for ii,smiles in enumerate(best_smiles_lst[:20]):
86 | # print(smiles, str(gsk(smiles)))
87 | # draw_smiles(smiles, "figure/best_"+oracle_name+"_"+str(ii)+'.png')
88 |
89 |
90 |
91 |
92 |
93 |
--------------------------------------------------------------------------------
/src/gnn_layer.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import numpy as np
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from torch.nn.parameter import Parameter
7 | from torch.nn.modules.module import Module
8 | torch.manual_seed(3)
9 | np.random.seed(1)
10 |
class GraphConvolution(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907

    Computes ``adj @ (input @ weight)`` and adds ``bias`` when one exists.
    ``init`` picks the initialisation: 'uniform', 'xavier' or 'kaiming';
    any other value raises ``NotImplementedError``.
    """

    def __init__(self, in_features, out_features, bias=True, init='xavier'):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if not bias:
            self.register_parameter('bias', None)
        else:
            self.bias = Parameter(torch.FloatTensor(out_features))

        # scheme name -> (banner printed at construction, initialiser)
        schemes = {
            'uniform': ("| Uniform Initialization", self.reset_parameters_uniform),
            'xavier': ("| Xavier Initialization", self.reset_parameters_xavier),
            'kaiming': ("| Kaiming Initialization", self.reset_parameters_kaiming),
        }
        if init not in schemes:
            raise NotImplementedError
        banner, initialise = schemes[init]
        print(banner)
        initialise()

    def reset_parameters_uniform(self):
        """Draw weight and bias from U(-b, b) with b = 1 / sqrt(out_features)."""
        bound = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-bound, bound)
        if self.bias is not None:
            self.bias.data.uniform_(-bound, bound)

    def reset_parameters_xavier(self):
        """Xavier *normal* weights with gain 0.02; bias set to zero."""
        nn.init.xavier_normal_(self.weight.data, gain=0.02)
        if self.bias is not None:
            nn.init.constant_(self.bias.data, 0.0)

    def reset_parameters_kaiming(self):
        """Kaiming normal weights (a=0, fan_in mode); bias set to zero."""
        nn.init.kaiming_normal_(self.weight.data, a=0, mode='fan_in')
        if self.bias is not None:
            nn.init.constant_(self.bias.data, 0.0)

    def forward(self, input, adj):
        """Project the node features, then aggregate them through ``adj``."""
        projected = torch.mm(input, self.weight)
        aggregated = torch.spmm(adj, projected)
        if self.bias is None:
            return aggregated
        return aggregated + self.bias

    def __repr__(self):
        return '{} ({} -> {})'.format(
            self.__class__.__name__, self.in_features, self.out_features)
66 |
67 |
class GraphAttention(nn.Module):
    """
    Simple GAT layer, similar to https://arxiv.org/abs/1710.10903

    Attention logits are ``leakyrelu(h @ a1 + (h @ a2).T)`` with
    ``h = input @ W``; entries where ``adj <= 0`` are masked out before the
    row-wise softmax. With ``concat=True`` the output goes through ELU.
    """

    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super(GraphAttention, self).__init__()
        self.dropout = dropout
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.concat = concat

        def make_param(rows, cols):
            # Parameters are created as CUDA tensors whenever CUDA is
            # available, otherwise as CPU float tensors.
            tensor_type = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
            raw = torch.Tensor(rows, cols).type(tensor_type)
            return nn.Parameter(nn.init.xavier_normal_(raw, gain=np.sqrt(2.0)), requires_grad=True)

        self.W = make_param(in_features, out_features)
        self.a1 = make_param(out_features, 1)
        self.a2 = make_param(out_features, 1)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, input, adj):
        h = torch.mm(input, self.W)

        # (N, 1) + (1, N) broadcasts to one logit per ordered node pair.
        src_score = torch.matmul(h, self.a1)
        dst_score = torch.matmul(h, self.a2)
        e = self.leakyrelu(src_score + dst_score.transpose(0, 1))

        # A very negative logit gives non-edges ~zero weight after softmax.
        masked = torch.where(adj > 0, e, -9e15 * torch.ones_like(e))
        weights = F.softmax(masked, dim=1)
        weights = F.dropout(weights, self.dropout, training=self.training)
        h_prime = torch.matmul(weights, h)

        return F.elu(h_prime) if self.concat else h_prime

    def __repr__(self):
        return '{} ({} -> {})'.format(
            self.__class__.__name__, self.in_features, self.out_features)
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
--------------------------------------------------------------------------------
/src/module.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from copy import deepcopy
5 | from torch.autograd import Variable
6 | from torch.utils import data
7 | from torch.utils.data import SequentialSampler
8 | import matplotlib.pyplot as plt
9 | import numpy as np
10 | sigmoid = torch.nn.Sigmoid()
11 | from tqdm import tqdm
12 |
13 | from gnn_layer import GraphConvolution, GraphAttention
14 | from chemutils import smiles2graph, vocabulary
15 |
16 | torch.manual_seed(4)
17 | np.random.seed(1)
18 |
19 | # def sigmoid(x):
20 | # return 1/(1+np.exp(-x))
21 | # device = 'cuda' if torch.cuda.is_available() else 'cpu'
22 | device = 'cpu'
23 |
class GCN(nn.Module):
    """Graph convolutional network that scores vocabulary entries for a node.

    Pipeline: a linear embedding of the node matrix, ``1 + num_layer`` graph
    convolutions with ReLU, then a linear head over the vocabulary applied to
    the node selected by ``idx``. The module owns its Adam optimiser and its
    cross-entropy criterion (see ``learn`` / ``infer``).

    Args:
        nfeat: width of the node embedding.
        nhid: width of every graph-convolution layer.
        num_layer: number of hidden graph convolutions after the first one.
    """

    def __init__(self, nfeat, nhid, num_layer):
        super(GCN, self).__init__()
        self.gc1 = GraphConvolution(in_features = nfeat, out_features = nhid)
        # Must be an nn.ModuleList: layers held in a plain Python list are not
        # registered, so they were missing from self.parameters() (never
        # optimised), from .to(device) and from state_dict().
        self.gcs = nn.ModuleList(
            [GraphConvolution(in_features = nhid, out_features = nhid) for _ in range(num_layer)]
        )
        self.vocabulary_size = len(vocabulary)
        self.out_fc = nn.Linear(nhid, self.vocabulary_size)
        self.nfeat = nfeat
        self.nhid = nhid
        self.num_layer = num_layer
        # Node rows have vocabulary_size + 1 columns.
        self.embedding = nn.Linear(self.vocabulary_size + 1, nfeat)
        self.criteria = torch.nn.CrossEntropyLoss()
        # Created last so that it sees every registered parameter.
        self.opt = torch.optim.Adam(self.parameters(), lr=1e-3, betas=(0.9, 0.99))
        self.device = device
        self.to(device)  # nn.Module.to moves parameters in place

    def switch_device(self, device):
        """Move the model to ``device`` and remember it for later inputs."""
        self.device = device
        self.to(device)

    def _propagate(self, node_mat, adj):
        """Embed the nodes and run every graph-convolution layer with ReLU."""
        node_mat, adj = node_mat.to(self.device), adj.to(self.device)
        x = self.embedding(node_mat)
        x = F.relu(self.gc1(x, adj))
        for gc in self.gcs:
            x = F.relu(gc(x, adj))
        return x

    def forward(self, node_mat, adj, idx):
        ''' N: # substructure & d: vocabulary size
            Input:
                node_mat:  [N,d+1]
                adj:    [N,N]
                idx:  index of the node to predict for

            Output:
                unnormalised logits over the vocabulary, shape [1,d]
                (no softmax / sigmoid applied)
        '''
        x = self._propagate(node_mat, adj)
        x = x[idx].view(1,-1)
        logits = self.out_fc(x)
        return logits

    def smiles2embed(self, smiles):
        """Return the mean node representation of ``smiles`` (length ``nhid``)."""
        idx_lst, node_mat, substructure_lst, atomidx_2substridx, adj, leaf_extend_idx_pair = smiles2graph(smiles)
        node_mat = torch.FloatTensor(node_mat)
        adj = torch.FloatTensor(adj)
        x = self._propagate(node_mat, adj)
        return torch.mean(x, 0)

    def smiles2pred(self, smiles):
        # NOTE(review): this looks like leftover code. It passes an all-ones
        # tensor as `idx`, and `.item()` only works on a single-element
        # result while forward() yields one logit per vocabulary entry.
        # Logic kept as found; confirm the intended behaviour before use.
        idx_lst, node_mat, substructure_lst, atomidx_2substridx, adj, leaf_extend_idx_pair = smiles2graph(smiles)
        idx_vec = torch.LongTensor(idx_lst).to(device)
        node_mat = torch.FloatTensor(node_mat).to(device)
        adj = torch.FloatTensor(adj).to(device)
        weight = torch.ones_like(idx_vec).to(device)
        logits = self.forward(node_mat, adj, weight)
        pred = torch.sigmoid(logits)
        return pred.item()

    def learn(self, node_mat, adj, idx, label):
        """Run one optimisation step on a single example.

        Returns:
            ``(cost, pred_y)`` as numpy arrays: the cross-entropy loss and
            the logits of shape [1, vocabulary_size].
        """
        pred_y = self.forward(node_mat, adj, idx).view(1,-1)
        cost = self.criteria(pred_y, label.to(self.device))
        self.opt.zero_grad()
        cost.backward()
        self.opt.step()
        # detach().cpu() also works when the model lives on a GPU.
        return cost.detach().cpu().numpy(), pred_y.detach().cpu().numpy()

    def infer(self, node_mat, adj, idx, target):
        """Compute loss and logits for one example without updating weights.

        Returns:
            ``(cost, pred_y)`` as numpy arrays, as in ``learn``.
        """
        # No backward pass follows, so skip building the autograd graph.
        with torch.no_grad():
            pred_y = self.forward(node_mat, adj, idx).view(1,-1)
            cost = self.criteria(pred_y, target.to(self.device))
        return cost.cpu().numpy(), pred_y.cpu().numpy()
108 |
109 |
if __name__ == "__main__":
    # Construction smoke test only: builds the model; nothing is trained or evaluated.
    gnn = GCN(nfeat = 50, nhid = 100, num_layer = 2)
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
--------------------------------------------------------------------------------
/src/run.py:
--------------------------------------------------------------------------------
1 | import os, pickle, torch, random
2 | import numpy as np
3 | import argparse
4 | from time import time
5 | from tqdm import tqdm
6 | from matplotlib import pyplot as plt
7 | from random import shuffle
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 | from tdc import Oracle
11 | torch.manual_seed(1)
12 | np.random.seed(2)
13 | random.seed(1)
14 | from chemutils import *
15 | from inference_utils import *
16 |
17 |
def optimization(start_smiles_lst, gnn, oracle, oracle_num, oracle_name, generations, population_size, lamb, topk, epsilon, result_pkl):
    """Run the generation loop that expands, scores and re-selects molecules.

    Each generation expands every molecule of the current population with
    ``optimize_single_molecule_one_iterate``, scores all proposals with the
    (memoised) oracle, and selects the next population with ``dpp``.

    Args:
        start_smiles_lst: SMILES strings forming the initial population.
        gnn: model handed to ``optimize_single_molecule_one_iterate``.
        oracle: callable ``smiles -> score``.
        oracle_num: the loop stops once more than this many distinct SMILES
            have been scored.
        oracle_name: used to name the ``result/<oracle_name>_f_t.txt`` log.
        generations: maximum number of generations.
        population_size: number of molecules kept per generation.
        lamb: passed through to ``dpp``.
        topk, epsilon: accepted for interface compatibility; unused here.
        result_pkl: path where ``(idx_2_smiles2f, trace_dict)`` is pickled
            after every generation.
    """
    smiles2score = dict()  ### its size is the number of distinct oracle calls

    def oracle_new(smiles):
        # Memoise so that each distinct SMILES costs one oracle call.
        if smiles not in smiles2score:
            smiles2score[smiles] = oracle(smiles)
        return smiles2score[smiles]

    trace_dict = dict()  ### offspring -> the ancestor it was first derived from
    # Only needed by the optional "no repeated molecule" filter noted below.
    existing_set = set(start_smiles_lst)
    current_set = set(start_smiles_lst)
    average_f = np.mean([oracle_new(smiles) for smiles in current_set])
    f_lst = [(average_f, 0.0)]
    idx_2_smiles2f = {}
    smiles2f_new = {smiles: oracle_new(smiles) for smiles in start_smiles_lst}
    idx_2_smiles2f[-1] = smiles2f_new, current_set
    for i_gen in tqdm(range(generations)):
        next_set = set()
        for smiles in current_set:
            smiles_set = optimize_single_molecule_one_iterate(smiles, gnn)
            for smi in smiles_set:
                # Keep the first ancestor seen for each offspring.
                trace_dict.setdefault(smi, smiles)
            next_set.update(smiles_set)
        # To forbid repeated molecules: next_set -= existing_set
        smiles_score_lst = oracle_screening(next_set, oracle_new)  ### sorted by score, best first
        print(smiles_score_lst[:5], "Oracle num", len(smiles2score))

        # Option I (plain top-k) would be:
        #   current_set = [i[0] for i in smiles_score_lst[:population_size]]
        # Option II: DPP selection
        current_set, _, _ = dpp(smiles_score_lst=smiles_score_lst, num_return=population_size, lamb=lamb)
        existing_set.update(next_set)

        # save; the context manager closes the handle every generation
        smiles2f_new = {smiles: score for smiles, score in smiles_score_lst}
        idx_2_smiles2f[i_gen] = smiles2f_new, current_set
        with open(result_pkl, 'wb') as pkl_out:
            pickle.dump((idx_2_smiles2f, trace_dict), pkl_out)

        #### mean / std of the oracle score over the selected population
        score_lst = [smiles2f_new[smiles] for smiles in current_set]
        average_f = np.mean(score_lst)
        std_f = np.std(score_lst)
        f_lst.append((average_f, std_f))
        str_f_lst = [str(i[0])[:5] + '\t' + str(i[1])[:5] for i in f_lst]
        with open("result/" + oracle_name + "_f_t.txt", 'w') as fout:
            fout.write('\n'.join(str_f_lst))
        if len(smiles2score) > oracle_num:
            break
65 |
def main():
    """Parse the command line, build the requested oracle and run the search.

    ``--oracle_name`` selects a single-property oracle (``qed``, ``jnk``,
    ``gsk``, ``logp``) or the mean of several (``jnkgsk``, ``qedsajnkgsk``).
    Only the TDC oracles the chosen task needs are instantiated.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--oracle_num', type=int, default=1500)
    # 'logp' is listed so that the logp oracle below is actually selectable.
    parser.add_argument('--oracle_name', type=str, default="qed",
                        choices=['jnkgsk', 'qedsajnkgsk', 'qed', 'jnk', 'gsk', 'logp'])
    parser.add_argument('--generations', type=int, default=50)
    parser.add_argument('--population_size', type=int, default=20)
    args = parser.parse_args()

    oracle_num = args.oracle_num
    oracle_name = args.oracle_name
    generations = args.generations
    population_size = args.population_size

    start_smiles_lst = ['C1(N)=NC=CC=N1']  ## 'C1=CC=CC=C1NC2=NC=CC=N2'

    # Short property key -> name understood by ``tdc.Oracle``.
    tdc_name = {'qed': 'qed', 'sa': 'sa', 'jnk': 'JNK3', 'gsk': 'GSK3B', 'logp': 'logp'}
    # Task -> properties whose scores are averaged (a single one is used as is).
    task_components = {
        'jnkgsk': ('jnk', 'gsk'),
        'qedsajnkgsk': ('qed', 'sa', 'jnk', 'gsk'),
        'qed': ('qed',),
        'jnk': ('jnk',),
        'gsk': ('gsk',),
        'logp': ('logp',),
    }

    # Constants of the SA normalisation below.
    mu = 2.230044
    sigma = 0.6526308

    def normalize_sa(sa_score):
        # Raw SA scores at or below ``mu`` map to 1; larger ones decay towards 0.
        mod_score = np.maximum(sa_score, mu)
        return np.exp(-0.5 * np.power((mod_score - mu) / sigma, 2.))

    def build_scorer(key):
        raw = Oracle(tdc_name[key])
        if key == 'sa':
            return lambda smiles: normalize_sa(raw(smiles))
        return raw

    scorers = [build_scorer(key) for key in task_components[oracle_name]]
    if len(scorers) == 1:
        oracle = scorers[0]
    else:
        def oracle(smiles):
            return np.mean(tuple(scorer(smiles) for scorer in scorers))

    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = 'cpu'  ## cpu is better
    model_ckpt = "save_model/GNN_epoch_0_validloss_1.61160.ckpt"
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    gnn = torch.load(model_ckpt)
    gnn.switch_device(device)

    result_pkl = "result/" + oracle_name + ".pkl"
    optimization(start_smiles_lst, gnn, oracle, oracle_num, oracle_name,
                 generations=generations,
                 population_size=population_size,
                 lamb=2,
                 topk=5,
                 epsilon=0.7,
                 result_pkl=result_pkl)
126 |
127 |
128 |
129 | if __name__ == "__main__":
130 | main()
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 💊 MIMOSA: Multi-constraint Molecule Sampling for Molecule Optimization
2 |
3 | [](https://opensource.org/licenses/BSD-2-Clause)
4 | [](https://www.python.org/downloads/release/python-370/)
5 | [](https://github.com/futianfan/MIMOSA/stargazers)
6 | [](https://github.com/futianfan/MIMOSA/network/members)
7 |
8 |
9 |
10 | This repository hosts MIMOSA: Multi-constraint Molecule Sampling for Molecule Optimization (AAAI 2021) by Tianfan Fu, Cao Xiao, Xinhao Li, Lucas Glass, and Jimeng Sun. MIMOSA uses a pretrained graph neural network (GNN) and MCMC sampling for molecule optimization.
11 |
12 | 
13 |
14 |
15 | ## Table Of Contents
16 |
17 | - [Installation](#installation)
18 | - [Data](#data)
19 | - [Pretraining](#pretrain)
20 | - [Run](#run)
21 | - [Contact](#contact)
22 |
23 |
24 |
25 | ## ⚙️ 1. Installation
26 |
27 | To install locally, we recommend using `pip` and `conda`. Please see `conda.yml` for the package dependencies.
28 | ```bash
29 | conda create -n mimosa python=3.7
30 | conda activate mimosa
31 | pip install torch
32 | pip install PyTDC
33 | conda install -c rdkit rdkit
34 | ```
35 |
36 | Activate conda environment.
37 | ```bash
38 | conda activate mimosa
39 | ```
40 |
41 | Create the output directories.
42 | ```bash
43 | mkdir -p save_model result
44 | ```
45 |
46 |
47 | ## 📊 2. Data
48 | In our setup, we restrict the number of oracle calls. In realistic discovery settings, the oracle acquisition cost is usually not negligible.
49 |
50 | ### Raw Data
51 | We use the [`ZINC`](https://tdcommons.ai/generation_tasks/molgen/) database, which contains around 250K drug-like molecules. It can be downloaded from [`download ZINC`](https://tdcommons.ai/generation_tasks/molgen/) or with the script below.
52 | ```bash
53 | python src/download.py
54 | ```
55 | - output
56 | - `data/zinc.tab`: all the smiles in ZINC, around 250K.
57 |
58 | ### Oracle
59 | An oracle is a property evaluator: a function whose input is a molecular structure and whose output is the value of the property.
60 | We consider the following oracles:
61 | * `JNK3`: biological activity to JNK3, ranging from 0 to 1.
62 | * `GSK3B`: biological activity to GSK3B, ranging from 0 to 1.
63 | * `QED`: Quantitative Estimate of Drug-likeness, ranging from 0 to 1.
64 | * `SA`: Synthetic Accessibility, we normalize SA to (0,1).
65 | * `LogP`: octanol-water partition coefficient, a measure of a compound's lipophilicity (and thus related to its solubility). It ranges from negative infinity to positive infinity.
66 |
67 | For all the property scores above, higher is more desirable.
68 |
69 | ### Optimization Task
70 | There are two kinds of optimization tasks: single-objective and multi-objective optimization.
71 | Multi-objective optimization contains `jnkgsk` (JNK3 + GSK3B), `qedsajnkgsk` (QED + SA + JNK3 + GSK3B).
72 |
73 |
74 | ### Generate Vocabulary
75 | In this project, the basic unit is `substructure`, which can be atoms or single rings.
76 | The vocabulary is the set of frequent `substructures`.
77 | ```bash
78 | python src/vocabulary.py
79 | ```
80 | - input
81 | - `data/zinc.tab`: all the smiles in ZINC, around 250K.
82 | - output
83 | - `data/substructure.txt`: including all the substructures in ZINC.
84 | - `data/vocabulary.txt`: vocabulary, frequent substructures.
85 |
86 | ### data cleaning
87 | We remove the molecules that contain any substructure that is not in the vocabulary.
88 |
89 | ```bash
90 | python src/clean.py
91 | ```
92 |
93 | - input
94 | - `data/vocabulary.txt`: vocabulary
95 | - `data/zinc.tab`: all the smiles in ZINC
96 | - output
97 | - `data/zinc_clean.txt`
98 |
99 |
100 |
101 |
102 |
103 |
104 | ## Pre-train graph neural network (GNN)
105 | ```bash
106 | python src/train.py
107 | ```
108 | - input
109 | - `data/zinc_clean.txt`
110 | - output
111 | - `save_model/GNN.ckpt`: trained GNN model.
112 | - log
113 | - `gnn_loss.pkl`: the valid loss.
114 |
115 |
116 | ## 🤖 Run
117 |
118 | ### de novo molecule design
119 |
120 | ```bash
121 | python src/run.py
122 | ```
123 | - input
124 | - `save_model/GNN_epoch_0_validloss_1.61160.ckpt`: pretrained GNN model (this checkpoint path is hard-coded in `src/run.py`).
125 | - output
126 | - `result/{$prop}.pkl`: set of generated molecules.
127 |
128 | For example,
129 | ```bash
130 | python src/run.py --oracle_name jnkgsk --oracle_num 1500 --generations 50 --population_size 20
131 | ```
132 |
133 | ### evaluate
134 |
135 | ```bash
136 | python src/evaluate.py $prop
137 | ```
138 | - input
139 | - `result/{$prop}.pkl`
140 | - output
141 | - `diversity`, `novelty`, `average property` of top-100 molecules with highest property.
142 |
143 | For example,
144 | ```bash
145 | python src/evaluate.py jnkgsk
146 | ```
147 |
148 |
149 | ## 📞 Contact
150 | Please contact futianfan@gmail.com for help or submit an issue.
151 |
152 |
153 | ## Cite Us
154 | If you find this package useful, please cite our paper:
155 | ```
156 | @inproceedings{fu2021mimosa,
157 | title={MIMOSA: Multi-constraint Molecule Sampling for Molecule Optimization},
158 | author={Fu, Tianfan and Xiao, Cao and Li, Xinhao and Glass, Lucas M and Sun, Jimeng},
159 | booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
160 | volume={35},
161 | number={1},
162 | pages={125--133},
163 | year={2021}
164 | }
165 | ```
166 |
167 |
168 |
169 |
170 |
171 |
172 |
--------------------------------------------------------------------------------
/src/inference_utils.py:
--------------------------------------------------------------------------------
1 |
2 | ### 1. import
3 | import numpy as np
4 | from tqdm import tqdm
5 | from matplotlib import pyplot as plt
6 | import pickle
7 | from random import shuffle
8 | import torch
9 | import torch.nn as nn
10 | import torch.nn.functional as F
11 | from tdc import Oracle
12 | torch.manual_seed(1)
13 | np.random.seed(2)
14 | import random
15 | random.seed(1)
16 | from chemutils import *
17 | '''
18 | optimize_single_molecule_one_iterate
19 | gnn_prediction_of_single_smiles
20 | oracle_screening
21 | gnn_screening
22 | optimize_single_molecule_all_generations
23 | similarity_matrix(smiles_lst)
24 | '''
25 | from dpp import DPPModel
26 |
27 |
28 |
def gnn_prediction_of_single_smiles(smiles, gnn):
    """Return ``gnn.smiles2pred(smiles)``, or 0 when ``is_valid`` rejects the SMILES."""
    if is_valid(smiles):
        return gnn.smiles2pred(smiles)
    return 0
33 | # idx_lst, node_mat, substructure_lst, atomidx_2substridx, adjacency_matrix, leaf_extend_idx_pair = smiles2graph(smiles)
34 | # idx_vec = torch.LongTensor(idx_lst)
35 | # node_mat = torch.FloatTensor(node_mat)
36 | # adjacency_matrix = torch.FloatTensor(adjacency_matrix)
37 | # weight = torch.ones_like(idx_vec)
38 | # logits = gnn(node_mat = node_mat, adj = adjacency_matrix, weight = weight)
39 | # logits = logits.item()
40 | # print("gnn prediction", logits)
41 | # return logits
42 |
43 |
def oracle_screening(smiles_set, oracle):
    """Score every SMILES with ``oracle`` and rank them, best first.

    Returns a list of ``(smiles, score)`` pairs sorted by descending score;
    ties keep the iteration order of ``smiles_set``.
    """
    scored_pairs = [(candidate, oracle(candidate)) for candidate in smiles_set]
    return sorted(scored_pairs, key=lambda pair: pair[1], reverse=True)
51 |
def dpp(smiles_score_lst, num_return, lamb):
    """Select ``num_return`` molecules from scored candidates with ``DPPModel``.

    Args:
        smiles_score_lst: list of ``(smiles, score)`` pairs.
        num_return: number of molecules to keep.
        lamb: forwarded to ``DPPModel`` as ``lamb``.

    Returns:
        ``(smiles_lst, log_det_V, log_det_S)``. When there are no more than
        ``num_return`` candidates they are all returned and both log-det
        entries are ``None``.
    """
    candidates = [pair[0] for pair in smiles_score_lst]
    if not len(candidates) > num_return:
        # Nothing to choose between: keep every candidate.
        return candidates, None, None

    scores = np.array([pair[1] for pair in smiles_score_lst])
    selector = DPPModel(
        smiles_lst=candidates,
        sim_matrix=similarity_matrix(candidates),
        f_scores=scores,
        top_k=num_return,
        lamb=lamb,
    )
    chosen, log_det_V, log_det_S = selector.dpp()
    return chosen, log_det_V, log_det_S
61 |
62 |
def gnn_screening(smiles_set, gnn):
    """Rank SMILES by their GNN prediction, best first.

    Returns a list of ``(smiles, score)`` pairs, where each score comes from
    ``gnn_prediction_of_single_smiles``, sorted by descending score.
    """
    scored_pairs = [
        (candidate, gnn_prediction_of_single_smiles(candidate, gnn))
        for candidate in smiles_set
    ]
    return sorted(scored_pairs, key=lambda pair: pair[1], reverse=True)
70 | # smiles_lst = [i[0] for i in smiles_score_lst]
71 | # return smiles_lst
72 |
def optimize_single_node(smiles):
    """Unfinished helper for molecules made of exactly one substructure.

    It checks that ``substr_num(smiles)`` is 1, loads the vocabulary and
    names the candidate atoms, but does nothing with them yet and returns
    ``None``.
    """
    assert substr_num(smiles) == 1
    vocab = load_vocabulary()
    candidate_atoms = ['N', 'C']
77 |
78 | # bondtype_list = [rdkit.Chem.rdchem.BondType.SINGLE, rdkit.Chem.rdchem.BondType.DOUBLE] ### chemutils
79 |
def optimize_single_molecule_one_iterate(smiles, gnn):
    """Propose one-step expansions of a molecule.

    For every ``(node_mat, adj_mat, mask_idx)`` entry yielded by
    ``smiles2expandfeature``, the GNN ranks the vocabulary and the three
    highest-scoring substructures are attached, with every bond type in
    ``bondtype_list``, to the atoms of the substructure adjacent to the
    last node of the expanded graph.

    Args:
        smiles: SMILES of the molecule to expand (``None`` or an invalid
            SMILES yields an empty set).
        gnn: model exposing ``infer(node_mat, adj, idx, target)``.

    Returns:
        A set of SMILES strings; ``None`` results from the editing helpers
        are dropped.
    """
    if smiles is None or not is_valid(smiles):
        return set()

    # ``infer`` requires a target; its loss output is ignored here.
    target_ = torch.LongTensor([0]).view(-1)
    origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(smiles))
    new_smiles_set = set()
    jj = -100

    # Only the substructure list of the original graph is needed.
    _, _, origin_substructure_lst, _, _, _ = smiles2graph(smiles)

    for node_mat, adj_mat, mask_idx in smiles2expandfeature(smiles):
        node_mat = torch.FloatTensor(node_mat)
        adj_mat = torch.FloatTensor(adj_mat)
        N = adj_mat.shape[0]
        # First row linked to the last node (column N-1); when no row
        # matches, jj is left at the last index tried.
        for jj in range(N):
            if adj_mat[jj, N - 1] == 1:
                break

        _, prediction = gnn.infer(node_mat, adj_mat, mask_idx, target_)
        # Three highest-scoring vocabulary indices, best first.
        top_idxs = prediction.reshape(-1).argsort().tolist()[::-1][:3]
        top_words = [vocabulary[ii] for ii in top_idxs]

        leaf_atom_idx_lst = origin_substructure_lst[jj]
        if isinstance(leaf_atom_idx_lst, int):  ### int: single atom; else: list of integer
            leaf_atom_idx_lst = [leaf_atom_idx_lst]

        for substru_idx, word in zip(top_idxs, top_words):
            for leaf_atom_idx in leaf_atom_idx_lst:
                for new_bond in bondtype_list:
                    if ith_substructure_is_atom(substru_idx):
                        new_smiles = add_atom_at_position(editmol=origin_mol, position_idx=leaf_atom_idx,
                                                          new_atom=word, new_bond=new_bond)
                        new_smiles_set.add(new_smiles)
                    else:
                        new_smiles_batch = add_fragment_at_position(editmol=origin_mol, position_idx=leaf_atom_idx,
                                                                    fragment=word, new_bond=new_bond)
                        new_smiles_set.update(new_smiles_batch)

    # Drop the ``None`` entries left by failed edits.
    new_smiles_set.discard(None)
    return new_smiles_set
123 |
124 |
125 |
126 |
def optimize_single_molecule_all_generations(input_smiles, gnn, oracle, generations, population_size, lamb):
    """Optimise one molecule over several generations.

    Each generation expands the current population with
    ``optimize_single_molecule_one_iterate``, scores all proposals with
    ``oracle`` and selects the next population with ``dpp``.

    Args:
        input_smiles: starting molecule; canonicalised with ``canonical``.
        gnn: model used to propose expansions.
        oracle: callable ``smiles -> score``.
        generations: number of generations to run.
        population_size: number of molecules ``dpp`` keeps per generation.
        lamb: forwarded to ``dpp``.

    Returns:
        ``(best_mol_score_list, input_score, traceback_dict)``: every scored
        proposal as ``(smiles, score)`` sorted best first, the oracle score
        of the input, and a map from each proposal to the molecule it was
        first derived from.
    """
    traceback_dict = dict()
    input_smiles = canonical(input_smiles)
    input_score = oracle(input_smiles)
    best_mol_score_list = []
    current_mol_score_list = [(input_smiles, input_score)]
    for it in tqdm(range(generations)):
        new_smiles_set = set()
        #### expand each molecule of the current population
        for smiles, score in current_mol_score_list:
            # The ``_v2`` variant called previously is not defined in this
            # module, so the variant defined here is used instead.
            proposal_smiles_set = optimize_single_molecule_one_iterate(smiles, gnn)
            proposal_smiles_set = proposal_smiles_set.difference(set([input_smiles]))
            for new_smiles in proposal_smiles_set:
                # Keep the first parent seen for each proposal.
                traceback_dict.setdefault(new_smiles, smiles)
            new_smiles_set.update(proposal_smiles_set)

        ### oracle-only scoring: the number of oracle calls is unbounded
        mol_score_list = oracle_screening(new_smiles_set, oracle)
        for smiles, score in mol_score_list:
            if score > 0.50:
                print('example', smiles, score)

        ### save results
        best_mol_score_list.extend(mol_score_list)

        ### ``dpp`` returns (smiles_lst, log_det_V, log_det_S); only the
        ### selected SMILES feed the next generation
        smiles_lst, _, _ = dpp(mol_score_list, num_return=population_size, lamb=lamb)

        ### scores are not reused when expanding, so 0.0 is a placeholder
        current_mol_score_list = [(smiles, 0.0) for smiles in smiles_lst]

    best_mol_score_list.sort(key=lambda x: x[1], reverse=True)
    return best_mol_score_list, input_score, traceback_dict
186 |
187 |
188 |
def calculate_results(input_smiles, input_score, best_mol_score_list, result_file=None):
    """Append a one-line summary of an optimisation run to ``result_file``.

    Args:
        input_smiles: the starting molecule.
        input_score: oracle score of the starting molecule.
        best_mol_score_list: ``(smiles, score)`` pairs, best first.
        result_file: path of the text file to append to (required; it is a
            keyword with a default only to keep the positional signature).

    Returns:
        ``None`` when ``best_mol_score_list`` is empty (a failure line is
        written), otherwise ``(input_score, best_output_score)``.

    Raises:
        ValueError: if ``result_file`` is not given.
    """
    if result_file is None:
        raise ValueError("calculate_results requires result_file")
    if not best_mol_score_list:
        with open(result_file, 'a') as fout:
            fout.write("fail to optimize" + input_smiles + '\n')
        return None
    output_scores = [i[1] for i in best_mol_score_list]
    smiles_lst = [i[0] for i in best_mol_score_list]
    # Columns: input score, best score, mean of top 3, input SMILES, top-3 SMILES.
    with open(result_file, 'a') as fout:
        fout.write(str(input_score) + '\t' + str(output_scores[0]) + '\t' + str(np.mean(output_scores[:3]))
                   + '\t' + input_smiles + '\t' + ' '.join(smiles_lst[:3]) + '\n')
    return input_score, output_scores[0]


def inference_single_molecule(input_smiles, gnn, result_file, generations, population_size, oracle=None, lamb=2):
    """Optimise one molecule and log the outcome to ``result_file``.

    ``oracle`` (callable ``smiles -> score``) must be supplied; ``lamb`` is
    forwarded to ``optimize_single_molecule_all_generations``. Returns what
    ``calculate_results`` returns.

    Raises:
        ValueError: if ``oracle`` is not given.
    """
    if oracle is None:
        raise ValueError("inference_single_molecule requires an oracle callable")
    best_mol_score_list, input_score, traceback_dict = optimize_single_molecule_all_generations(
        input_smiles, gnn, oracle, generations, population_size, lamb)
    return calculate_results(input_smiles, input_score, best_mol_score_list, result_file=result_file)


def inference_molecule_set(input_smiles_lst, gnn, result_file, generations, population_size, oracle=None, lamb=2):
    """Run ``inference_single_molecule`` on every valid SMILES of a list.

    Invalid SMILES are skipped. Returns one entry per processed molecule:
    ``(input_score, output_score)``, or ``None`` when optimisation produced
    no candidate.
    """
    score_lst = []
    for input_smiles in tqdm(input_smiles_lst):
        if not is_valid(input_smiles):
            continue
        result = inference_single_molecule(input_smiles, gnn, result_file, generations, population_size,
                                           oracle=oracle, lamb=lamb)
        if result is None:
            score_lst.append(None)
        else:
            input_score, output_score = result
            score_lst.append((input_score, output_score))
    return score_lst
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
--------------------------------------------------------------------------------
/data/substructure.txt:
--------------------------------------------------------------------------------
1 | C 1158545
2 | O 500212
3 | N 280451
4 | C1=CC=CC=C1 257945
5 | F 79430
6 | S 51103
7 | Cl 42872
8 | C1=CC=NC=C1 27852
9 | C1CCCCC1 20256
10 | C1=CNN=C1 18920
11 | C1=CSC=C1 17515
12 | C1CCNCC1 15912
13 | C1CC1 15462
14 | C1CCCC1 14328
15 | Br 12722
16 | C1=CSC=N1 12617
17 | C1COCCN1 11924
18 | C1CNCCN1 11701
19 | C1=COC=C1 11274
20 | C1CCCN1 9739
21 | C1=CN=CN=C1 7964
22 | C1CC[NH+]CC1 7948
23 | C1CCNC1 7634
24 | C1CCCNC1 7277
25 | C1=CCCC=C1 6243
26 | C1=NN=CN1 5748
27 | C1CNCC1 5513
28 | C1CCOC1 5310
29 | C1=CNC=N1 5201
30 | C1=NOC=N1 5141
31 | C1=CON=C1 4917
32 | C1C[NH+]CCN1 4893
33 | C1CC[NH+]C1 4027
34 | C1=CCCCC1 3985
35 | C1=NNC=N1 3958
36 | C1COCC1 3829
37 | C1=CCNC=C1 3752
38 | C1=C[NH]N=C1 3575
39 | C1=CNC=C1 3521
40 | C1CCC1 2924
41 | C1CCOCC1 2906
42 | C1CCC[NH+]C1 2784
43 | C1CNC=N1 2772
44 | C1CCCCN1 2771
45 | C1CSC=N1 2756
46 | C1COCO1 2643
47 | C1CSCC1 2614
48 | C1=CN=CNC1 2566
49 | C1CNCN1 2548
50 | C1CNCCC1 2540
51 | C1CSCN1 2360
52 | C1=C[NH]CC1 2336
53 | C1=CCCC1 2217
54 | C1COCCC1 2178
55 | C1COCCO1 2140
56 | C1=NN=NN1 2117
57 | C1=NN=CS1 2079
58 | C1=NN=CO1 1994
59 | C1=CCNCC1 1991
60 | C1=COC=N1 1936
61 | C1=CC=[NH+]C=C1 1876
62 | C1=C[NH+]=CN1 1716
63 | C1=CN=CC=N1 1706
64 | C1=CNCN=C1 1630
65 | C1=N[NH]C=N1 1598
66 | C1C[NH+]CC1 1477
67 | C1=CNN=N1 1444
68 | C1=CCOCC1 1434
69 | C1=COCC1 1406
70 | C1=C[NH]C=C1 1389
71 | C1CSCCN1 1385
72 | C1=CC=NCC1 1375
73 | C1C=CCCC1 1364
74 | C1=CCNN=C1 1329
75 | C1=CNCC1 1317
76 | C1=CN=C[NH]C1 1268
77 | C1=CNCNC1 1265
78 | C1C[NH]C=N1 1230
79 | C1CN=CNC1 1145
80 | C1CCCCCC1 1145
81 | C1COC=N1 1139
82 | C1CCCNCC1 1102
83 | C1=CSCN1 993
84 | C1C=CCN1 955
85 | C1CC[NH2+]CC1 940
86 | C1=CN=NC=C1 917
87 | I 888
88 | C1CNCC[NH+]C1 834
89 | C1=CC=[NH+]C1 816
90 | C1=C[NH]C=N1 766
91 | C1C=NNC1 759
92 | C1=CSCC1 746
93 | C1=NNCN1 735
94 | C1CCCCCN1 717
95 | C1=CC[NH]C=C1 694
96 | C1CCCO1 692
97 | C1CNCNC1 672
98 | C1C[NH+]CCC1 654
99 | C1CCSCC1 650
100 | C1C=C[NH]C1 625
101 | C1NCCN1 589
102 | C1CNCCNC1 575
103 | C1CCNCN1 566
104 | C1=NNCC1 563
105 | C1CNC1 538
106 | C1=CN=NC1 536
107 | C1NCCS1 523
108 | C1C=CSC1 505
109 | C1=CC[NH]CC1 500
110 | C1NCCCN1 494
111 | C1CNCS1 492
112 | C1CCC=CN1 489
113 | C1C[NH2+]CCN1 488
114 | C1C=N[NH]C1 486
115 | C1CC[NH2+]C1 482
116 | C1COCN1 481
117 | C1CCCS1 470
118 | C1CCNCCN1 459
119 | C1=NNCS1 459
120 | C1=C[NH+]=C[NH]1 444
121 | C1=CC=[N+]C=C1 443
122 | C1CCNN=C1 430
123 | C1CCC[NH+]CC1 430
124 | C1=N[NH]CN1 428
125 | C1=COCO1 426
126 | C1C[NH2+]CC1 415
127 | C1CCC[NH+]1 398
128 | C1=CC[NH+]CC1 397
129 | C1CCC[NH2+]C1 388
130 | C1NC=CCN1 388
131 | C1C=COCC1 386
132 | C1=CNCCC1 381
133 | C1=COCCO1 379
134 | C1COCCCO1 378
135 | C1C=NC=NC1 377
136 | C1NCCO1 370
137 | C1CCCSC1 356
138 | C1C=COC1 354
139 | C1CCCOC1 350
140 | C1=CNC[NH]C1 349
141 | C1CN=CC=N1 326
142 | C1=NNN=N1 321
143 | C1CN=C[NH]C1 320
144 | C1CCSC1 320
145 | C1=CNC=CC1 318
146 | C1=CSN=N1 311
147 | C1=COC=CC1 310
148 | C1CNN=N1 309
149 | C1=NSC=N1 308
150 | C1=[NH+]CCN1 307
151 | C1=COCCC1 296
152 | C1C=CNCC1 286
153 | C1C=CC=CC1 278
154 | C1=CCOC=C1 272
155 | C1=NON=C1 262
156 | C1CC=NC=N1 251
157 | C1C=CN=CC1 250
158 | C1=NC=CNC1 249
159 | C1=CC=CCC1 249
160 | C1=CNNC1 244
161 | C1=CSN=C1 242
162 | C1CNCN=C1 230
163 | C1CNC[NH]1 223
164 | C1=NC=NC=N1 216
165 | C1=NOCC1 215
166 | C1C=NOC1 213
167 | C1=CCN=C1 208
168 | C1COCC[NH+]1 203
169 | C1C=CNC1 202
170 | C1=CCCCCC1 198
171 | C1=[NH+]C[NH+]=C1 197
172 | C1COC1 195
173 | C1=NN=CC1 193
174 | C1C[NH2+]CCC1 192
175 | C1CCCCO1 190
176 | C1CSCCC1 190
177 | C1CCCCCCC1 189
178 | C1=NC=NN=C1 188
179 | C1CCOC=C1 184
180 | C1C[NH]C[NH]1 181
181 | C1=NSN=C1 181
182 | C1CSCC[NH+]1 181
183 | C1CCCCNC1 181
184 | C1CN=CN=C1 178
185 | C1=NCCNC1 176
186 | C1CNCCCN1 176
187 | C1CC[NH+]CCN1 174
188 | C1=C[NH]CNC1 172
189 | C1=NCCN=C1 171
190 | C1=CN=C[NH+]=C1 171
191 | C1CCSN1 169
192 | C1C=CCS1 168
193 | C1COCCNC1 164
194 | C1=NNN=C1 162
195 | C1C=CCC1 154
196 | C1C=C[NH]CC1 153
197 | C1C[NH]CNC1 152
198 | C1=C[NH+]=CN=C1 151
199 | C1COCC[NH2+]1 146
200 | C1CCC=NN1 145
201 | C1C=CNC=C1 142
202 | C1=NN=N[N-]1 141
203 | C1CSCCS1 140
204 | C1=CNCN1 139
205 | C1=NCCS1 139
206 | C1COCOC1 138
207 | C1=[N+]CCC1 137
208 | C1C[NH+]C1 137
209 | C1=CNCC=N1 136
210 | C1=CCOC1 135
211 | C1C=COC=C1 135
212 | C1CCOCCN1 135
213 | C1COC=C1 131
214 | C1=CN=CC1 131
215 | C1CO1 130
216 | C1CNN=C1 130
217 | C1=NNCO1 130
218 | C1CNSC1 129
219 | C1=NCNCC1 129
220 | C1=CCSCC1 123
221 | C1=CC=[NH+]CC1 123
222 | C1CSCCO1 121
223 | C1=C[NH]C[NH]C1 117
224 | C1CNCCSC1 116
225 | C1CCN1 116
226 | C1CCCC[NH+]C1 109
227 | C1C[NH2+]C1 109
228 | C1CC=CN1 108
229 | C1=NSCC1 108
230 | C1CNCCO1 108
231 | C1NN=CS1 106
232 | C1=CC=NN=C1 105
233 | C1CN=NNC1 104
234 | C1=[NH+]CCS1 102
235 | C1CC=CC=[NH+]1 102
236 | C1CCC[NH2+]CC1 100
237 | C1=NC=CCC1 100
238 | P 99
239 | C1=N[NH]C[NH]1 99
240 | C1COCCCC1 98
241 | C1C[NH+]=CN1 97
242 | C1=CSC[NH]1 97
243 | C1C=C[NH]C=C1 96
244 | C1CC[NH]N=C1 94
245 | C1=NCCN1 92
246 | C1CCNCCC1 92
247 | C1COCNC1 92
248 | C1=CNC=NC1 92
249 | C1=CN=CCC1 91
250 | C1CCC[NH2+]1 91
251 | C1=COCOC1 90
252 | C1CCCNCCC1 89
253 | C1=CC[NH2+]CC1 89
254 | C1=COCCCO1 88
255 | C1CNNC1 87
256 | C1C=CCO1 87
257 | C1=C[NH]C=NC1 87
258 | C1OCCO1 85
259 | C1=C[NH]CN1 82
260 | C1CCSCCN1 82
261 | C1CCSCN1 82
262 | C1COC=CN1 82
263 | C1=NC=NC1 80
264 | C1C=CNCN1 78
265 | C1=N[N-]C=N1 76
266 | C1NCCCO1 76
267 | C1=CN=[NH+]C=C1 75
268 | C1CNN=CN1 74
269 | C1COCCCN1 74
270 | C1=C[NH]NC1 73
271 | C1=NOCN1 73
272 | C1C=CCCN1 72
273 | C1CNC=C1 72
274 | C1C=CNN=C1 71
275 | C1=NSCCN1 71
276 | C1C=NCNC1 71
277 | C1=C[NH+]=CNC1 70
278 | C1C=CCC=C1 70
279 | C1=CNCN=N1 70
280 | C1CSCCCN1 70
281 | C1=CNCCNC1 68
282 | C1NCCCS1 68
283 | C1=CSC=[N+]1 68
284 | C1CC=NCN1 67
285 | C1CNCC[NH2+]C1 67
286 | C1=CC[NH]N=C1 67
287 | C1=CN[NH]C1 65
288 | C1=CNCCN1 64
289 | C1NCNN1 64
290 | C1N=CCN1 63
291 | C1CC=NN=C1 62
292 | C1C[NH+]CCCN1 62
293 | C1=NNCNC1 61
294 | C1CCNC=N1 59
295 | C1NNCCS1 59
296 | C1COC[NH]1 57
297 | C1CNCC=C1 57
298 | C1=NN=CNC1 55
299 | C1=CN[NH+]=C1 55
300 | C1CNCCCC1 55
301 | C1CN=CN1 55
302 | C1=C[NH]CN=C1 53
303 | C1C=CC=CN1 52
304 | C1=CSCCC1 52
305 | C1COCC[NH+]C1 51
306 | C1C[NH+]=C[NH]1 49
307 | C1CC[NH]C1 48
308 | C1NCN=[NH+]1 48
309 | C1=CCNC1 48
310 | C1CCOCO1 47
311 | C1=COCNC1 45
312 | C1CNCO1 45
313 | C1=CCCCCCC1 44
314 | C1CC=CC=N1 44
315 | C1CNC=CN1 43
316 | C1=CON=[N+]1 43
317 | C1CSCS1 43
318 | C1NN=NN1 42
319 | C1CCNN1 42
320 | C1=NCNC1 42
321 | C1CCOCNC1 41
322 | C1CC[N+]CC1 40
323 | C1N=NCN1 40
324 | C1C=NCS1 39
325 | C1CC=NN1 39
326 | C1N=NCS1 39
327 | C1=CCC1 39
328 | C1C=NCCC1 39
329 | C1CCC[NH+]CCC1 39
330 | C1=CCCNC1 39
331 | C1=N[NH]N=C1 38
332 | C1CCNC=C1 37
333 | C1C[N+]=CN1 36
334 | C1NCNCN1 36
335 | C1CCSNC1 36
336 | C1=C[NH]C[NH]1 36
337 | C1=NCN=C1 36
338 | C1C[N+]CCN1 36
339 | C1C=NCN1 35
340 | C1CCC=CO1 35
341 | C1CC=CCN1 34
342 | C1=CN=NNC1 34
343 | C1=[N+]CCN1 34
344 | C1=CONC1 34
345 | C1C=NSC1 32
346 | C1C[N-]C=N1 32
347 | C1CNCCOC1 32
348 | C1=COCCCC1 32
349 | C1=CN=[NH+]C1 31
350 | C1=NN=C[NH]1 31
351 | C1CSNCN1 31
352 | C1C[NH+]CCNC1 31
353 | C1N=CNN1 30
354 | C1=NC=CCSC1 30
355 | C1C=NC=CC1 30
356 | C1=NCSC1 29
357 | C1CSC[NH]1 29
358 | C1CCOCN1 28
359 | C1C[NH+]CCSC1 28
360 | C1=NCNCN1 28
361 | C1=NNCSC1 28
362 | C1C=CN=N1 28
363 | C1CNCOC1 27
364 | C1CSCC[NH2+]1 27
365 | C1CCCC=N1 27
366 | C1=NCCC1 27
367 | C1=C[NH+]=CCN1 26
368 | C1=NNC[NH]1 26
369 | C1CNCN=N1 26
370 | C1CC[NH+]CCC1 26
371 | C1=CNC[NH]1 26
372 | C1=NCCCN1 26
373 | C1CCCCCCN1 26
374 | C1CCC=N1 26
375 | C1=C[N-]C=N1 26
376 | C1=C[N+]=CN1 25
377 | C1CCN=CN1 25
378 | C1CSCC=CN1 25
379 | C1C[NH]N=N1 24
380 | C1CNOC1 24
381 | C1=CSCCO1 24
382 | C1C=CSN1 23
383 | C1=CCSC1 23
384 | C1C[NH2+]CS1 23
385 | C1=CCCNCC1 23
386 | C1CNCC[NH]1 22
387 | C1CSC=[N+]1 22
388 | C1=CC=[N+]CC1 22
389 | C1NNCS1 22
390 | C1CC=COC1 22
391 | C1CCON1 21
392 | C1CCCC=C1 21
393 | C1NCNCS1 21
394 | C1CN1 21
395 | C1=NCNC=N1 21
396 | C1C[NH]CC[NH]1 21
397 | C1C=NC=[NH+]C1 21
398 | C1NC=CS1 21
399 | C1=[NH+]CCCN1 21
400 | C1CNC=NN1 21
401 | C1=NCC[NH]C1 20
402 | C1C[NH]CN1 20
403 | C1=NCNN=C1 20
404 | C1CCCNN1 20
405 | C1C=CCCCN1 20
406 | C1=[NH+]CNCN1 20
407 | C1C=NC=N1 19
408 | C1CSCSC1 19
409 | C1=CNCSC1 19
410 | C1C[NH]NC1 19
411 | C1=[NH+]CCC1 19
412 | C1CN=COC1 19
413 | C1CC[NH]C=C1 19
414 | C1CNC[NH]C1 19
415 | C1CNSNC1 18
416 | C1=CCC=CC1 18
417 | C1CCC=CC1 18
418 | C1=C[N+]=CN=C1 18
419 | C1C=NNC=N1 18
420 | C1CSN1 17
421 | C1CC=CC=C1 17
422 | C1=[NH+]CCCCC1 17
423 | C1=NCNC=C1 17
424 | C1COC[NH+]C1 16
425 | C1CSCO1 16
426 | C1=CSCCN1 16
427 | C1C[NH+]=CC=[NH+]1 16
428 | C1C[NH2+]CCCN1 15
429 | C1NCCSN1 15
430 | C1CC=[NH+]N=C1 15
431 | C1=NN=CSC1 15
432 | C1=[NH+]CCCC1 15
433 | C1CC[N+]C1 15
434 | C1=CN=NC=N1 14
435 | C1CCN[NH]C1 14
436 | C1=NC[NH2+]CC1 14
437 | C1C=CNCCN1 14
438 | C1N=CCS1 14
439 | C1CN[NH]C1 14
440 | C1=COCCNC1 13
441 | C1=NN=C[NH]C1 13
442 | C1C[NH+]=CC=C1 13
443 | C1[N+]=CCS1 13
444 | C1CCC[N+]C1 13
445 | C1CSN=N1 13
446 | C1=C[NH]CC=N1 13
447 | C1=NC[NH]N=C1 13
448 | C1CSNCC1 12
449 | C1=C[N-]C=C1 12
450 | C1=CNSN=C1 12
451 | C1=CCC[NH2+]CC1 12
452 | C1CCCCOC1 12
453 | C1CSCCCC1 11
454 | C1COCC[N+]1 11
455 | C1=CC=COC1 11
456 | C1C[NH+]CCOC1 11
457 | C1=N[NH]C[NH]C1 11
458 | C1=CNOC1 11
459 | C1COCCC[NH2+]1 11
460 | C1COC=CC1 11
461 | C1CC=CCC1 11
462 | C1=NCCNCC1 11
463 | C1C=NCC1 11
464 | C1C=N[NH]C=N1 11
465 | C1C=C[NH+]=CC1 11
466 | C1=C[NH]N=N1 11
467 | C1C=CCCO1 11
468 | C1C[NH2+]CCNC1 11
469 | C1CCSC=N1 11
470 | C1CON=C1 10
471 | C1=CCSC=C1 10
472 | C1C=CC=N1 10
473 | C1CC=CN=N1 10
474 | C1NN=CCS1 10
475 | C1=N[NH+]=CSC1 10
476 | C1=COCN1 10
477 | C1CCN=CCN1 10
478 | C1=NCCO1 10
479 | C1=NCCCC1 10
480 | C1CN=NC1 10
481 | C1CN=NCN1 10
482 | C1=COC[NH]1 10
483 | C1CNC[NH+]C1 10
484 | C1=NCNN1 10
485 | C1CN=C[NH+]=N1 10
486 | C1=NC=C[NH]C1 10
487 | C1CCNNC1 10
488 | C1C[NH+]CC[NH2+]1 9
489 | C1C=CSCC1 9
490 | C1CCC=CCN1 9
491 | C1N=CC=CN1 9
492 | C1=NCCSC1 9
493 | C1CN=CCCN1 9
494 | C1NC=NS1 9
495 | C1CC[NH2+]CCC1 9
496 | C1=[N+]CCCS1 9
497 | C1=C[NH+]CCC1 9
498 | C1CN=C[N+]C1 9
499 | C1=NCCCS1 9
500 | C1=NCCOC1 9
501 | C1=CC[NH2+]C1 9
502 | C1=CC=CNC1 9
503 | C1=CNSCC1 9
504 | C1CCONC1 9
505 | C1=CSCNC1 9
506 | C1=CC=[O+]C=C1 9
507 | C1=[N+]CCCCC1 9
508 | C1N=[NH+]CS1 9
509 | C1COPOC1 9
510 | C1=CC[N+]CC1 9
511 | C1=[N+]CCCN1 9
512 | C1CC=NC=C1 9
513 | C1C[NH+]CN1 8
514 | C1=COCCN1 8
515 | C1=NC=NCN1 8
516 | C1CSNC=C1 8
517 | C1=CC=CCC=C1 8
518 | C1CSC=CN1 8
519 | C1=NCN=N1 8
520 | C1=CN=NCC1 8
521 | C1C=CON1 8
522 | C1CCCCCCCCCCC1 8
523 | C1CONC1 8
524 | C1CN=NN1 8
525 | C1CCCCS1 8
526 | C1=CCC=C1 8
527 | C1C=CN=CN1 8
528 | C1=COCC[NH+]C1 8
529 | C1CCOCCCN1 8
530 | C1CNCCS1 8
531 | C1CCNO1 8
532 | C1CC[NH+]CN1 8
533 | C1CNCC=N1 7
534 | C1CC=CO1 7
535 | C1=CSNCC1 7
536 | C1=C[NH]CN=N1 7
537 | C1=[N+]CC[N+]=C1 7
538 | C1CCC=C1 7
539 | C1NCON1 7
540 | C1=C[NH][NH+]=C1 7
541 | C1C[N+]CC[N+]1 7
542 | C1C=C[N+]=CC1 7
543 | C1=CCNNC1 7
544 | C1N=NN=N1 7
545 | C1SCCS1 7
546 | C1N=NNN1 7
547 | C1=NO[N+]=C1 7
548 | C1[NH+]=CC=[NH+]1 7
549 | C1CCCSCC1 7
550 | C1=[N+]CNC1 7
551 | C1=COC[NH+]C1 7
552 | C1C[N+]CCC1 7
553 | C1=CNC[NH+]C1 6
554 | C1=CSNC1 6
555 | C1C[NH+]CCO1 6
556 | C1CCNC=CN1 6
557 | C1=[NH+]CON1 6
558 | C1CCCNCN1 6
559 | C1NC[N+]CN1 6
560 | C1CNCC[NH2+]1 6
561 | C1C=NNCN1 6
562 | C1=CCN[NH]C1 6
563 | C1C=CC=C1 6
564 | C1=[N+]CCCC1 6
565 | C1=NCCOCC1 6
566 | C1C[NH2+]CCO1 6
567 | C1CCCC[NH2+]C1 6
568 | C1NC=NN1 6
569 | C1=NC[NH]C=N1 6
570 | C1CSSC1 6
571 | C1=NC=CCS1 6
572 | C1=CN=COC1 6
573 | C1CC[NH2+]CCN1 6
574 | C1=C[N+]CCN1 6
575 | C1=CCC[NH+]CC1 6
576 | C1=[NH+]CCCS1 6
577 | C1CCCNC=N1 6
578 | C1C[NH+]=CC=N1 6
579 | C1CC=CNC1 5
580 | C1=CCCOC=C1 5
581 | C1=[NH+]CNN1 5
582 | C1=C[N+]=CC=[N+]1 5
583 | C1C[N+]CC1 5
584 | C1=CC[NH+]C1 5
585 | C1N=NCO1 5
586 | C1=C[NH+]=NC1 5
587 | C1=C[NH][NH]C1 5
588 | C1CCCCNCC1 5
589 | C1CN=NC=N1 5
590 | C1=NNCCC1 5
591 | C1C[NH+]CCCC1 5
592 | C1CC[NH+]1 5
593 | C1C=NC=C1 5
594 | C1=C[NH+]=NN1 5
595 | C1=CCOCCC1 5
596 | C1CNC=CC1 5
597 | C1C=[N+]CCN1 5
598 | C1CC[NH]CC1 5
599 | C1C[NH]C[NH]C1 5
600 | C1C=CN=C1 5
601 | C1CC=CC1 5
602 | C1CC=[NH+]C=N1 5
603 | C1=NC=[NH+]CN1 5
604 | C1CNCC[NH+]1 5
605 | C1C=CSCN1 5
606 | C1=N[N-]N=C1 5
607 | C1N=CCCN1 5
608 | C1CCN=CC1 5
609 | C1=N[NH]C=[NH+]1 5
610 | C1=CSC=CC1 5
611 | C1=CNNCC1 4
612 | C1=CNCCCN1 4
613 | C1=NCOC1 4
614 | C1CCC=CCC1 4
615 | C1C[NH]N=CN1 4
616 | C1NNC[NH+]1 4
617 | C1C[NH+]1 4
618 | C1=CNCCO1 4
619 | C1CSCCSC1 4
620 | C1C[NH2+]CCS1 4
621 | C1=CCCSC1 4
622 | C1CSCC[NH2+]C1 4
623 | C1=NNSCC1 4
624 | C1=CNCCN=C1 4
625 | C1=CN=C[N+]=C1 4
626 | C1C=NCN=C1 4
627 | C1=NCC[N+]1 4
628 | C1C=CNC=[N+]1 4
629 | C1C=CCNCC1 4
630 | C1C[NH]CC=N1 4
631 | C1CCCCCCCCC1 4
632 | C1CCC[NH+]=N1 4
633 | C1NNCO1 4
634 | C1[N-]C=NN1 4
635 | C1C[NH2+]CCSC1 4
636 | C1=NC=CCN1 4
637 | C1=[NH+]CN=N1 4
638 | C1=CSCO1 4
639 | C1CCCOCC1 4
640 | C1CCOCCC1 4
641 | C1=NCCSCC1 4
642 | C1NCCNN1 4
643 | C1=CC[NH+]CCC1 4
644 | C1=CCC[NH+]C1 4
645 | C1CN=CC=CN1 4
646 | C1CC=CCCN1 4
647 | C1CN=CN=N1 4
648 | C1NCC=CO1 4
649 | C1=CC=[NH+]N=C1 4
650 | C1=C[NH+]=C[NH]C1 4
651 | C1=NCCCCC1 4
652 | C1CNCCC[NH2+]C1 4
653 | C1=NCCSN1 4
654 | C1=C[N-]CC1 4
655 | C1=NN[N+]=C1 4
656 | C1=NCCC=NN1 4
657 | C1COCC[NH2+]C1 4
658 | C1CNSCC1 4
659 | C1C[NH+]CO1 3
660 | C1=N[NH]CNC1 3
661 | C1COCCOCCOCCOCCO1 3
662 | C1CCCSN1 3
663 | C1=NN=NC1 3
664 | C1CN=CCC1 3
665 | C1CSC1 3
666 | C1CNS[NH+]C1 3
667 | C1NC[NH+]CN1 3
668 | C1=NOC[N-]1 3
669 | C1=NNCCS1 3
670 | C1CNN=C[NH]1 3
671 | C1CN=[NH+]C=N1 3
672 | C1C=CC1 3
673 | C1=COCCC[NH+]C1 3
674 | C1C=CCNN1 3
675 | C1=[NH+]CCCCN1 3
676 | C1=[NH+]CCO1 3
677 | C1C[NH+]=CSC1 3
678 | C1=NC=NNC1 3
679 | C1CN=C[N+]=C1 3
680 | C1=CCSN1 3
681 | C1=CSC=[NH+]1 3
682 | C1=CCCCOC1 3
683 | C1=CN=C[N-]C1 3
684 | C1C=CC[NH+]C1 3
685 | C1=NSCC[N-]1 3
686 | C1=NNC[NH]C1 3
687 | C1CC=[N+]CC1 3
688 | C1C=CNN1 3
689 | C1=CO[NH]C1 3
690 | C1=NC=NCC1 3
691 | C1C=NC[NH]C1 3
692 | C1=NC[NH+]CN1 3
693 | C1CC=NCC1 3
694 | C1COC[NH2+]1 3
695 | C1C[NH+]CC[NH+]1 3
696 | C1COCCOCCOCCN1 3
697 | C1CS1 3
698 | C1=COCCCNC1 3
699 | C1=NN=CN=N1 3
700 | C1C=CNNC1 3
701 | C1=NC[NH+]C1 3
702 | C1=CCNCCC1 3
703 | C1C=CCNC1 3
704 | C1CN=C[NH+]=C1 3
705 | C1N=C[NH]CN1 3
706 | C1NNCNN1 3
707 | C1C=NCCN1 3
708 | C1C=CC=[NH+]1 3
709 | C1CS[N-]CN1 3
710 | C1=NCN=CO1 3
711 | C1=[NH+]CCC=C1 3
712 | C1=CNC=[N+]C1 3
713 | C1CNSN1 3
714 | C1=NNNN1 3
715 | C1CCNCO1 3
716 | C1CCCC=CN1 3
717 | C1CSCNN1 3
718 | C1=CNCCCC1 3
719 | C1COC[NH+]1 3
720 | C1=NNCNN1 3
721 | C1=NSC=CN1 2
722 | C1=CNCC[NH2+]C1 2
723 | C1C[NH2+]CN1 2
724 | C1CCSS1 2
725 | C1C=CCCCC1 2
726 | C1=[N+]CCNC1 2
727 | C1CCCC[NH+]1 2
728 | C1=N[NH]CS1 2
729 | C1CNCC=CN1 2
730 | C1N=CNCN1 2
731 | C1=NSSC1 2
732 | C1CNNCN1 2
733 | C1=CCCOC1 2
734 | C1CCCCCCCC1 2
735 | C1COCCOCCOCCO1 2
736 | C1CCCCC=C1 2
737 | C1CCCNCCCN1 2
738 | C1=NCCOC=C1 2
739 | C1=CSSC1 2
740 | C1CCOC=NN1 2
741 | C1CCS1 2
742 | C1C=CCOC1 2
743 | C1=CNCCSC1 2
744 | C1=NC=[NH+]C=N1 2
745 | C1NN1 2
746 | C1=NCCC=[NH+]1 2
747 | C1CCC=[NH+]1 2
748 | C1C[N+]=CC=N1 2
749 | C1NCSS1 2
750 | C1=CN=CSC1 2
751 | C1SCSCS1 2
752 | C1C=NNP1 2
753 | C1C=COCO1 2
754 | C1=CNNN1 2
755 | C1CN=CCSC1 2
756 | C1=CNCCC=C1 2
757 | C1CC[NH][NH]C1 2
758 | C1C=NN=C1 2
759 | C1=C/CCCCCC/1 2
760 | C1CC[NH+]CSC1 2
761 | C1CNNC=N1 2
762 | C1COCC[NH+]CCOCCOCC[NH+]CCO1 2
763 | C1NC=CC=NN1 2
764 | C1CCCC[NH+]CC1 2
765 | C1CC=[N+]C=C1 2
766 | C1=[NH+]CCSC[CH-]1 2
767 | C1C[NH]COC1 2
768 | C1=N[N-]N=N1 2
769 | C1CC[N-]C=N1 2
770 | C1N=C[NH+]=C[NH]1 2
771 | C1CCN=N1 2
772 | C1=CNC=[NH+]C1 2
773 | C1NC=[NH+]CN1 2
774 | C1NCCC=[NH+]1 2
775 | C1=CN=N[NH]C1 2
776 | C1N=N1 2
777 | C1N=CS1 2
778 | C1=CN=C[N+]C1 2
779 | C1N[NH]CS1 2
780 | C1COCCC[NH+]1 2
781 | C1=[N+]CCS1 2
782 | C1=NCCCSC1 2
783 | C1C=NCO1 2
784 | C1=CS[N+]=C1 2
785 | C1=NCCC=C1 2
786 | C1=CSOC1 2
787 | C1=CC=[NH+]CN1 2
788 | C1NCC[NH+]1 2
789 | C1COCON1 2
790 | C1=NN[N-]N1 2
791 | C1CSCCNC1 2
792 | C1CNSCCN1 2
793 | C1=CN=NN=C1 2
794 | C1=CCC[NH2+]C1 2
795 | C1C=CC=CC=C1 2
796 | C1COCS1 2
797 | C1COCSN1 2
798 | C1C[NH+]CS1 2
799 | C1CCCC=[NH+]1 2
800 | C1CSC=CCN1 2
801 | C1=NC=C[N+]=C1 2
802 | C1=CNPN=C1 2
803 | C1CN[NH2+]N1 2
804 | C1=[N+]NCC1 2
805 | C1CCN[N+]CC1 2
806 | C1CSCOC1 2
807 | C1C[NH2+]CSC1 2
808 | C1COPO1 2
809 | C1=[NH+]CCNC1 2
810 | C1CNNCC1 2
811 | C1N=CN=CN1 2
812 | C1CCOCOC1 2
813 | C1CCCCCO1 2
814 | C1N=[NH+]CO1 2
815 | C1NCC[N+]1 2
816 | C1=CC=CC1 2
817 | C1C=CCN=C1 2
818 | C1=NOCCN1 2
819 | C1=CC[N+]C=C1 2
820 | C1C=NCCN=C1 2
821 | C1=CC1 2
822 | C1CCOCC[NH2+]C1 2
823 | C1CC[N+]=[N+]CC1 2
824 | C1CSC=C1 2
825 | C1COCPCO1 2
826 | C1NOSN1 2
827 | C1C[NH+]C[NH+]1 1
828 | C1=CSNCN1 1
829 | C1=NCCC=[N+]1 1
830 | C1C=NC[NH+]C1 1
831 | C1C=COCCO1 1
832 | C1CNC[NH2+]C1 1
833 | C1CSCNCCC1 1
834 | C1CC=CCCO1 1
835 | C1C[NH+]COC1 1
836 | C1[NH+]CCO1 1
837 | C1=COC=CN1 1
838 | C1NCCCCCN1 1
839 | C1C=CSNC1 1
840 | C1C=C[NH+]C=C1 1
841 | C1=NCN=C[N+]1 1
842 | C1CSCCOC1 1
843 | C1=C[N+]=CNC1 1
844 | C1=CCC=CCC1 1
845 | C1=CC=NCN1 1
846 | C1=CC=CCCC1 1
847 | C1=CCCCC/C=C/1 1
848 | C1=[NH+]NCN1 1
849 | C1CC=CS1 1
850 | C1=CC=COC=C1 1
851 | C1=NNCCP1 1
852 | C1=NC=CCNC1 1
853 | C1=NC=CSC1 1
854 | C1COCCOCCOCCOCCOCCOCCOCCO1 1
855 | C1NCNO1 1
856 | C1C[N+]CN1 1
857 | C1CCCN[P+]N1 1
858 | C1CCC/C=C/C=CCOCCCC1 1
859 | C1N=C[N-]N1 1
860 | C1C=CNCN=C1 1
861 | C1CC[S+]C1 1
862 | C1=CSC=[N+]C1 1
863 | C1=CC=[NH+]C=NN1 1
864 | C1C=[NH+]C=N1 1
865 | C1NCN1 1
866 | C1OCCCO1 1
867 | C1C[NH+]CCC[NH2+]C1 1
868 | C1SC=CS1 1
869 | C1C=CNO1 1
870 | C1C=CC=NN1 1
871 | C1CC=CCC=C1 1
872 | C1CCC[NH]CC1 1
873 | C1=CNCNN1 1
874 | C1C[N+]=CC=C1 1
875 | C1C=[NH+]CC1 1
876 | C1CCNCCCCC1 1
877 | C1N=CSS1 1
878 | C1N=N[NH]N1 1
879 | C1CSCNCCN1 1
880 | C1=CNCCS1 1
881 | C1=CCCSCC1 1
882 | C1CNC[NH+]CN1 1
883 | C1C=CNSC1 1
884 | C1C[N+]=CC=[N+]1 1
885 | C1=C\CCCC/C=C/CC/1 1
886 | C1CCCC[N+]C1 1
887 | C1CCNCCOC1 1
888 | C1=NSCN1 1
889 | C1=CC=C[NH]C1 1
890 | C1[NH]C=[NH+]CN1 1
891 | C1C=[N+]CC1 1
892 | C1COCCOCC[NH+]CCOCCOCC[NH+]1 1
893 | C1=NCSN1 1
894 | C1=[NH+]CCC[NH2+]C1 1
895 | C1C=[NH+]C=C[NH]1 1
896 | C1=CC=CCCC=CC=CCC1 1
897 | C1C=CCCCC=CC=CCC1 1
898 | C1C=CCCCCC=CCCC1 1
899 | C1=CC=CCCCC=CCCC1 1
900 | C1=NNC=[NH+]1 1
901 | C1=C/CCCCCC\C=C/CC\1 1
902 | C1=NCN=CN1 1
903 | C1CCSCCC1 1
904 | C1=CNCC[NH]1 1
905 | C1C[NH+]=CN=N1 1
906 | C1CCPN1 1
907 | C1N=CC=N1 1
908 | C1C=CNSN1 1
909 | C1[N+]CCO1 1
910 | C1COCC=C1 1
911 | C1=NCC=C[N-]1 1
912 | C1CCC[N+]1 1
913 | C1=C[NH2+]C=CC1 1
914 | C1CCC=CC=C1 1
915 | C1C=CCC=N1 1
916 | C1C=NNS1 1
917 | C1=CNN=CN1 1
918 | C1=C[NH]CSC1 1
919 | C1CNCCNN1 1
920 | C1CSOC1 1
921 | C1C/C=C\C=CCN1 1
922 | C1CN=CC1 1
923 | C1=COCCOCCOCCOCCO1 1
924 | C1=CC=NC1 1
925 | C1CCCSCNC1 1
926 | C1=NN=C[N-]1 1
927 | C1COCCOCCOCCSCCOCCO1 1
928 | C1=CN[N+]=C1 1
929 | C1CC\C=C/CCC1 1
930 | C1CN[N+]=C1 1
931 | C1CNC[NH+]1 1
932 | C1CCCCSC1 1
933 | C1C=[NH+]C=NC1 1
934 | C1C[NH+]CCNN1 1
935 | C1=C[N+]=C[NH]1 1
936 | C1=CC=[NH+]CCN1 1
937 | C1C=N[N+]=C1 1
938 | C1COC[N-]1 1
939 | C1=C[NH+]CC1 1
940 | C1=CNC[NH2+]C1 1
941 | C1=C[S+]=CS1 1
942 | C1=NNCSCC1 1
943 | C1=C[N-]CN=C1 1
944 | C1=NN=N[NH]1 1
945 | C1CCCCNCCC1 1
946 | C1=C[NH2+]CCN=C1 1
947 | C1=C[NH2+]NC1 1
948 | C1CC[N+]N1 1
949 | C1=C/CCCC/C=C/CC/1 1
950 | C1C=NCC=C1 1
951 | C1=NCC=C1 1
952 | C1=CSC=CS1 1
953 | C1C[NH2+]CCOC1 1
954 | C1NNCCNN1 1
955 | C1CN=CO1 1
956 | C1=C[NH2+]CCC1 1
957 | C1=COC[N+]1 1
958 | C1=CSCC=C1 1
959 | C1CN=[NH+]C1 1
960 | C1CC=CCCC1 1
961 | C1NN=NS1 1
962 | C1=N\CC/N=C\CC/1 1
963 | C1NCCC=[N+]1 1
964 | C1=CCC=CC=C1 1
965 | C1CCSCC[NH2+]1 1
966 | C1CC[N+]C=N1 1
967 | C1NCC=[N+]1 1
968 | C1C[NH]C=C[NH]1 1
969 | C1CC[NH+]=N1 1
970 | C1=NC=[O+]C=N1 1
971 | C1=NCC=CN1 1
972 | C1CN=CSC1 1
973 | C1NNC=[NH+]1 1
974 | C1=C[NH+]=CSC1 1
975 | C1C=CNC=CN1 1
976 | C1C[NH+]=NC1 1
977 | C1CSSCCSS1 1
978 | C1CCCOCCCCO1 1
979 | C1=CNN=CC1 1
980 | C1C=NSN=C1 1
981 | C1C[NH+]CC[NH+]C1 1
982 | C1=CC[NH]C1 1
983 | C1C=NCCCN1 1
984 | C1CNC=NC1 1
985 | C1CSCCSCCS1 1
986 | C1C=NNCS1 1
987 | C1NC=CC=[NH+]1 1
988 | C1C=CC[N+]CC1 1
989 | C1CSC=NN1 1
990 | C1=[NH+]NCCN1 1
991 | C1=CN=P[NH+]=C1 1
992 | C1COPN1 1
993 | C1=NNSC1 1
994 | C1C[NH+]=NCN1 1
995 | C1N=CSN1 1
996 | C1=NNC=[N+]1 1
997 | C1CCNC=NN1 1
998 | C1CSNS1 1
999 | C1COSO1 1
1000 | C1=CC=CCSCC=NCCSC1 1
1001 | C1C=CCCSCC=NCCSC1 1
1002 | N1NNN1 1
1003 | C1CSSCCNN1 1
1004 | C1=C[NH]COC1 1
1005 | C1=CN=SCC1 1
1006 | C1CCC=NCC1 1
1007 | C1=CC[NH2+]CCC1 1
1008 | C1CCNNCC1 1
1009 | C1=CCNC=CC1 1
1010 | C1=CCCOCC1 1
1011 | C1=[NH+]CCNCC1 1
1012 | C1CN[PH]O1 1
1013 | C1CO[PH]O1 1
1014 | C1C=[N+]CCCC1 1
1015 | C1CCOCCCCCCOC1 1
1016 | C1\C=C/CNCCC1 1
1017 | C1=CSCNN1 1
1018 | C1=NNC[NH2+]C1 1
1019 | C1CC[NH]NC1 1
1020 | C1C=CC=NC1 1
1021 | C1CSNN1 1
1022 | C1=N[NH+]=CNC1 1
1023 | C1=NC=CC1 1
1024 | C1=NCSCC1 1
1025 | C1C[NH+]CC[N+]1 1
1026 | C1=[N+]C[N+]=C1 1
1027 | C1CSCN[NH]1 1
1028 | C1COC=CCN1 1
1029 | C1=C[N+]=CC=N1 1
1030 | C1=NCCCCO1 1
1031 | C1=CNC[NH+]=C1 1
1032 | C1COCC[NH2+]CCOCCNCCCCCN1 1
1033 | C1CC[N+]NC1 1
1034 | C1=CNCC[NH+]=C1 1
1035 | C1CSCCSCCCSCCSC1 1
1036 | C1NN[NH2+]N1 1
1037 | C1=NNCN=N1 1
1038 | C1NC[NH+]CS1 1
1039 | C1=[NH+]C=CCN1 1
1040 | C1CCOPN1 1
1041 | C1=CSC[N-]1 1
1042 | C1CCC=CS1 1
1043 | C1C=[N+]C=CN1 1
1044 | C1=CC=COCC1 1
1045 | C1OCCS1 1
1046 | C1C=CC[N+]1 1
1047 | C1=NC[NH+]=C1 1
1048 | C1=NC=N[N+]=C1 1
1049 | C1CNC[N+]=C1 1
1050 | C1=NCC[N+]=C1 1
1051 | C1=NCCCO1 1
1052 | C1CNC=[NH+]1 1
1053 | C1C[NH+]SN1 1
1054 | C1NCOCN1 1
1055 | C1CC=CC=CN1 1
1056 | C1CC[N+]C=CN1 1
1057 | C1=NCCN=CC1 1
1058 | C1COCNN1 1
1059 | C1CNCNN1 1
1060 | C1=CN=[N+]C=C1 1
1061 | C1CCC=C[N-]1 1
1062 | C1CN=NC=C1 1
1063 | C1=[N+]C=[NH+]CC1 1
1064 | C1CNC[N+]C1 1
1065 | C1=C[N+]=NC=N1 1
1066 | C1=[NH+]C[NH][NH]1 1
1067 | C1C=CC=CCN1 1
1068 | C1COCCOCCOCC[NH+]1 1
1069 | C1=NC=[NH+]C1 1
1070 | C1=C[NH+]=COC1 1
1071 | C1C[NH2+]CC[NH2+]1 1
1072 | C1=CCCSCCC1 1
1073 | C1NOCNO1 1
1074 | C1=C\CNC/C=C\CNC/1 1
1075 | C1COCCOCC[NH2+]CCOCCOCCN1 1
1076 | C1=C[N+]CN=C1 1
1077 | C1=NCOCS1 1
1078 | C1CNCC[N-]1 1
1079 | C1=NCC=[NH+]C1 1
1080 | C1C=COCCN1 1
1081 | C1COCCOCCOCCOCC[NH2+]CCO1 1
1082 | C1CCC=NC=N1 1
1083 | C1C[N-]SCCSO1 1
1084 | C1C=CNC=N1 1
1085 | C1=C[N+]=CCC1 1
1086 | C1CCNCCCN1 1
1087 | C1CC=NNCN1 1
1088 | C1NNN=[NH+]1 1
1089 | C1=NC=CC[N-]1 1
1090 | C1C[O+]=CC=N1 1
1091 | C1[N-]CCS1 1
1092 | C1COC=[NH+]1 1
1093 | C1=CC=NCC[NH2+]1 1
1094 | C1=C[NH2+]CCCN1 1
1095 | C1C=NCC[NH]C1 1
1096 | C1=NCCP1 1
1097 | C1CC[NH+]NC1 1
1098 | C1N[NH+]1 1
1099 | C1NNC=CS1 1
1100 | C1=CC[NH+]=C1 1
1101 | C1=CCCC=CC1 1
1102 | C1C=NNCC1 1
1103 | C1C[N-]NC1 1
1104 | C1CC/C=C/CCCCCCCCO1 1
1105 | C1NCN[NH]1 1
1106 | C1CCCCOCCCCCC1 1
1107 | C1N=[NH+]CN1 1
1108 | C1CN=CC=C1 1
1109 | C1=NC=CPN1 1
1110 | C1=CCC\C=C/CCCC1 1
1111 | C1=NCSS1 1
1112 | C1COCCOCCOCCOCCOCCO1 1
1113 | C1=CSC1 1
1114 | C1NSC=[N+]1 1
1115 | C1SCCCS1 1
1116 | C1=NN=COC1 1
1117 | C1=NCC=CO1 1
1118 | C1C[N-]CC1 1
1119 | C1[NH]CN[NH]1 1
1120 | C1C=C[NH+]=N1 1
1121 | C1C[NH+]CCNCC[NH+]CCNCC[NH+]CCN1 1
1122 | C1N=CCC=N1 1
1123 | C1C=CCNCN1 1
1124 | C1=C[N-]N=C1 1
1125 | C1CCC[NH+]CCC=CC1 1
1126 | C1C[NH]C=[N+]C1 1
1127 | C1=CNPC=C1 1
1128 | C1=NCNPN1 1
1129 | C1=CSC=NC1 1
1130 | C1C[NH+]=CCCN1 1
1131 | C1=CSCC=[NH+]1 1
1132 | C1COCCOCCOCCOCCN1 1
1133 | C1CCNS1 1
1134 | C1=CC[N+]C1 1
1135 | C1C[NH2+]CC[NH+]1 1
1136 | C1=CCCC[NH2+]C1 1
1137 | C1CSNC1 1
1138 | C1=C[NH+]CCN1 1
1139 | C1CN=[NH+]CC1 1
1140 | C1=CC=[O+]C1 1
1141 | C1N=NN[N+]1 1
1142 | C1CCOPO1 1
1143 | C1=NCCNN1 1
1144 | C1COCOO1 1
1145 | C1CCOOCC1 1
1146 | C1CCC[N+]C=N1 1
1147 | C1C[NH+]=COC1 1
1148 | C1NC=CSCCO1 1
1149 |
--------------------------------------------------------------------------------
/src/chemutils.py:
--------------------------------------------------------------------------------
1 | import rdkit
2 | from rdkit import Chem, DataStructs
3 | from rdkit.Chem import AllChem
4 | from rdkit.Chem import Draw
5 | from functools import reduce
6 | from tqdm import tqdm
7 | from copy import deepcopy
8 | import numpy as np
9 | import torch
10 | from torch.autograd import Variable
11 | torch.manual_seed(4)
12 | np.random.seed(1)
13 | import random
14 | random.seed(1)
15 |
16 | '''
17 | 1. vocabulary: find frequent words (atom and ring)
18 | 2. graph2tree
19 | 3. generate smiles set
20 | 4. chemical utility
21 | Tanimoto similarity
22 | canonicalize smiles
23 | is valid
24 | 5. score modifier
25 | logp_modifier [-inf, inf] -> [0,1]
26 |
27 | qed_logp_jnk_gsk_fusion
28 | qed, logp, jnk, gsk -> [0,1]
29 |
30 |
31 | '''
def sigmoid(float_x):
    """Logistic function: ``1 / (1 + exp(-float_x))``."""
    denominator = 1 + np.exp(-float_x)
    return 1.0 / denominator
34 |
35 | from scipy.stats import gmean
36 |
def logp_modifier(logp_score):
    """Map a logP score from [-inf, inf] into [0, 1].

    The score is rescaled linearly as ``(logp_score + 10) / 14`` and then
    clipped, so values at or below -10 give 0.0 and values at or above 4
    give 1.0.

    Args:
        logp_score: raw logP value (any real number).

    Returns:
        The rescaled score, clipped to the closed interval [0, 1].
    """
    # The arithmetic is kept exactly as before so results are bit-identical.
    return max(0.0, min(1.0, 1/14*(logp_score+10)))
42 |
def docking_modifier(docking_score):
    """Rescale a docking score from [-12, -4] to [0, 1], clipping outside.

    -12 maps to 1 and -4 maps to 0; scores beyond either end are clipped
    to the nearest bound.
    """
    scaled = 1 / (12 - 4) * (-docking_score - 4)
    return min(max(scaled, 0.0), 1.0)
53 |
def qed_logp_fusion(qed_score, logp_score, jnk_score, gsk_score):
    """Geometric mean of the QED score and the rescaled logP score, capped at 1.0.

    ``jnk_score`` and ``gsk_score`` are accepted but do not enter the result.

    NOTE: a two-argument ``qed_logp_fusion`` is defined further down in this
    module and rebinds the name, so this four-argument version is shadowed
    once the module has finished importing.
    """
    components = [qed_score, logp_modifier(logp_score)]
    return min(1.0, gmean(components))
59 |
def logp_jnk_gsk_fusion(logp_score, jnk_score, gsk_score):
    """Arithmetic mean of the rescaled logP score, ``jnk_score`` and ``gsk_score``."""
    components = [logp_modifier(logp_score), jnk_score, gsk_score]
    return np.mean(components)
63 |
64 |
def qed_logp_jnk_gsk_fusion(qed_score, logp_score, jnk_score, gsk_score):
    """Geometric mean of QED, rescaled logP, ``jnk_score`` and ``gsk_score``, capped at 1.0."""
    components = [qed_score, logp_modifier(logp_score), jnk_score, gsk_score]
    return min(1.0, gmean(components))
70 |
def qed_logp_jnk_gsk_fusion2(qed_score, logp_score, jnk_score, gsk_score):
    """Arithmetic-mean variant: average of QED, rescaled logP, ``jnk_score`` and ``gsk_score``."""
    components = [qed_score, logp_modifier(logp_score), jnk_score, gsk_score]
    return np.mean(components)
74 |
def qed_logp_fusion(qed_score, logp_score):
    """Geometric mean of the QED score and the rescaled logP score, capped at 1.0."""
    components = [qed_score, logp_modifier(logp_score)]
    return min(1.0, gmean(components))
80 |
def jnk_gsk_fusion(jnk_score, gsk_score):
    """Geometric mean of ``jnk_score`` and ``gsk_score``, capped at 1.0."""
    fused = gmean([jnk_score, gsk_score])
    return min(1.0, fused)
85 |
86 |
def load_vocabulary(datafile="data/vocabulary.txt"):
    """Read the substructure vocabulary from a whitespace-separated text file.

    Each non-blank line is expected to start with the vocabulary word
    (an atom symbol or a ring SMILES); anything after the first
    whitespace-separated token on the line is ignored.

    Args:
        datafile: path of the vocabulary file. Defaults to the path the
            module has always used, resolved against the current working
            directory.

    Returns:
        List of vocabulary words in file order.
    """
    vocabulary = []
    with open(datafile, 'r') as fin:
        for line in fin:
            tokens = line.split()
            # Skip blank / whitespace-only lines (e.g. a trailing empty line)
            # instead of failing with IndexError on tokens[0].
            if tokens:
                vocabulary.append(tokens[0])
    return vocabulary
93 |
# Substructure vocabulary, loaded once when the module is imported.
vocabulary = load_vocabulary()
# Bond orders available when attaching a new atom or fragment.
bondtype_list = [
    Chem.rdchem.BondType.SINGLE,
    Chem.rdchem.BondType.DOUBLE,
]
96 |
97 |
def ith_substructure_is_atom(i):
    """Return True if the i-th vocabulary word is a single atom, not a ring.

    Args:
        i: index into the module-level ``vocabulary`` list.

    Returns:
        True for atom words, False for ring words.

    Raises:
        IndexError: if ``i`` is outside the vocabulary.
    """
    substructure = vocabulary[i]
    # Ring words in this module are produced by MolFragmentToSmiles on a ring
    # and therefore carry a ring-closure digit; atom words are produced by
    # atom.GetSymbol() and never do. Testing for digits (rather than
    # len(word) == 1) also classifies two-letter symbols such as "Cl" and
    # "Br" as atoms.
    return not any(char.isdigit() for char in substructure)
101 |
def word2idx(word):
    """Return the position of ``word`` in the module-level ``vocabulary``.

    Raises ValueError (from ``list.index``) when the word is not present.
    """
    position = vocabulary.index(word)
    return position
104 |
105 |
106 | # def smiles2fingerprint(smiles):
107 | # mol = Chem.MolFromSmiles(smiles)
108 | # fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048, useChirality=False)
109 | # return np.array(fp)
110 | # ### shape: (2048,)
111 |
def smiles2fingerprint(smiles):
    """Return the radius-2, 1024-bit Morgan fingerprint of ``smiles``.

    The bit vector is converted to a NumPy array of shape (1024,).
    Chirality is not used.
    """
    molecule = Chem.MolFromSmiles(smiles)
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(
        molecule, 2, nBits=1024, useChirality=False
    )
    return np.array(bit_vector)
117 |
118 |
119 | ## similarity of two SMILES
def similarity(a, b):
    """Tanimoto similarity of two SMILES strings.

    Uses radius-2, 2048-bit Morgan fingerprints without chirality.
    Returns 0.0 when either input is None or cannot be parsed by RDKit.
    """
    if a is None or b is None:
        return 0.0
    molecules = [Chem.MolFromSmiles(a), Chem.MolFromSmiles(b)]
    if any(molecule is None for molecule in molecules):
        return 0.0
    fp_a, fp_b = [
        AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048, useChirality=False)
        for molecule in molecules
    ]
    return DataStructs.TanimotoSimilarity(fp_a, fp_b)
130 |
131 |
def similarity_matrix(smiles_lst):
    """Pairwise Tanimoto similarity matrix for a list of SMILES strings.

    Returns an (n, n) symmetric array with ones on the diagonal, where
    entry (i, j) is the Tanimoto similarity of the radius-2, 2048-bit
    Morgan fingerprints of molecules i and j.
    """
    count = len(smiles_lst)
    matrix = np.eye(count)
    molecules = [Chem.MolFromSmiles(smiles) for smiles in smiles_lst]
    fingerprints = [
        AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048, useChirality=False)
        for molecule in molecules
    ]
    # Only the upper triangle is computed; each value is mirrored.
    for row, fp_row in enumerate(fingerprints):
        for col in range(row + 1, count):
            value = DataStructs.TanimotoSimilarity(fp_row, fingerprints[col])
            matrix[row, col] = value
            matrix[col, row] = value
    return matrix
144 |
145 |
def canonical(smiles):
    """Return the canonical isomeric SMILES for ``smiles``, or None.

    Args:
        smiles: SMILES string to canonicalize.

    Returns:
        Canonical SMILES (``isomericSmiles=True``), or None when RDKit
        raises while parsing or returns no molecule.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit; only ordinary parsing errors should map to None.
        return None
    if mol is None:
        return None
    return Chem.MolToSmiles(mol, isomericSmiles=True)  ### todo double check
155 |
156 |
def smiles2mol(smiles):
    """Parse ``smiles`` into a kekulized RDKit molecule.

    Args:
        smiles: SMILES string.

    Returns:
        The molecule after ``Chem.Kekulize`` has been applied in place, or
        None when RDKit raises while parsing or returns no molecule.
        Errors raised by ``Chem.Kekulize`` itself are not caught here.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # Was a bare ``except:``; do not swallow KeyboardInterrupt/SystemExit.
        return None
    if mol is None:
        return None
    Chem.Kekulize(mol)
    return mol
166 |
167 | ## input: smiles, output: word lst;
def smiles2word(smiles):
    """Split a molecule into its vocabulary words.

    Returns a list containing the kekulized SMILES of every ring reported
    by ``Chem.GetSymmSSSR`` followed by the element symbol of every atom
    that is not in a ring, or None when ``smiles`` cannot be parsed.
    """
    mol = smiles2mol(smiles)
    if mol is None:
        return None

    rings = [list(ring) for ring in Chem.GetSymmSSSR(mol)]
    ring_words = [
        Chem.MolFragmentToSmiles(mol, ring, kekuleSmiles=True) for ring in rings
    ]
    atom_words = [
        atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()
    ]
    return ring_words + atom_words
181 |
182 | ## is_valid_smiles
def is_valid(smiles):
    """Return True if every ring and non-ring atom of ``smiles`` is in the vocabulary.

    Args:
        smiles: SMILES string to check.

    Returns:
        False when the SMILES cannot be parsed (``smiles2word`` returns
        None) or when any of its words is missing from ``vocabulary``;
        True otherwise.
    """
    word_lst = smiles2word(smiles)
    if word_lst is None:
        # Unparsable SMILES used to crash with TypeError in set(None).
        return False
    return set(word_lst).issubset(vocabulary)
187 |
188 |
def is_valid_mol(mol):
    """Check that ``mol`` survives a round trip through SMILES.

    Args:
        mol: RDKit molecule (possibly an edited ``RWMol``).

    Returns:
        False when writing SMILES raises, the SMILES is blank, or
        re-parsing gives no molecule or one with zero atoms; True otherwise.
    """
    try:
        smiles = Chem.MolToSmiles(mol)
    except Exception:
        # Was a bare ``except:``; do not swallow KeyboardInterrupt/SystemExit.
        return False
    if smiles.strip() == '':
        return False
    reparsed = Chem.MolFromSmiles(smiles)
    return reparsed is not None and reparsed.GetNumAtoms() != 0
200 |
def substr_num(smiles):
    """Return the number of rings ``Chem.GetSymmSSSR`` reports for ``smiles``."""
    mol = smiles2mol(smiles)
    ring_count = sum(1 for _ in Chem.GetSymmSSSR(mol))
    return ring_count
205 |
206 |
def smiles2substrs(smiles):
    """Return the vocabulary indices of all substructures of ``smiles``.

    Ring indices come first (in ``Chem.GetSymmSSSR`` order), followed by
    one index per non-ring atom (in atom order). Returns None when the
    SMILES is not valid with respect to the vocabulary or cannot be parsed.
    """
    if not is_valid(smiles):
        return None
    mol = smiles2mol(smiles)
    if mol is None:
        return None

    # e.g. rings == [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15]]
    rings = [list(ring) for ring in Chem.GetSymmSSSR(mol)]
    # e.g. ring SMILES: C1=CC=CC=C1, C1=COCC1, C1=CC=CC=C1
    idx_lst = [
        word2idx(Chem.MolFragmentToSmiles(mol, ring, kekuleSmiles=True))
        for ring in rings
    ]
    nonring_symbols = [
        atom.GetSymbol() for atom in mol.GetAtoms() if not atom.IsInRing()
    ]
    idx_lst.extend(word2idx(symbol) for symbol in nonring_symbols)
    return idx_lst
228 |
229 |
230 |
def smiles2graph(smiles):
    """Turn a molecule into a graph whose nodes are substructures.

    Nodes are the rings reported by ``Chem.GetSymmSSSR`` followed by the
    atoms that are not in any ring; N is the number of such substructures
    and d the vocabulary size.

    Returns None when ``smiles`` is not valid with respect to the
    vocabulary or cannot be parsed. Otherwise returns the tuple

        idx_lst               [N] vocabulary index of each substructure
        node_mat              [N, d] one-hot rows built from idx_lst
        substructure_lst      ring atom-index lists followed by non-ring atom indices
        atomidx_2substridx    dict: atom index -> substructure index
        adjacency_matrix      [N, N] 0/1 int32 matrix
        leaf_extend_idx_pair  [(leaf_idx, N + k), ...] for the k-th leaf
                              (a node with exactly one neighbour)
    """
    ### 0. smiles -> mol
    if not is_valid(smiles):
        return None
    mol = smiles2mol(smiles)
    if mol is None:
        return None

    ### 1. idx_lst & node_mat
    # e.g. clique_lst == [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15]]
    clique_lst = [list(ring) for ring in Chem.GetSymmSSSR(mol)]
    idx_lst = [
        word2idx(Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True))
        for clique in clique_lst
    ]
    nonring = [
        (atom.GetIdx(), atom.GetSymbol())
        for atom in mol.GetAtoms()
        if not atom.IsInRing()
    ]
    # e.g. [0, 1, 2, 3, 11, 12, 13, 14, 21]: non-ring atom indices in the molecule
    atom_idx_not_in_rings_list = [atom_idx for atom_idx, _ in nonring]
    idx_lst.extend(word2idx(symbol) for _, symbol in nonring)
    # e.g. idx_lst == [3, 68, 3, 0, 0, 0, 0, 0, 0, 1, 2, 4]

    d = len(vocabulary)
    N = len(idx_lst)
    node_mat = np.zeros((N, d))
    for row, word_idx in enumerate(idx_lst):
        node_mat[row, word_idx] = 1

    ### 2. substructure_lst & atomidx_2substridx (atom index -> substructure index)
    substructure_lst = clique_lst + atom_idx_not_in_rings_list
    atomidx_2substridx = {}
    for sub_idx, substructure in enumerate(substructure_lst):
        # Rings are lists of atom indices, non-ring atoms are bare ints.
        members = substructure if isinstance(substructure, list) else [substructure]
        for atom_idx in members:
            # An atom shared by several rings keeps the last ring's index.
            atomidx_2substridx[atom_idx] = sub_idx

    ### 3. adjacency_matrix
    adjacency_matrix = np.zeros((N, N), dtype=np.int32)

    ####### 3.1 atom-atom bonds and atom-ring bonds (every bond outside a ring)
    for bond in mol.GetBonds():
        if bond.IsInRing():
            continue
        u = atomidx_2substridx[bond.GetBeginAtom().GetIdx()]
        v = atomidx_2substridx[bond.GetEndAtom().GetIdx()]
        adjacency_matrix[u, v] = 1
        adjacency_matrix[v, u] = 1

    ####### 3.2 ring-ring connection (rings that share at least one atom)
    ring_atom_sets = [set(clique) for clique in clique_lst]
    ring_total = len(ring_atom_sets)
    for i1 in range(ring_total):
        for i2 in range(i1 + 1, ring_total):
            if ring_atom_sets[i1] & ring_atom_sets[i2]:
                adjacency_matrix[i1, i2] = 1
                adjacency_matrix[i2, i1] = 1
    assert np.sum(adjacency_matrix)>=2*(N-1)

    ### 4. leaves and the extra index paired with each of them
    degree = np.sum(adjacency_matrix, 1)
    leaf_idx_lst = list(np.where(degree == 1)[0])
    # e.g. [(3, 12), (5, 13), (6, 14), (9, 15), (11, 16)]
    leaf_extend_idx_pair = [
        (leaf_idx, N + offset) for offset, leaf_idx in enumerate(leaf_idx_lst)
    ]

    return idx_lst, node_mat, substructure_lst, atomidx_2substridx, adjacency_matrix, leaf_extend_idx_pair
317 |
318 |
def smiles2feature(smiles):
    """Build a masked-leaf example from a molecule.

    (1) The molecule is decomposed into substructures (rings from
        ``Chem.GetSymmSSSR`` followed by non-ring atoms) and a 0/1
        adjacency matrix between them is built.
    (2) One leaf node (a node with exactly one neighbour) is chosen with
        ``random.choice`` and masked: its row in ``node_mat`` is one-hot
        in the extra last column instead of its vocabulary column.

    Args:
        smiles: SMILES string.

    Returns:
        ``(node_mat, adjacency_matrix, mask_idx, label)`` where ``node_mat``
        is [N, d + 1], ``adjacency_matrix`` is [N, N] int32, ``mask_idx`` is
        the masked node and ``label`` its vocabulary index; or None when the
        SMILES is not valid with respect to the vocabulary, cannot be parsed,
        or has no leaf node to mask (e.g. a single ring or a single atom).

    Raises:
        AssertionError: if the adjacency matrix has fewer than 2 * (N - 1)
            non-zero entries.
    """
    ### 0. smiles -> mol
    if not is_valid(smiles):
        return None
    mol = smiles2mol(smiles)
    if mol is None:
        return None

    ### 1. idx_lst: vocabulary index of every substructure, rings first
    # e.g. clique_lst == [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15]]
    clique_lst = [list(ring) for ring in Chem.GetSymmSSSR(mol)]
    idx_lst = [
        word2idx(Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True))
        for clique in clique_lst
    ]
    nonring = [
        (atom.GetIdx(), atom.GetSymbol())
        for atom in mol.GetAtoms()
        if not atom.IsInRing()
    ]
    atom_idx_not_in_rings_list = [atom_idx for atom_idx, _ in nonring]
    idx_lst.extend(word2idx(symbol) for _, symbol in nonring)
    d = len(vocabulary)
    N = len(idx_lst)

    ### 2. map from atom index to substructure index
    substructure_lst = clique_lst + atom_idx_not_in_rings_list
    atomidx_2substridx = {}
    for sub_idx, substructure in enumerate(substructure_lst):
        # Rings are lists of atom indices, non-ring atoms are bare ints.
        members = substructure if isinstance(substructure, list) else [substructure]
        for atom_idx in members:
            atomidx_2substridx[atom_idx] = sub_idx

    ### 3. adjacency_matrix
    adjacency_matrix = np.zeros((N, N), dtype=np.int32)
    ####### 3.1 atom-atom bonds and atom-ring bonds (every bond outside a ring)
    for bond in mol.GetBonds():
        if bond.IsInRing():
            continue
        u = atomidx_2substridx[bond.GetBeginAtom().GetIdx()]
        v = atomidx_2substridx[bond.GetEndAtom().GetIdx()]
        adjacency_matrix[u, v] = 1
        adjacency_matrix[v, u] = 1
    ####### 3.2 ring-ring connection (rings that share at least one atom)
    ring_atom_sets = [set(clique) for clique in clique_lst]
    ring_total = len(ring_atom_sets)
    for i1 in range(ring_total):
        for i2 in range(i1 + 1, ring_total):
            if ring_atom_sets[i1] & ring_atom_sets[i2]:
                adjacency_matrix[i1, i2] = 1
                adjacency_matrix[i2, i1] = 1
    assert np.sum(adjacency_matrix)>=2*(N-1)

    ### 4. choose the leaf to mask
    leaf_idx_lst = list(np.where(np.sum(adjacency_matrix, 1) == 1)[0])
    if not leaf_idx_lst:
        # random.choice([]) would raise IndexError; a molecule with no leaf
        # cannot yield a masked example, so report it like other unusable input.
        return None
    mask_idx = random.choice(leaf_idx_lst)
    label = idx_lst[mask_idx]

    ### 5. node features: one-hot vocabulary column, or column d for the masked node
    node_mat = np.zeros((N, d + 1))
    for row, word_idx in enumerate(idx_lst):
        node_mat[row, d if row == mask_idx else word_idx] = 1

    return node_mat, adjacency_matrix, mask_idx, label
394 |
395 |
396 |
def smiles2expandfeature(smiles):
    """Build one candidate-expansion example per substructure of a molecule.

    The molecule is decomposed into N substructures (rings from
    ``Chem.GetSymmSSSR`` followed by non-ring atoms). For every existing
    node ``idx`` a copy of the graph is produced in which an extra node
    (index N, one-hot in the extra last feature column) is connected to
    ``idx``.

    Args:
        smiles: SMILES string.

    Returns:
        List of N tuples ``(node_mat, adjacency_matrix, N)`` with
        ``node_mat`` of shape [N + 1, d + 1] and ``adjacency_matrix`` of
        shape [N + 1, N + 1] (int32), or None when the SMILES is not valid
        with respect to the vocabulary or cannot be parsed.
    """
    ### 0. smiles -> mol
    if not is_valid(smiles):
        return None
    mol = smiles2mol(smiles)
    if mol is None:
        return None

    ### 1. idx_lst: vocabulary index of every substructure, rings first
    # e.g. clique_lst == [[4, 23, 22, 7, 6, 5], [8, 7, 22, 10, 9], [16, 17, 18, 19, 20, 15]]
    clique_lst = [list(ring) for ring in Chem.GetSymmSSSR(mol)]
    idx_lst = [
        word2idx(Chem.MolFragmentToSmiles(mol, clique, kekuleSmiles=True))
        for clique in clique_lst
    ]
    nonring = [
        (atom.GetIdx(), atom.GetSymbol())
        for atom in mol.GetAtoms()
        if not atom.IsInRing()
    ]
    atom_idx_not_in_rings_list = [atom_idx for atom_idx, _ in nonring]
    idx_lst.extend(word2idx(symbol) for _, symbol in nonring)
    d = len(vocabulary)
    N = len(idx_lst)

    ### 2. map from atom index to substructure index
    substructure_lst = clique_lst + atom_idx_not_in_rings_list
    atomidx_2substridx = {}
    for sub_idx, substructure in enumerate(substructure_lst):
        # Rings are lists of atom indices, non-ring atoms are bare ints.
        members = substructure if isinstance(substructure, list) else [substructure]
        for atom_idx in members:
            atomidx_2substridx[atom_idx] = sub_idx

    ### 3. adjacency_matrix, with one spare row/column (index N) for the new node
    adjacency_matrix = np.zeros((N + 1, N + 1), dtype=np.int32)
    ####### 3.1 atom-atom bonds and atom-ring bonds (every bond outside a ring)
    for bond in mol.GetBonds():
        if bond.IsInRing():
            continue
        u = atomidx_2substridx[bond.GetBeginAtom().GetIdx()]
        v = atomidx_2substridx[bond.GetEndAtom().GetIdx()]
        adjacency_matrix[u, v] = 1
        adjacency_matrix[v, u] = 1
    ####### 3.2 ring-ring connection (rings that share at least one atom)
    ring_atom_sets = [set(clique) for clique in clique_lst]
    ring_total = len(ring_atom_sets)
    for i1 in range(ring_total):
        for i2 in range(i1 + 1, ring_total):
            if ring_atom_sets[i1] & ring_atom_sets[i2]:
                adjacency_matrix[i1, i2] = 1
                adjacency_matrix[i2, i1] = 1

    # The original code drew a random leaf here and never used it, which made
    # molecules without any leaf (single ring / single atom) fail with
    # IndexError. The draw is kept only when leaves exist, so the sequence of
    # values produced by the seeded ``random`` module is unchanged for inputs
    # that already worked.
    leaf_idx_lst = list(np.where(np.sum(adjacency_matrix, 1) == 1)[0])
    if leaf_idx_lst:
        random.choice(leaf_idx_lst)

    ### 4. node features; the last row is the new node, one-hot in column d
    node_mat = np.zeros((N + 1, d + 1))
    for row, word_idx in enumerate(idx_lst):
        node_mat[row, word_idx] = 1
    node_mat[-1, d] = 1

    ### 5. one example per attachment point, each with its own array copies
    feature_lst = []
    for idx in range(N):
        new_adj_mat = adjacency_matrix.copy()
        new_adj_mat[idx, N] = 1
        new_adj_mat[N, idx] = 1
        feature_lst.append((node_mat.copy(), new_adj_mat, N))

    return feature_lst
480 |
481 |
482 |
483 |
484 |
485 |
486 |
def copy_atom(atom):
    """Create a new RDKit atom with the same symbol, formal charge and atom-map number."""
    symbol = atom.GetSymbol()
    charge = atom.GetFormalCharge()
    map_num = atom.GetAtomMapNum()

    duplicate = Chem.Atom(symbol)
    duplicate.SetFormalCharge(charge)
    duplicate.SetAtomMapNum(map_num)
    return duplicate
492 |
def add_atom_at_position(editmol, position_idx, new_atom, new_bond):
    """Attach a single new atom to ``editmol`` and return the resulting SMILES.

    Args:
        editmol: molecule to extend; it is deep-copied, not modified.
        position_idx: index of the atom in ``editmol`` to attach to.
        new_atom: element symbol of the atom to add, e.g. 'C', 'N', 'O'.
        new_bond: bond type of the new bond (SINGLE or DOUBLE).

    Returns:
        Canonical SMILES of the extended molecule, or None when the result
        fails ``is_valid_mol`` or ``UpdatePropertyCache`` raises.

    Raises:
        AssertionError: if the resulting SMILES contains '.'.
    """
    ###### 1 edit mol
    atom_to_add = Chem.rdchem.Atom(new_atom)
    rwmol = deepcopy(editmol)
    new_atom_idx = rwmol.AddAtom(atom_to_add)
    rwmol.AddBond(position_idx, new_atom_idx, order=new_bond)

    ###### 2 check valid of new mol
    if not is_valid_mol(rwmol):
        return None
    try:
        rwmol.UpdatePropertyCache()
    except Exception:
        # Was a bare ``except:``; a failing property-cache update means the
        # edit is rejected, but KeyboardInterrupt/SystemExit must still
        # propagate.
        return None

    smiles = Chem.MolToSmiles(rwmol)
    assert '.' not in smiles
    return canonical(smiles)
514 |
515 |
516 | def add_fragment_at_position(editmol, position_idx, fragment, new_bond):
517 | '''
518 | position_idx: index of edited atom in editmol
519 | fragment: e.g., "C1=CC=CC=C1", "C1=CC=NC=C1", ...
520 | new_bond: {SINGLE, DOUBLE}
521 |
522 | Return:
523 | list of SMILES
524 | '''
525 | new_smiles_set = set()
526 | fragment_mol = Chem.MolFromSmiles(fragment)
527 | current_atom = editmol.GetAtomWithIdx(position_idx)
528 | neighbor_atom_set = set() ## index of neighbor of current atom in new_mol
529 |
530 |
531 | ## (A) add a bond between atom and ring
532 | #### 1. initialize empty new_mol
533 | new_mol = Chem.RWMol(Chem.MolFromSmiles(''))
534 |
535 | #### 2. add editmol into new_mol
536 | old_idx2new_idx = dict()
537 | for atom in editmol.GetAtoms():
538 | old_idx = atom.GetIdx()
539 | new_atom = copy_atom(atom)
540 | new_idx = new_mol.AddAtom(new_atom)
541 | old_idx2new_idx[old_idx] = new_idx
542 | assert old_idx == new_idx
543 | for bond in editmol.GetBonds():
544 | a1 = bond.GetBeginAtom()
545 | a2 = bond.GetEndAtom()
546 | i1 = a1.GetIdx()
547 | i2 = a2.GetIdx()
548 | i1_new = old_idx2new_idx[i1]
549 | i2_new = old_idx2new_idx[i2]
550 | bt = bond.GetBondType()
551 | new_mol.AddBond(i1_new, i2_new, bt)
552 | ### collect the neighbor atoms of current atom, both are in ring.
553 | if (i1==position_idx or i2==position_idx) and (a1.IsInRing() and a2.IsInRing()):
554 | neighbor_atom_set.add(i1_new)
555 | neighbor_atom_set.add(i2_new)
556 | if neighbor_atom_set != set():
557 | neighbor_atom_set.remove(old_idx2new_idx[position_idx])
558 |
559 | #### 3. combine two components
560 | #### 3.1 add fragment into new_mol
561 | new_atom_idx_lst = []
562 | old_idx2new_idx2 = dict() ### fragment idx -> new mol idx
563 | for atom in fragment_mol.GetAtoms():
564 | old_atom_idx = atom.GetIdx()
565 | new_atom = copy_atom(atom)
566 | new_atom_idx = new_mol.AddAtom(new_atom)
567 | new_atom_idx_lst.append(new_atom_idx)
568 | old_idx2new_idx2[old_atom_idx] = new_atom_idx
569 | for bond in fragment_mol.GetBonds():
570 | a1 = bond.GetBeginAtom().GetIdx()
571 | a2 = bond.GetEndAtom().GetIdx()
572 | i1 = old_idx2new_idx2[a1]
573 | i2 = old_idx2new_idx2[a2]
574 | bt = bond.GetBondType()
575 | new_mol.AddBond(i1, i2, bt)
576 |
577 | #### 3.2 enumerate possible binding atoms and generate new smiles
578 | for i in new_atom_idx_lst: ### enumeration
579 | copy_mol = deepcopy(new_mol)
580 | copy_mol.AddBond(old_idx2new_idx[position_idx], i, new_bond)
581 | if is_valid_mol(copy_mol):
582 | try:
583 | copy_mol.UpdatePropertyCache()
584 | new_smiles = Chem.MolToSmiles(copy_mol)
585 | new_smiles = canonical(new_smiles)
586 | if new_smiles is not None:
587 | assert '.' not in new_smiles
588 | new_smiles_set.add(new_smiles)
589 | except:
590 | pass
591 |
592 |
593 | # if not current_atom.IsInRing() or new_bond != rdkit.Chem.rdchem.BondType.SINGLE:
594 | if not current_atom.IsInRing():
595 | return new_smiles_set
596 |
597 |
598 | # print(new_smiles_set)
599 | ## (B) share bond between rings
600 | #### 1. initialize empty new_mol
601 | new_mol = Chem.RWMol(Chem.MolFromSmiles(''))
602 |
603 | #### 2. add editmol into new_mol
604 | old_idx2new_idx = dict()
605 | for atom in editmol.GetAtoms():
606 | old_idx = atom.GetIdx()
607 | new_atom = copy_atom(atom)
608 | new_idx = new_mol.AddAtom(new_atom)
609 | old_idx2new_idx[old_idx] = new_idx
610 | assert old_idx == new_idx
611 | for bond in editmol.GetBonds():
612 | a1 = bond.GetBeginAtom().GetIdx()
613 | a2 = bond.GetEndAtom().GetIdx()
614 | i1 = old_idx2new_idx[a1]
615 | i2 = old_idx2new_idx[a2]
616 | bt = bond.GetBondType()
617 | new_mol.AddBond(i1, i2, bt)
618 |
619 | # print(Chem.MolToSmiles(new_mol))
620 | #### 3. fragment mol
621 | ####### 3.1 find 2 common atoms and 1 bond
622 | current_atom = editmol.GetAtomWithIdx(old_idx2new_idx[position_idx])
623 | current_atom_symbol = current_atom.GetSymbol()
624 |
625 | atom_lst = list(fragment_mol.GetAtoms())
626 | for neighbor_atom in neighbor_atom_set:
627 | neighbor_atom_symbol = editmol.GetAtomWithIdx(neighbor_atom).GetSymbol()
628 | bondtype_edit = new_mol.GetBondBetweenAtoms(neighbor_atom, old_idx2new_idx[position_idx]).GetBondType()
629 | for i,v in enumerate(atom_lst):
630 | v_idx = v.GetIdx()
631 | ### v1 is neighbor of v
632 | for v1 in [atom_lst[i-1], atom_lst[i+1-len(atom_lst)]]:
633 | v1_idx = v1.GetIdx()
634 | bondtype_frag = fragment_mol.GetBondBetweenAtoms(v_idx, v1_idx).GetBondType()
635 | # print("current:", current_atom_symbol, "neighbor:", neighbor_atom_symbol, bondtype_edit)
636 | # print(v.GetSymbol(), v1.GetSymbol(), bondtype_frag)
637 | if v.GetSymbol()==current_atom_symbol and v1.GetSymbol()==neighbor_atom_symbol and bondtype_edit==bondtype_frag:
638 | ####### 3.1 find 2 common atoms and 1 bond
639 | # print("2 common atoms and 1 bond ")
640 | ############################################
641 | ####### 3.2 add other atoms and bonds
642 | new_mol2 = deepcopy(new_mol)
643 | old_idx2new_idx2 = dict()
644 | old_idx2new_idx2[v_idx] = current_atom.GetIdx()
645 | old_idx2new_idx2[v1_idx] = neighbor_atom
646 | for atom in fragment_mol.GetAtoms():
647 | old_idx = atom.GetIdx()
648 | if not (old_idx==v_idx or old_idx==v1_idx):
649 | new_atom = copy_atom(atom)
650 | new_idx = new_mol2.AddAtom(new_atom)
651 | old_idx2new_idx2[old_idx] = new_idx
652 | for bond in fragment_mol.GetBonds():
653 | a1 = bond.GetBeginAtom()
654 | a2 = bond.GetEndAtom()
655 | i1 = a1.GetIdx()
656 | i2 = a2.GetIdx()
657 | i1_new = old_idx2new_idx2[i1]
658 | i2_new = old_idx2new_idx2[i2]
659 | bt = bond.GetBondType()
660 | if not (set([i1,i2]) == set([v1.GetIdx(), v.GetIdx()])):
661 | new_mol2.AddBond(i1_new, i2_new, bt)
662 | ####### 3.2 add other atoms and bonds
663 | ####### 3.3 check validity and canonicalize
664 | if not is_valid_mol(new_mol2):
665 | continue
666 | try:
667 | new_mol2.UpdatePropertyCache()
668 | # print("success")
669 | except:
670 | continue
671 | new_smiles = Chem.MolToSmiles(new_mol2)
672 | new_smiles = canonical(new_smiles)
673 | if new_smiles is not None:
674 | assert '.' not in new_smiles
675 | new_smiles_set.add(new_smiles)
676 | # print(new_smiles)
677 | # print(new_smiles_set)
678 | return new_smiles_set
679 |
680 |
681 |
def delete_substructure_at_idx(editmol, atom_idx_lst):
    '''
    Build a copy of ``editmol`` that omits the given atoms and every bond
    touching them.

    editmol: molecule to copy from; it is only read through GetAtoms() /
        GetBonds().
    atom_idx_lst: iterable of atom indices (in ``editmol``) to leave out.

    Output:
        (new_mol, old_idx2new_idx)
            new_mol           a freshly built Chem.RWMol
            old_idx2new_idx   dict: index of a kept atom in ``editmol`` ->
                              its index in ``new_mol``
        or None when the copy fails is_valid_mol() or when
        UpdatePropertyCache() raises.
    '''
    # A set keeps the per-atom and per-bond membership tests O(1).
    removed = set(atom_idx_lst)

    #### 1. initialize with empty mol
    new_mol = Chem.RWMol(Chem.MolFromSmiles(''))

    #### 2. copy every kept atom and remember where it landed
    old_idx2new_idx = dict()
    for atom in editmol.GetAtoms():
        old_idx = atom.GetIdx()
        if old_idx in removed:
            continue
        old_idx2new_idx[old_idx] = new_mol.AddAtom(copy_atom(atom))

    #### 3. copy every bond whose two end atoms were both kept
    for bond in editmol.GetBonds():
        a1 = bond.GetBeginAtom().GetIdx()
        a2 = bond.GetEndAtom().GetIdx()
        if a1 in removed or a2 in removed:
            continue
        new_mol.AddBond(old_idx2new_idx[a1], old_idx2new_idx[a2], bond.GetBondType())

    #### 4. reject chemically unusable results
    if not is_valid_mol(new_mol):
        return None
    try:
        new_mol.UpdatePropertyCache()
    except Exception:
        # Best effort: a molecule whose property cache cannot be refreshed is
        # reported as "no result" instead of propagating the RDKit error.
        return None
    return new_mol, old_idx2new_idx
713 |
714 |
715 |
716 |
717 |
718 |
def differentiable_graph2smiles_lgp(origin_smiles, differentiable_graph,
                                    leaf_extend_idx_pair, leaf_nonleaf_lst,
                                    max_num_offspring = 100, topk = 3):
    '''
    Enumerate "add" edits only: at every atom of every leaf substructure of
    ``origin_smiles``, attach the ``topk`` highest-scoring vocabulary entries
    of the leaf's extension node, once per bond type in bondtype_list.

    origin_smiles: SMILES string. It is decomposed with smiles2graph, which yields
        origin_idx_lst              [N]      0,1,...,d-1
        origin_node_mat             [N,d]
        origin_substructure_lst     per substructure an int (single atom) or a
                                    list of atom indices
        origin_atomidx_2substridx
        origin_adjacency_matrix     [N,N]    0/1
        leaf_extend_idx_pair        (leaf_idx, extend_idx) pairs
      Only origin_substructure_lst and leaf_extend_idx_pair are used here.

    differentiable_graph: (node_indicator, adjacency_weight)
        node_indicator              [N+M,d]
        adjacency_weight            [N+M,N+M]   (not used by this variant)

    N is # of substructures in the molecule
    M is # of leaf node, also number of extended node.
    d is len(vocabulary).

    leaf_extend_idx_pair: ignored -- overwritten by the pairs that
        smiles2graph(origin_smiles) returns.
    leaf_nonleaf_lst, max_num_offspring: unused.
    topk: number of highest-scoring vocabulary entries tried per extension node.

    main utility
        add_atom_at_position
        add_fragment_at_position

    Output:
        new_smiles_set, with None entries removed.
    '''
    new_smiles_set = set()

    #### 1. data preparation
    origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
    # The caller-supplied leaf_extend_idx_pair is replaced by the recomputed one.
    (_, _, origin_substructure_lst, _,
     _, leaf_extend_idx_pair) = smiles2graph(origin_smiles)
    node_indicator, _ = differentiable_graph

    ####### 2.3 add   todo: use adjacency_weight to further narrow scope
    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        leaf_atom_idx_lst = origin_substructure_lst[leaf_idx]
        if isinstance(leaf_atom_idx_lst, int):  ### int: single atom; else: list of integer
            leaf_atom_idx_lst = [leaf_atom_idx_lst]
        # Highest scores first; identical for every atom of the leaf, so it is
        # computed once per leaf.
        added_substructure_lst = list(np.argsort(-node_indicator[extend_idx]))[:topk]
        for leaf_atom_idx in leaf_atom_idx_lst:
            for substructure_idx in added_substructure_lst:
                new_substructure = vocabulary[substructure_idx]
                for new_bond in bondtype_list:
                    if ith_substructure_is_atom(substructure_idx):
                        new_smiles_set.add(
                            add_atom_at_position(editmol=origin_mol, position_idx=leaf_atom_idx,
                                                 new_atom=new_substructure, new_bond=new_bond))
                    else:
                        new_smiles_set.update(
                            add_fragment_at_position(editmol=origin_mol, position_idx=leaf_atom_idx,
                                                     fragment=new_substructure, new_bond=new_bond))

    # add_atom_at_position results are stored unfiltered; drop a None entry if present.
    new_smiles_set.discard(None)
    return new_smiles_set
772 |
773 |
774 |
775 |
776 |
def differentiable_graph2smiles_v0(origin_smiles, differentiable_graph,
                                   leaf_extend_idx_pair, leaf_nonleaf_lst,
                                   max_num_offspring = 100, topk = 3):
    '''
    Enumerate delete / replace / add edits of the leaf substructures of
    ``origin_smiles``. Deletions and replacements are not pruned; additions
    are skipped for leaves whose mean leaf<->extension edge score is below -3.

    origin_smiles: SMILES string. It is decomposed with smiles2graph, which yields
        origin_idx_lst              [N]      0,1,...,d-1
        origin_node_mat             [N,d]
        origin_substructure_lst     per substructure an int (single atom) or a
                                    list of atom indices (ring)
        origin_atomidx_2substridx
        origin_adjacency_matrix     [N,N]    0/1
        leaf_extend_idx_pair        (leaf_idx, extend_idx) pairs

    differentiable_graph: (node_indicator, adjacency_weight)
        node_indicator              [N+M,d]
        adjacency_weight            [N+M,N+M]

    N is # of substructures in the molecule
    M is # of leaf node, also number of extended node.
    d is len(vocabulary).

    leaf_extend_idx_pair: ignored -- overwritten by the pairs that
        smiles2graph(origin_smiles) returns.
    leaf_nonleaf_lst, max_num_offspring: unused.
    topk: number of highest-scoring vocabulary entries tried per node.

    main utility
        add_atom_at_position
        add_fragment_at_position
        delete_substructure_at_idx
        REPLACE = delete + add

    Output:
        new_smiles_set, with None entries removed.
    '''
    new_smiles_set = set()

    #### 1. data preparation
    origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
    # The caller-supplied leaf_extend_idx_pair is replaced by the recomputed one.
    (_, _, origin_substructure_lst, _,
     origin_adjacency_matrix, leaf_extend_idx_pair) = smiles2graph(origin_smiles)
    node_indicator, adjacency_weight = differentiable_graph

    def _as_atom_list(substructure):
        # A substructure is stored as a bare int (single atom) or a list of ints.
        return [substructure] if isinstance(substructure, int) else substructure

    def _atoms_to_delete(leaf_idx):
        # Single atom: delete it. Ring: delete only the atoms that no other
        # substructure uses (ring1 and ring2 may share 2 atoms and 1 bond).
        leaf_atoms = origin_substructure_lst[leaf_idx]
        if isinstance(leaf_atoms, int):
            return [leaf_atoms]
        shared = set()
        for i, other in enumerate(origin_substructure_lst):
            if i != leaf_idx:
                shared.update(_as_atom_list(other))
        return [atom_idx for atom_idx in leaf_atoms if atom_idx not in shared]

    def _attach(editmol, position_idx, substructure_idx, new_bond):
        # Bond one vocabulary entry onto `editmol` at `position_idx` and collect
        # the resulting SMILES (a single value for atoms, a batch for fragments).
        new_substructure = vocabulary[substructure_idx]
        if ith_substructure_is_atom(substructure_idx):
            new_smiles_set.add(
                add_atom_at_position(editmol=editmol, position_idx=position_idx,
                                     new_atom=new_substructure, new_bond=new_bond))
        else:
            new_smiles_set.update(
                add_fragment_at_position(editmol=editmol, position_idx=position_idx,
                                         fragment=new_substructure, new_bond=new_bond))

    #### 2. edit the original molecule
    ####### 2.1 delete & 2.2 replace
    for leaf_idx, _ in leaf_extend_idx_pair:
        result = delete_substructure_at_idx(editmol=origin_mol,
                                            atom_idx_lst=_atoms_to_delete(leaf_idx))
        if result is None:
            continue
        delete_mol, old_idx2new_idx = result
        delete_smiles = Chem.MolToSmiles(delete_mol)
        if delete_smiles is None or '.' in delete_smiles:
            continue  # deletion split the molecule into several pieces
        new_smiles_set.add(canonical(delete_smiles))  #### 2.1 delete done

        #### 2.2 replace  a & b
        ######### (a) get neighbor substr
        neighbor_substructures_idx = [idx for idx, value in enumerate(origin_adjacency_matrix[leaf_idx])
                                      if value == 1]
        assert len(neighbor_substructures_idx) == 1  # a leaf has exactly one neighbor
        neighbor_atom_idx_lst = _as_atom_list(origin_substructure_lst[neighbor_substructures_idx[0]])
        ######### (b) add new substructure  todo, enumerate several possibility
        added_substructure_lst = list(np.argsort(-node_indicator[leaf_idx]))[:topk]  ### topk
        for substructure_idx in added_substructure_lst:
            for new_bond in bondtype_list:
                for neighbor_atom_idx in neighbor_atom_idx_lst:
                    # Atom indices shift after deletion; translate to delete_mol's numbering.
                    _attach(delete_mol, old_idx2new_idx[neighbor_atom_idx], substructure_idx, new_bond)

    ####### 2.3 add   todo: use adjacency_weight to further narrow scope
    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        expand_prob = (adjacency_weight[leaf_idx][extend_idx] + adjacency_weight[extend_idx][leaf_idx]) / 2  ### [-inf, inf]
        if expand_prob < -3:
            continue
        leaf_atom_idx_lst = _as_atom_list(origin_substructure_lst[leaf_idx])
        # Identical for every atom of the leaf, so computed once per leaf.
        added_substructure_lst = list(np.argsort(-node_indicator[extend_idx]))[:topk]
        for leaf_atom_idx in leaf_atom_idx_lst:
            for substructure_idx in added_substructure_lst:
                for new_bond in bondtype_list:
                    _attach(origin_mol, leaf_atom_idx, substructure_idx, new_bond)

    new_smiles_set.discard(None)
    return new_smiles_set
891 |
892 |
893 |
894 |
def differentiable_graph2smiles(origin_smiles, differentiable_graph,
                                leaf_extend_idx_pair, leaf_nonleaf_lst,
                                max_num_offspring = 100, topk = 3):
    '''
    Enumerate delete / replace / add edits of the leaf substructures of
    ``origin_smiles``, pruned by the edge scores in ``differentiable_graph``.

    origin_smiles: SMILES string. It is decomposed with smiles2graph, which yields
        origin_idx_lst              [N]      0,1,...,d-1
        origin_node_mat             [N,d]
        origin_substructure_lst     per substructure an int (single atom) or a
                                    list of atom indices (ring)
        origin_atomidx_2substridx
        origin_adjacency_matrix     [N,N]    0/1
        leaf_extend_idx_pair        (leaf_idx, extend_idx) pairs

    differentiable_graph: (node_indicator, adjacency_weight)
        node_indicator              [N+M,d]
        adjacency_weight            [N+M,N+M]

    N is # of substructures in the molecule
    M is # of leaf node, also number of extended node.
    d is len(vocabulary).

    leaf_extend_idx_pair: ignored -- overwritten by the pairs that
        smiles2graph(origin_smiles) returns.
    leaf_nonleaf_lst: (leaf_idx, nonleaf_idx) pairs; a leaf whose deletion
        succeeds must be listed here (KeyError otherwise).
    max_num_offspring: unused.
    topk: number of highest-scoring vocabulary entries tried per node.

    main utility
        add_atom_at_position
        add_fragment_at_position
        delete_substructure_at_idx
        REPLACE = delete + add

    Output:
        new_smiles_set, with None entries removed.
    '''
    leaf2nonleaf = {leaf: nonleaf for leaf, nonleaf in leaf_nonleaf_lst}
    new_smiles_set = set()

    #### 1. data preparation
    origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
    # The caller-supplied leaf_extend_idx_pair is replaced by the recomputed one.
    (_, _, origin_substructure_lst, _,
     origin_adjacency_matrix, leaf_extend_idx_pair) = smiles2graph(origin_smiles)
    node_indicator, adjacency_weight = differentiable_graph

    def _as_atom_list(substructure):
        # A substructure is stored as a bare int (single atom) or a list of ints.
        return [substructure] if isinstance(substructure, int) else substructure

    def _atoms_to_delete(leaf_idx):
        # Single atom: delete it. Ring: delete only the atoms that no other
        # substructure uses (ring1 and ring2 may share 2 atoms and 1 bond).
        leaf_atoms = origin_substructure_lst[leaf_idx]
        if isinstance(leaf_atoms, int):
            return [leaf_atoms]
        shared = set()
        for i, other in enumerate(origin_substructure_lst):
            if i != leaf_idx:
                shared.update(_as_atom_list(other))
        return [atom_idx for atom_idx in leaf_atoms if atom_idx not in shared]

    def _attach(editmol, position_idx, substructure_idx, new_bond):
        # Bond one vocabulary entry onto `editmol` at `position_idx` and collect
        # the resulting SMILES (a single value for atoms, a batch for fragments).
        new_substructure = vocabulary[substructure_idx]
        if ith_substructure_is_atom(substructure_idx):
            new_smiles_set.add(
                add_atom_at_position(editmol=editmol, position_idx=position_idx,
                                     new_atom=new_substructure, new_bond=new_bond))
        else:
            new_smiles_set.update(
                add_fragment_at_position(editmol=editmol, position_idx=position_idx,
                                         fragment=new_substructure, new_bond=new_bond))

    #### 2. edit the original molecule
    ####### 2.1 delete & 2.2 replace
    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        result = delete_substructure_at_idx(editmol=origin_mol,
                                            atom_idx_lst=_atoms_to_delete(leaf_idx))
        if result is None:
            continue
        delete_mol, old_idx2new_idx = result
        delete_smiles = Chem.MolToSmiles(delete_mol)
        if delete_smiles is None or '.' in delete_smiles:
            continue  # deletion split the molecule into several pieces
        delete_smiles = canonical(delete_smiles)
        nonleaf_idx = leaf2nonleaf[leaf_idx]
        # Mean of the two directed edge scores between the leaf and its parent.
        shrink_prob = (adjacency_weight[leaf_idx, nonleaf_idx] + adjacency_weight[nonleaf_idx, leaf_idx]) / 2
        if shrink_prob > -3:  ### threshold on the raw score; sigmoid(-3) is about 0.047
            new_smiles_set.add(delete_smiles)
        #### 2.1 delete done

        #### 2.2 replace  a & b
        ######### (a) get neighbor substr
        neighbor_substructures_idx = [idx for idx, value in enumerate(origin_adjacency_matrix[leaf_idx])
                                      if value == 1]
        assert len(neighbor_substructures_idx) == 1  # a leaf has exactly one neighbor
        neighbor_atom_idx_lst = _as_atom_list(origin_substructure_lst[neighbor_substructures_idx[0]])
        ######### (b) add new substructure  todo, enumerate several possibility
        added_substructure_lst = list(np.argsort(-node_indicator[leaf_idx]))[:topk]
        for substructure_idx in added_substructure_lst:
            for new_bond in bondtype_list:
                for neighbor_atom_idx in neighbor_atom_idx_lst:
                    # Atom indices shift after deletion; translate to delete_mol's numbering.
                    _attach(delete_mol, old_idx2new_idx[neighbor_atom_idx], substructure_idx, new_bond)

        # NOTE(review): this is an early *return*, not a `continue`: the first
        # leaf whose mean leaf<->extension score is below -3 ends the whole
        # enumeration, skipping the remaining leaves and step 2.3. The nesting
        # of this check was reconstructed from whitespace-stripped source --
        # confirm that it belongs inside the per-leaf loop.
        expand_prob = (adjacency_weight[leaf_idx, extend_idx] + adjacency_weight[extend_idx, leaf_idx]) / 2
        if expand_prob < -3:
            new_smiles_set.discard(None)
            return new_smiles_set

    ####### 2.3 add   todo: use adjacency_weight to further narrow scope
    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        expand_prob = (adjacency_weight[leaf_idx][extend_idx] + adjacency_weight[extend_idx][leaf_idx]) / 2  ### [-inf, inf]
        if expand_prob < -3:
            continue
        leaf_atom_idx_lst = _as_atom_list(origin_substructure_lst[leaf_idx])
        # Identical for every atom of the leaf, so computed once per leaf.
        added_substructure_lst = list(np.argsort(-node_indicator[extend_idx]))[:topk]
        for leaf_atom_idx in leaf_atom_idx_lst:
            for substructure_idx in added_substructure_lst:
                for new_bond in bondtype_list:
                    _attach(origin_mol, leaf_atom_idx, substructure_idx, new_bond)

    new_smiles_set.discard(None)
    return new_smiles_set
1021 |
1022 |
1023 |
1024 |
def differentiable_graph2smiles_sample(origin_smiles, differentiable_graph,
                                       leaf_extend_idx_pair, leaf_nonleaf_lst,
                                       topk, epsilon):
    '''
    Enumerate delete / replace / add edits of the leaf substructures of
    ``origin_smiles``. Candidate substructures are chosen greedily with
    probability ``epsilon`` and by weighted random sampling otherwise.

    origin_smiles: SMILES string. It is decomposed with smiles2graph, which yields
        origin_idx_lst              [N]      0,1,...,d-1
        origin_node_mat             [N,d]
        origin_substructure_lst     per substructure an int (single atom) or a
                                    list of atom indices (ring)
        origin_atomidx_2substridx
        origin_adjacency_matrix     [N,N]    0/1
        leaf_extend_idx_pair        (leaf_idx, extend_idx) pairs

    differentiable_graph: (node_indicator, adjacency_weight)
        node_indicator              [N+M,d]
        adjacency_weight            [N+M,N+M]

    N is # of substructures in the molecule
    M is # of leaf node, also number of extended node.
    d is len(vocabulary).

    leaf_extend_idx_pair: ignored -- overwritten by the pairs that
        smiles2graph(origin_smiles) returns.
    leaf_nonleaf_lst: (leaf_idx, nonleaf_idx) pairs; a leaf whose deletion
        succeeds must be listed here (KeyError otherwise).
    topk: maximum number of vocabulary entries tried per node.
    epsilon: probability of taking the ``topk`` highest-scoring entries;
        otherwise ``topk + 3`` entries are drawn with random.choices using the
        node_indicator row as weights, de-duplicated, and cut to ``topk``.

    main utility
        add_atom_at_position
        add_fragment_at_position
        delete_substructure_at_idx
        REPLACE = delete + add

    Output:
        new_smiles_set, with None entries removed.
    '''
    leaf2nonleaf = {leaf: nonleaf for leaf, nonleaf in leaf_nonleaf_lst}
    new_smiles_set = set()

    #### 1. data preparation
    origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
    # The caller-supplied leaf_extend_idx_pair is replaced by the recomputed one.
    (_, _, origin_substructure_lst, _,
     origin_adjacency_matrix, leaf_extend_idx_pair) = smiles2graph(origin_smiles)
    node_indicator, adjacency_weight = differentiable_graph
    vocabulary_indices = list(range(len(vocabulary)))

    def _as_atom_list(substructure):
        # A substructure is stored as a bare int (single atom) or a list of ints.
        return [substructure] if isinstance(substructure, int) else substructure

    def _atoms_to_delete(leaf_idx):
        # Single atom: delete it. Ring: delete only the atoms that no other
        # substructure uses (ring1 and ring2 may share 2 atoms and 1 bond).
        leaf_atoms = origin_substructure_lst[leaf_idx]
        if isinstance(leaf_atoms, int):
            return [leaf_atoms]
        shared = set()
        for i, other in enumerate(origin_substructure_lst):
            if i != leaf_idx:
                shared.update(_as_atom_list(other))
        return [atom_idx for atom_idx in leaf_atoms if atom_idx not in shared]

    def _pick_substructures(scores):
        # One uniform draw decides between greedy top-k and weighted sampling.
        # NOTE(review): random.choices needs non-negative weights -- this
        # presumes the node_indicator rows are already probabilities; confirm.
        if random.random() < epsilon:
            return list(np.argsort(-scores))[:topk]  ### topk (greedy)
        drawn = random.choices(population=vocabulary_indices, weights=scores, k=topk + 3)
        return list(set(drawn))[:topk]  ### avoid repetition

    def _attach(editmol, position_idx, substructure_idx, new_bond):
        # Bond one vocabulary entry onto `editmol` at `position_idx` and collect
        # the resulting SMILES (a single value for atoms, a batch for fragments).
        new_substructure = vocabulary[substructure_idx]
        if ith_substructure_is_atom(substructure_idx):
            new_smiles_set.add(
                add_atom_at_position(editmol=editmol, position_idx=position_idx,
                                     new_atom=new_substructure, new_bond=new_bond))
        else:
            new_smiles_set.update(
                add_fragment_at_position(editmol=editmol, position_idx=position_idx,
                                         fragment=new_substructure, new_bond=new_bond))

    #### 2. edit the original molecule
    ####### 2.1 delete & 2.2 replace
    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        result = delete_substructure_at_idx(editmol=origin_mol,
                                            atom_idx_lst=_atoms_to_delete(leaf_idx))
        if result is None:
            continue
        delete_mol, old_idx2new_idx = result
        delete_smiles = Chem.MolToSmiles(delete_mol)
        if delete_smiles is None or '.' in delete_smiles:
            continue  # deletion split the molecule into several pieces
        delete_smiles = canonical(delete_smiles)
        nonleaf_idx = leaf2nonleaf[leaf_idx]
        # Mean of the two directed edge scores between the leaf and its parent.
        shrink_prob = (adjacency_weight[leaf_idx, nonleaf_idx] + adjacency_weight[nonleaf_idx, leaf_idx]) / 2
        if shrink_prob > -3:  ### threshold on the raw score; sigmoid(-3) is about 0.047
            new_smiles_set.add(delete_smiles)
        #### 2.1 delete done

        #### 2.2 replace  a & b
        ######### (a) get neighbor substr
        neighbor_substructures_idx = [idx for idx, value in enumerate(origin_adjacency_matrix[leaf_idx])
                                      if value == 1]
        assert len(neighbor_substructures_idx) == 1  # a leaf has exactly one neighbor
        neighbor_atom_idx_lst = _as_atom_list(origin_substructure_lst[neighbor_substructures_idx[0]])
        ######### (b) add new substructure  todo, enumerate several possibility
        for substructure_idx in _pick_substructures(node_indicator[leaf_idx]):
            for new_bond in bondtype_list:
                for neighbor_atom_idx in neighbor_atom_idx_lst:
                    # Atom indices shift after deletion; translate to delete_mol's numbering.
                    _attach(delete_mol, old_idx2new_idx[neighbor_atom_idx], substructure_idx, new_bond)

        # NOTE(review): this is an early *return*, not a `continue`: the first
        # leaf whose mean leaf<->extension score is below -3 ends the whole
        # enumeration, skipping the remaining leaves and step 2.3. The nesting
        # of this check was reconstructed from whitespace-stripped source --
        # confirm that it belongs inside the per-leaf loop.
        expand_prob = (adjacency_weight[leaf_idx, extend_idx] + adjacency_weight[extend_idx, leaf_idx]) / 2
        if expand_prob < -3:
            new_smiles_set.discard(None)
            return new_smiles_set

    ####### 2.3 add   todo: use adjacency_weight to further narrow scope
    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        expand_prob = (adjacency_weight[leaf_idx][extend_idx] + adjacency_weight[extend_idx][leaf_idx]) / 2  ### [-inf, inf]
        if expand_prob < -3:
            continue
        for leaf_atom_idx in _as_atom_list(origin_substructure_lst[leaf_idx]):
            # A fresh greedy-vs-sample decision is made for every leaf atom.
            for substructure_idx in _pick_substructures(node_indicator[extend_idx]):
                for new_bond in bondtype_list:
                    _attach(origin_mol, leaf_atom_idx, substructure_idx, new_bond)

    new_smiles_set.discard(None)
    return new_smiles_set
1161 |
1162 |
1163 |
def differentiable_graph2smiles_sample_v2(origin_smiles, differentiable_graph,
                                          leaf_extend_idx_pair, leaf_nonleaf_lst,
                                          topk, epsilon):
    '''
    Enumerate delete / replace / add edits of the leaf substructures of
    ``origin_smiles`` stochastically: deletions are kept with a probability
    derived from the edge scores, and candidate substructures are picked from
    a penalised softmax over the node scores.

    origin_smiles: SMILES string. It is decomposed with smiles2graph, which yields
        origin_idx_lst              [N]      0,1,...,d-1
        origin_node_mat             [N,d]
        origin_substructure_lst     per substructure an int (single atom) or a
                                    list of atom indices (ring)
        origin_atomidx_2substridx
        origin_adjacency_matrix     [N,N]    0/1
        leaf_extend_idx_pair        (leaf_idx, extend_idx) pairs

    differentiable_graph: (node_indicator, adjacency_weight), both np.array
        node_indicator              [N+M,d]     scores before softmax
        adjacency_weight            [N+M,N+M]
      Neither array is modified by this function.

    N is # of substructures in the molecule
    M is # of leaf node, also number of extended node.
    d is len(vocabulary).

    leaf_extend_idx_pair: ignored -- overwritten by the pairs that
        smiles2graph(origin_smiles) returns.
    leaf_nonleaf_lst: (leaf_idx, nonleaf_idx) pairs; a leaf whose deletion
        succeeds must be listed here (KeyError otherwise).
    topk: maximum number of vocabulary entries tried per node.
    epsilon: probability of taking the ``topk`` most probable entries;
        otherwise ``topk + 3`` entries are sampled from the distribution,
        de-duplicated, and cut to ``topk``.

    main utility
        add_atom_at_position
        add_fragment_at_position
        delete_substructure_at_idx
        REPLACE = delete + add

    Output:
        new_smiles_set, with None entries removed.
    '''
    leaf2nonleaf = {leaf: nonleaf for leaf, nonleaf in leaf_nonleaf_lst}
    new_smiles_set = set()

    #### 1. data preparation
    origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
    # The caller-supplied leaf_extend_idx_pair is replaced by the recomputed one.
    (_, _, origin_substructure_lst, _,
     origin_adjacency_matrix, leaf_extend_idx_pair) = smiles2graph(origin_smiles)
    node_indicator, adjacency_weight = differentiable_graph  #### both are np.array
    vocabulary_indices = list(range(len(vocabulary)))

    def _as_atom_list(substructure):
        # A substructure is stored as a bare int (single atom) or a list of ints.
        return [substructure] if isinstance(substructure, int) else substructure

    def _atoms_to_delete(leaf_idx):
        # Single atom: delete it. Ring: delete only the atoms that no other
        # substructure uses (ring1 and ring2 may share 2 atoms and 1 bond).
        leaf_atoms = origin_substructure_lst[leaf_idx]
        if isinstance(leaf_atoms, int):
            return [leaf_atoms]
        shared = set()
        for i, other in enumerate(origin_substructure_lst):
            if i != leaf_idx:
                shared.update(_as_atom_list(other))
        return [atom_idx for atom_idx in leaf_atoms if atom_idx not in shared]

    def _substructure_distribution(row_idx):
        # Softmax over one node_indicator row after lowering the scores of
        # vocabulary entries 12 and up by 5.
        # NOTE(review): presumably entries 0-11 are single atoms and the rest
        # are rings, so this biases the choice towards atoms -- confirm.
        # The row is copied first: indexing a NumPy array yields a view, so the
        # in-place subtraction used to modify the caller's node_indicator and,
        # in the add loop, stacked the penalty once per leaf atom.
        logits = np.array(node_indicator[row_idx], dtype=float)
        logits[12:] -= 5
        logits -= logits.max()  # softmax is shift-invariant; avoids overflow in exp
        weights = np.exp(logits)
        return weights / np.sum(weights)

    def _pick_substructures(probabilities):
        # One uniform draw decides between greedy top-k and weighted sampling.
        if random.random() < epsilon:
            return list(np.argsort(-probabilities))[:topk]  ### topk (greedy)
        drawn = random.choices(population=vocabulary_indices, weights=probabilities, k=topk + 3)
        return list(set(drawn))[:topk]  ### avoid repetition

    def _attach(editmol, position_idx, substructure_idx, new_bond):
        # Bond one vocabulary entry onto `editmol` at `position_idx` and collect
        # the resulting SMILES (a single value for atoms, a batch for fragments).
        new_substructure = vocabulary[substructure_idx]
        if ith_substructure_is_atom(substructure_idx):
            new_smiles_set.add(
                add_atom_at_position(editmol=editmol, position_idx=position_idx,
                                     new_atom=new_substructure, new_bond=new_bond))
        else:
            new_smiles_set.update(
                add_fragment_at_position(editmol=editmol, position_idx=position_idx,
                                         fragment=new_substructure, new_bond=new_bond))

    #### 2. edit the original molecule
    ####### 2.1 delete & 2.2 replace
    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        result = delete_substructure_at_idx(editmol=origin_mol,
                                            atom_idx_lst=_atoms_to_delete(leaf_idx))
        if result is None:
            continue
        delete_mol, old_idx2new_idx = result
        delete_smiles = Chem.MolToSmiles(delete_mol)
        if delete_smiles is None or '.' in delete_smiles:
            continue  # deletion split the molecule into several pieces
        delete_smiles = canonical(delete_smiles)
        nonleaf_idx = leaf2nonleaf[leaf_idx]
        u = random.random()
        # NOTE(review): the sum of two sigmoids lies in (0, 2) and is not
        # halved, unlike expand_prob below; kept as written -- confirm intent.
        shrink_prob = sigmoid(adjacency_weight[leaf_idx, nonleaf_idx]) + sigmoid(adjacency_weight[nonleaf_idx, leaf_idx])
        if u < shrink_prob:
            new_smiles_set.add(delete_smiles)
        #### 2.1 delete done

        #### 2.2 replace  a & b
        ######### (a) get neighbor substr
        neighbor_substructures_idx = [idx for idx, value in enumerate(origin_adjacency_matrix[leaf_idx])
                                      if value == 1]
        assert len(neighbor_substructures_idx) == 1  # a leaf has exactly one neighbor
        neighbor_atom_idx_lst = _as_atom_list(origin_substructure_lst[neighbor_substructures_idx[0]])
        ######### (b) add new substructure  todo, enumerate several possibility
        for substructure_idx in _pick_substructures(_substructure_distribution(leaf_idx)):
            for new_bond in bondtype_list:
                for neighbor_atom_idx in neighbor_atom_idx_lst:
                    # Atom indices shift after deletion; translate to delete_mol's numbering.
                    _attach(delete_mol, old_idx2new_idx[neighbor_atom_idx], substructure_idx, new_bond)

        # Mean of the two directed leaf<->extension probabilities. The original
        # `sigmoid(a) + sigmoid(b)/2` halved only the second term.
        # NOTE(review): this is an early *return*, not a `continue`: once the
        # draw exceeds expand_prob the whole enumeration ends, skipping the
        # remaining leaves and step 2.3. The nesting of this check was
        # reconstructed from whitespace-stripped source -- confirm that it
        # belongs inside the per-leaf loop.
        expand_prob = (sigmoid(adjacency_weight[leaf_idx, extend_idx]) + sigmoid(adjacency_weight[extend_idx, leaf_idx])) / 2
        u = random.random()
        if u > expand_prob:
            new_smiles_set.discard(None)
            return new_smiles_set

    ####### 2.3 add   todo: use adjacency_weight to further narrow scope
    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        expand_prob = (adjacency_weight[leaf_idx][extend_idx] + adjacency_weight[extend_idx][leaf_idx]) / 2  ### [-inf, inf]
        if expand_prob < -3:
            continue
        # The distribution depends only on the extension node: build it once per leaf.
        extend_distribution = _substructure_distribution(extend_idx)
        for leaf_atom_idx in _as_atom_list(origin_substructure_lst[leaf_idx]):
            # A fresh greedy-vs-sample decision is made for every leaf atom.
            for substructure_idx in _pick_substructures(extend_distribution):
                for new_bond in bondtype_list:
                    _attach(origin_mol, leaf_atom_idx, substructure_idx, new_bond)

    new_smiles_set.discard(None)
    return new_smiles_set
1313 |
1314 |
def differentiable_graph_to_smiles_purely_randomwalk(origin_smiles, differentiable_graph,
                                                     leaf_extend_idx_pair, leaf_nonleaf_lst,
                                                     topk = 3, epsilon = 0.7,):
    """Propose edited molecules around ``origin_smiles`` using purely random choices.

    Two passes are made over the leaf substructures returned by
    ``smiles2graph(origin_smiles)``:

    1. delete / replace: for each leaf, with probability 0.7 (and only when the
       molecule has more than one substructure) the leaf is deleted and the
       shrunk molecule is emitted.  Otherwise, with probability 0.3 the leaf is
       left alone in this pass; else the leaf is deleted and ``topk`` randomly
       drawn vocabulary entries are attached, with every bond type, to every
       atom of the leaf's single neighbouring substructure.
    2. add: for every atom of every leaf, ``topk`` randomly drawn vocabulary
       entries are attached to the original molecule with every bond type.

    Args:
        origin_smiles: SMILES string of the molecule to edit.
        differentiable_graph: accepted for signature compatibility; not used.
        leaf_extend_idx_pair: accepted for signature compatibility; the pairs
            actually used are recomputed by ``smiles2graph(origin_smiles)``.
        leaf_nonleaf_lst: accepted for signature compatibility; not used.
        topk: number of vocabulary entries drawn (with replacement) per site.
        epsilon: accepted for signature compatibility; not used.

    Returns:
        Set of the values produced by the edit helpers, with ``None`` removed.
    """
    new_smiles_set = set()

    def attach(editmol, position_idx, substructure_idx, new_bond):
        # Bond one vocabulary entry (single atom or fragment) at ``position_idx``
        # and collect whatever the corresponding helper returns.
        new_substructure = vocabulary[substructure_idx]
        if ith_substructure_is_atom(substructure_idx):
            new_smiles_set.add(add_atom_at_position(editmol = editmol, position_idx = position_idx,
                                                    new_atom = new_substructure, new_bond = new_bond))
        else:
            new_smiles_set.update(add_fragment_at_position(editmol = editmol, position_idx = position_idx,
                                                           fragment = new_substructure, new_bond = new_bond))

    def random_substructures():
        # ``topk`` uniform draws (with replacement) over vocabulary indices.
        return [random.randrange(len(vocabulary)) for _ in range(topk)]

    #### 1. data preparation
    origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
    _, _, origin_substructure_lst, _, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles)
    # Loop invariant: shrinking is only allowed when something would remain.
    can_shrink = substr_num(origin_smiles) > 1

    #### 2. edit the original molecule
    ####### 2.1 delete & 2.2 replace
    for leaf_idx, _ in leaf_extend_idx_pair:
        if random.random() < 0.7 and can_shrink:
            shrink = True
        elif random.random() < 0.3:
            continue  ### leaf only takes part in the add pass (2.3)
        else:
            shrink = False

        leaf_atoms = origin_substructure_lst[leaf_idx]
        if isinstance(leaf_atoms, int):  ### single atom
            atoms_to_delete = [leaf_atoms]
        else:  #### ring
            ### A ring may share atoms with another substructure (e.g. two fused
            ### rings sharing 2 atoms and 1 bond); only delete the atoms that
            ### belong to this leaf alone.
            atoms_kept_elsewhere = set()
            for other_idx, other_atoms in enumerate(origin_substructure_lst):
                if other_idx == leaf_idx:
                    continue
                if isinstance(other_atoms, int):
                    atoms_kept_elsewhere.add(other_atoms)
                else:
                    atoms_kept_elsewhere.update(other_atoms)
            atoms_to_delete = [atom_idx for atom_idx in leaf_atoms if atom_idx not in atoms_kept_elsewhere]

        result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = atoms_to_delete)
        if result is None:
            continue
        delete_mol, old_idx2new_idx = result
        delete_smiles = Chem.MolToSmiles(delete_mol)
        if delete_smiles is None or '.' in delete_smiles:  ### '.' means the molecule fell apart
            continue

        if shrink:
            #### 2.1 delete done
            new_smiles_set.add(canonical(delete_smiles))
            continue

        #### 2.2 replace
        ######### (a) get the neighbouring substructure the leaf was attached to
        neighbor_substructures_idx = [idx for idx, value in enumerate(origin_adjacency_matrix[leaf_idx]) if value == 1]
        assert len(neighbor_substructures_idx) == 1
        neighbor_atoms = origin_substructure_lst[neighbor_substructures_idx[0]]
        if isinstance(neighbor_atoms, int):
            neighbor_atoms = [neighbor_atoms]
        ######### (b) attach randomly chosen substructures where the leaf used to be
        for substructure_idx in random_substructures():
            for new_bond in bondtype_list:
                for atom_idx in neighbor_atoms:
                    attach(delete_mol, old_idx2new_idx[atom_idx], substructure_idx, new_bond)

    ####### 2.3 add    todo: use adjacency_weight to further narrow scope
    for leaf_idx, _ in leaf_extend_idx_pair:
        leaf_atoms = origin_substructure_lst[leaf_idx]
        if isinstance(leaf_atoms, int):  ### int: single atom; else: list of integer
            leaf_atoms = [leaf_atoms]
        for leaf_atom_idx in leaf_atoms:
            for substructure_idx in random_substructures():
                for new_bond in bondtype_list:
                    attach(origin_mol, leaf_atom_idx, substructure_idx, new_bond)

    return new_smiles_set - {None}
1425 |
1426 |
1427 |
1428 |
def differentiable_graph2smiles_plus_random(origin_smiles, differentiable_graph,
                                            leaf_extend_idx_pair, leaf_nonleaf_lst,
                                            max_num_offspring = 100, topk = 3, epsilon = 0.7,
                                            random_topology = False, random_substr = False):
    '''
        Propose edited molecules around origin_smiles, mixing the learned
        (differentiable) graph with optional random choices.

        For every leaf substructure returned by smiles2graph(origin_smiles)
        exactly one action is chosen, independently of the other leaves:
            expand : attach new substructures to the atoms of the leaf
            replace: delete the leaf and attach new substructures to the atoms
                     of its single neighbouring substructure
        Shrinking (delete only) is disabled, as it was in the previous
        implementation (`if False:`).

        origin_smiles:
            origin_idx_lst                  [N]         0,1,...,d-1
            origin_node_mat                 [N,d]
            origin_substructure_lst
            origin_atomidx_2substridx
            origin_adjacency_matrix         [N,N]       0/1

        differentiable_graph:   returned results
            node_indicator                  [N+M,d]
            adjacency_weight                [N+M,N+M]

        N is # of substructures in the molecule
        M is # of leaf node, also number of extended node.

        leaf_extend_idx_pair: accepted for signature compatibility; the pairs
            actually used are recomputed by smiles2graph(origin_smiles).
        leaf_nonleaf_lst, max_num_offspring: accepted for signature
            compatibility; not used.
        topk: number of candidate substructures per attachment site.
        epsilon: probability of taking the top-k entries of node_indicator
            instead of sampling with node_indicator as weights
            (only when random_substr is False).
        random_topology: if True, expand with probability 0.8 and replace
            otherwise; if False, expand when a uniform draw is below the mean
            of the two adjacency_weight entries linking the leaf and its
            extension node, and replace otherwise.
        random_substr: if True, candidates are drawn uniformly from the
            vocabulary instead of from node_indicator.

        main utility
            add_atom_at_position
            add_fragment_at_position
            delete_substructure_at_idx
        REPLACE = delete + add

        Output:
            new_smiles_set (None entries removed)
    '''
    new_smiles_set = set()

    #### 1. data preparation
    origin_mol = Chem.rdchem.RWMol(Chem.MolFromSmiles(origin_smiles))
    _, _, origin_substructure_lst, _, origin_adjacency_matrix, leaf_extend_idx_pair = smiles2graph(origin_smiles)
    node_indicator, adjacency_weight = differentiable_graph

    def attach(editmol, position_idx, substructure_idx, new_bond):
        # Bond one vocabulary entry (single atom or fragment) at ``position_idx``
        # and collect whatever the corresponding helper returns.
        new_substructure = vocabulary[substructure_idx]
        if ith_substructure_is_atom(substructure_idx):
            new_smiles_set.add(add_atom_at_position(editmol = editmol, position_idx = position_idx,
                                                    new_atom = new_substructure, new_bond = new_bond))
        else:
            new_smiles_set.update(add_fragment_at_position(editmol = editmol, position_idx = position_idx,
                                                           fragment = new_substructure, new_bond = new_bond))

    def sample_substructures(node_idx):
        # Candidate vocabulary indices for graph node ``node_idx``.
        if random_substr:  ## random sample
            return random.choices(list(range(len(vocabulary))), k=topk)
        ## dmg sampling
        if random.random() < epsilon:
            return list(np.argsort(-node_indicator[node_idx]))[:topk]  ### topk (greedy)
        sampled = random.choices(population=list(range(len(vocabulary))),
                                 weights = node_indicator[node_idx], k=topk + 3)
        return list(set(sampled))[:topk]  ### avoid repetition

    for leaf_idx, extend_idx in leaf_extend_idx_pair:
        #### 2. topology: decided afresh for every leaf so that one leaf's
        #### outcome does not leak into the following ones.
        if random_topology:
            expand = not (random.random() < 0.2)
        else:  ## dmg topology
            ### NOTE(review): the raw weight is compared with a uniform draw
            ### (no sigmoid), exactly as before -- confirm this is intended.
            expand_prob = (adjacency_weight[leaf_idx,extend_idx] + adjacency_weight[extend_idx,leaf_idx])/2
            expand = random.random() < expand_prob

        leaf_atoms = origin_substructure_lst[leaf_idx]

        if expand:
            if isinstance(leaf_atoms, int):  ### int: single atom; else: list of integer
                leaf_atoms = [leaf_atoms]
            for leaf_atom_idx in leaf_atoms:
                for substructure_idx in sample_substructures(extend_idx):
                    for new_bond in bondtype_list:
                        attach(origin_mol, leaf_atom_idx, substructure_idx, new_bond)
            continue

        #### replace = delete the leaf, then add at its former attachment point
        if isinstance(leaf_atoms, int):  ### single atom
            atoms_to_delete = [leaf_atoms]
        else:  #### ring
            ### A ring may share atoms with another substructure (e.g. two fused
            ### rings sharing 2 atoms and 1 bond); only delete the atoms that
            ### belong to this leaf alone.
            atoms_kept_elsewhere = set()
            for other_idx, other_atoms in enumerate(origin_substructure_lst):
                if other_idx == leaf_idx:
                    continue
                if isinstance(other_atoms, int):
                    atoms_kept_elsewhere.add(other_atoms)
                else:
                    atoms_kept_elsewhere.update(other_atoms)
            atoms_to_delete = [atom_idx for atom_idx in leaf_atoms if atom_idx not in atoms_kept_elsewhere]

        result = delete_substructure_at_idx(editmol = origin_mol, atom_idx_lst = atoms_to_delete)
        if result is None:
            continue
        delete_mol, old_idx2new_idx = result
        delete_smiles = Chem.MolToSmiles(delete_mol)
        if delete_smiles is None or '.' in delete_smiles:  ### '.' means the molecule fell apart
            continue

        ######### (a) get the neighbouring substructure the leaf was attached to
        neighbor_substructures_idx = [idx for idx, value in enumerate(origin_adjacency_matrix[leaf_idx]) if value == 1]
        assert len(neighbor_substructures_idx) == 1
        neighbor_atoms = origin_substructure_lst[neighbor_substructures_idx[0]]
        if isinstance(neighbor_atoms, int):
            neighbor_atoms = [neighbor_atoms]
        ######### (b) add new substructure      todo, enumerate several possibility
        for substructure_idx in sample_substructures(leaf_idx):
            for new_bond in bondtype_list:
                for atom_idx in neighbor_atoms:
                    attach(delete_mol, old_idx2new_idx[atom_idx], substructure_idx, new_bond)

    return new_smiles_set - {None}
1589 |
1590 |
def draw_smiles(smiles, figfile_name):
    """Parse ``smiles`` with RDKit and write a 300x180 image of it to ``figfile_name``."""
    Draw.MolToImageFile(Chem.MolFromSmiles(smiles), figfile_name, size = (300,180))
1595 |
1596 |
1597 |
1598 |
1599 |
if __name__ == "__main__":
    # Manual smoke test: draw one sample molecule into figure/tmp.png.
    # Alternative sample: 'FC1NCC(-C1=CC=CC(Br)=C1)C1'
    s = 'C1=CC=CC=C1NC2=NC=CC(F)=N2'
    draw_smiles(smiles = s, figfile_name = "figure/tmp.png")
1605 | # rawdata_file = "raw_data/zinc.tab"
1606 | # with open(rawdata_file) as fin:
1607 | # lines = fin.readlines()[1:]
1608 | # smiles_lst = [line.strip().strip('"') for line in lines]
1609 |
1610 | # from random import shuffle
1611 | # # shuffle(smiles_lst)
1612 | # fragment_lst = ['C1NCC1', 'C1CNCCN1', 'C1=CC=CC=C1', 'C1CNNC1']
1613 |
1614 |
1615 | # smiles = smiles_lst[0]
1616 | # differentiable_graph = smiles2differentiable_graph(smiles)
1617 | # ### optimize differentiable_graph using GNN
1618 | # smiles_set = differentiable_graph2smiles(origin_smiles = smiles, differentiable_graph = differentiable_graph, max_num_offspring = 100)
1619 | # print(len(smiles_set))
1620 |
1621 | # s = "CCc1ccc(Nc2nc(-c3ccccc3)cs2)cc1"
1622 | # s = 'Oc1ccc(Nc2nc(-c3ccc(Cl)cc3)cs2)cc1'
1623 | # draw_smiles(s, "figure/tmp.png")
1624 | # from tdc import Oracle
1625 | # qed = Oracle('qed')
1626 | # logp = Oracle('logp')
1627 | # jnk = Oracle('jnk3')
1628 | # gsk = Oracle('gsk3b')
1629 | # print(qed(s), logp(s), jnk(s), gsk(s))
1630 |
1631 |
1632 | # smiles_lst = ['NO', 'ONO', 'CNO', 'CS']
1633 | # print(similarity_matrix(smiles_lst))
1634 |
1635 |
1636 |
1637 | ##### test over zinc
1638 | # for smiles in tqdm(smiles_lst):
1639 | # mol = Chem.MolFromSmiles(smiles)
1640 | # print(smiles)
1641 | # new_smiles_lst = []
1642 | # for idx in range(mol.GetNumAtoms()):
1643 | # for fragment in fragment_lst:
1644 | # smiles_set = add_fragment_at_position(editmol = mol, position_idx = idx, fragment = fragment, new_bond = bondtype_list[0])
1645 | # new_smiles_lst.extend(list(smiles_set))
1646 | # new_smiles_lst = list(set(new_smiles_lst))
1647 | # print("length of smiles set is", len(new_smiles_lst))
1648 |
1649 |
1650 |
1651 | ### single test
1652 | # smiles = 'CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1'
1653 | # draw_smiles(smiles, "figure/origin.png")
1654 | # fragment = 'C1CCNCN1'
1655 | # mol = Chem.MolFromSmiles(smiles)
1656 | # for idx in range(mol.GetNumAtoms()):
1657 | # smiles_set = add_fragment_at_position(editmol = mol, position_idx = idx, fragment = fragment, new_bond = bondtype_list[0])
1658 | # print("length of smiles set is", len(smiles_set), smiles_set)
1659 | # for i,smiles in enumerate(smiles_set):
1660 | # name = "figure/" + str(idx) + '_' + str(i) + '.png'
1661 | # draw_smiles(smiles, name)
1662 |
1663 |
1664 |
1665 |
1666 | '''
1667 |
1668 | "CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1",
1669 | "C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1",
1670 | "N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)cc2)cc1",
1671 | "CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c2CCCCC3)C1",
1672 | "N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#N)C12CCCCC2",
1673 | "CC[NH+](CC)[C@](C)(CC)[C@H](O)c1cscc1Br"
1674 |
1675 | CCc1ccc(Nc2nc(-c3ccccc3)cs2)cc1
1676 |
1677 |
1678 |
1679 | rawdata_file = "raw_data/zinc.tab"
1680 | with open(rawdata_file) as fin:
1681 | lines = fin.readlines()[1:]
1682 | smiles_lst = [line.strip().strip('"') for line in lines]
1683 |
1684 |
1685 |
1686 | test case:
1687 |
1688 | smiles fragment
1689 | C1CCCC1 C1NCC1
1690 | C1=CC=CC=C1 C1CNCCN1
1691 | C1=CC=CC=C1 C1CCNCN1
1692 | CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1
1693 | '''
1694 |
1695 |
1696 |
--------------------------------------------------------------------------------