├── README.md ├── .gitignore ├── dnn_model ├── pk_queue.py ├── PickOutTop1000.py ├── single_vs_chemdiv.py ├── dnn_model.py ├── pk_eval.py ├── single_train.py ├── pk_virtual_screen.py ├── single_eval.py ├── pk_train.py ├── single_vs.py └── pk_input.py ├── data_files ├── chembl_cal_fp.py ├── chembl_cal_mask.py ├── chembl_input.py ├── 3_chembl_analyse_fp.ipynb ├── 3_fingerprint_analyse_additional.ipynb └── chembl_preparation.ipynb ├── rf_model ├── chembl_rf_vs.py └── chembl_rf.py └── LICENSE /README.md: -------------------------------------------------------------------------------- 1 | # Abstract 2 | Author: xiaotaw@qq.com (Any bug report is welcome) 3 | 4 | Time Created: Aug 2016 5 | 6 | Time Updated: Dec 2016 7 | 8 | Addr: Shenzhen, China 9 | 10 | Description: We explore ChEMBL inhibitors using deep neural networks 11 | 12 | Website: https://xiaotaw.github.io/chembl/ 13 | 14 | 15 | # Background 16 | (add background for using DNN and RF to build this qsar model) 17 | 18 | # Problem 19 | (add one sentence abstract for current challenge) 20 | 21 | # Solution 22 | (how we solve the problem) 23 | 24 | # Method 25 | 26 | ## 1 get data 27 | 1.1 the positive dataset was downloaded from the ChEMBL database 28 | 1.2 the negative dataset was selected from the PubChem and ChEMBL databases (based on the reasonable assumption that almost no compound in PubChem is a substrate of a given protein kinase) 29 | 30 | ## 2 build the model 31 | 2.1 deep neural network (based on TensorFlow) 32 | 2.2 random forest (based on scikit-learn) 33 | 2.3 a 'Tree' comprises one 'Term' and several 'Branches', where the 'Term' extracts the features shared by all the protein kinases (see dnn_model/dnn_model.py: term() builds the shared trunk, and branch() adds a per-target softmax head). 34 | 35 | ## 3 train and evaluation 36 | 3.1 we train the models separately and jointly, and then apply them to the PubChem dataset for virtual screening. 37 | 38 | 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | .venv/ 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | 92 | 93 | # custom ignores 94 | # 95 | 96 | !.gitignore 97 | 98 | # log files 99 | log_files/ 100 | *.log 101 | 102 | # pred files 103 | pred_files/ 104 | *.pred 105 | 106 | # data files 107 | structure_files/ 108 | txt_files/ 109 | csv_files/ 110 | mask_files/ 111 | 112 | # ckpt files 113 | ckpt*/ 114 | *ckpt/ 115 | tmp*/ 116 | 117 | # backup files 118 | *.bk 119 | 120 | # edit temp files 121 | *~ 122 | 123 | # rf model files 124 | *.m 125 | 126 | # excel xlsx files 127 | *.xlsx 128 | 129 | # png files 130 | *.png 131 | -------------------------------------------------------------------------------- /dnn_model/pk_queue.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Oct 2016 3 | # Time Last Updated: Oct 2016 4 | # Addr: Shenzhen, China 5 | # Description: use multiple threads to load input data and generate batches. 6 | 7 | import time 8 | 9 | import Queue 10 | import threading 11 | 12 | import pk_input as pki 13 | 14 | 15 | 16 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 17 | target = target_list[0] 18 | d = pki.Datasets(target_list) 19 | 20 | 21 | # using queue 22 | 23 | # producer thread 24 | class Producer(threading.Thread): 25 | def __init__(self, t_name, d, queue): 26 | threading.Thread.__init__(self, name=t_name) 27 | self.queue = queue 28 | self.d = d 29 | def run(self): 30 | for i in range(10): 31 | t0 = time.time() 32 | batch = self.d.next_train_batch(target, 100, 200) 33 | t1 = time.time() 34 | #print("%s: %s generate batch with neg_begin=%d %5.3f" % (time.ctime(), self.getName(), self.d.neg.train_begin, t1-t0)) 35 | self.queue.put(batch, block=True, timeout=None) 36 | time.sleep(0.5) 37 | #print("%s: %s finished!" % (time.ctime(), self.getName())) 38 | 39 | # consumer thread 40 | class Consumer(threading.Thread): 41 | def __init__(self, t_name, queue): 42 | threading.Thread.__init__(self, name=t_name) 43 | self.queue = queue 44 | def run(self): 45 | while True: 46 | try: 47 | t0 = time.time() 48 | batch = self.queue.get(block=True, timeout=5) 49 | time.sleep(0.5) 50 | t1 = time.time() 51 | #print("%s: %s generate batch %5.3f" % (time.ctime(), self.getName(), t1-t0)) 52 | except: 53 | #print("%s: %s finished!" 
% (time.ctime(), self.getName())) 54 | break 55 | 56 | 57 | if __name__ == "__main__": 58 | queue = Queue.Queue(50) 59 | pro_list = [] 60 | for i in range(10): 61 | pro_list.append(Producer("Pro%d" % i, d, queue)) 62 | 63 | con = Consumer("Con", queue) 64 | 65 | for pro in pro_list: 66 | pro.start() 67 | 68 | con.start() 69 | 70 | for pro in pro_list: 71 | pro.join() 72 | 73 | con.join() 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /dnn_model/PickOutTop1000.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[16]: 5 | 6 | import pandas as pd 7 | from rdkit import Chem 8 | from rdkit.Chem import Draw 9 | 10 | 11 | # In[17]: 12 | 13 | target_list = ["CHEMBL203", "CHEMBL204", "CHEMBL235", "CHEMBL236", 14 | "CHEMBL244", "CHEMBL260", "CHEMBL4805", "CHEMBL4822"] 15 | 16 | g_step_list = [2161371, 2236500, 2086841, 2236500, 17 | 2161951, 2252100, 2168041, 1936221] 18 | 19 | # In[18]: 20 | 21 | ChemDiv_dir = "/raid/xiaotaw/ChemDiv/" 22 | fn_list = ["DC01_350000.sdf", "DC02_350000.sdf", 23 | "DC03_222773.sdf", "DC_saltdata_not-available_124145.sdf", 24 | "IC_non-excl_82693.sdf", "NC_340320.sdf"] 25 | #sup0 = Chem.SDMolSupplier(ChemDiv_dir + fn_list[0]) 26 | #ms0 = [x for x in sup0 if x is not None] 27 | 28 | 29 | # In[19]: 30 | 31 | #sup1 = Chem.SDMolSupplier(ChemDiv_dir + fn_list[1]) 32 | #ms1 = [x for x in sup1 if x is not None] 33 | #sup2 = Chem.SDMolSupplier(ChemDiv_dir + fn_list[2]) 34 | #ms2 = [x for x in sup2 if x is not None] 35 | 36 | 37 | # In[20]: 38 | 39 | i = 7 40 | target = target_list[i] 41 | g_step = g_step_list[i] 42 | pred_dir = "/home/scw4750/Documents/chembl/dnn_model/pred_files/%s/" % target 43 | pred_fn = pred_dir + "vs_chemdiv_%s_128_0.800_4.000e-03_%d.pred1000" % (target, g_step) 44 | chemdiv_pred = pd.read_csv(pred_fn, sep="\t", index_col=0, names=["id", "pred"]) 45 | #chemdiv_pred 46 | id_list = chemdiv_pred["id"].values 47 | #id_list 48 | 49 | 50 | # In[23]: 51 | 52 | m1000 = [] 53 | for fn in fn_list: 54 | print("start %s" % fn) 55 | sup = Chem.SDMolSupplier(ChemDiv_dir + fn) 56 | for m in sup: 57 | if (m is not None) and (m.GetProp("IDNUMBER") in id_list): 58 | m1000.append(m) 59 | #print(m.GetProp("IDNUMBER")) 60 | print("finished %s" % fn) 61 | 62 | 63 | # In[38]: 64 | 65 | def get_pred_value(id_): 66 | return chemdiv_pred["pred"][chemdiv_pred["id"] == id_].values[0] 67 | 68 | m1000.sort(key=lambda x: get_pred_value(x.GetProp("IDNUMBER")), reverse=True) 69 | 70 | 71 | # In[40]: 72 | 73 | writer = Chem.SDWriter(pred_fn.replace(".pred1000", "_top1000.sdf")) 74 | for m in m1000: 75 | writer.write(m) 76 | 77 | 78 | 79 | # In[ ]: 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /data_files/chembl_cal_fp.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Nov 2016 3 | # Time Last Updated: Dec 2016 4 | # Addr: Shenzhen, China 5 | # Description: 1. calculate atom pair fingerprint(apfp) for chembl molecules 6 | # 2. 
analyse apfp 7 | 8 | import os 9 | import gzip 10 | import numpy as np 11 | from collections import defaultdict 12 | 13 | from rdkit import Chem 14 | from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint 15 | 16 | def dict_2_str(d): 17 | keylist = d.keys() 18 | keylist.sort() 19 | kv_list = ["{}: {}".format(k, d[k]) for k in keylist] 20 | return ", ".join(kv_list) 21 | 22 | """ 23 | ## calculate chembl apfp 24 | # 25 | sup = Chem.SmilesMolSupplier("structure_files/chembl.smiles", delimiter=",", smilesColumn=1, nameColumn=0, titleLine=True) 26 | 27 | if not os.path.exists("fp_files"): 28 | os.mkdir("fp_files") 29 | 30 | apfp_file = open("fp_files/chembl.apfp", "w") 31 | 32 | for m in sup: 33 | if m is None: 34 | continue 35 | id_ = m.GetProp("_Name") 36 | apfps = GetAtomPairFingerprint(Chem.RemoveHs(m)).GetNonzeroElements() 37 | apfp_file.write("%s\t{%s}\n" % (id_, dict_2_str(apfps))) 38 | 39 | apfp_file.close() 40 | 41 | 42 | ## calculate pns apfp 43 | # 44 | sup = Chem.SDMolSupplier("structure_files/pubchem_neg_sample.sdf") 45 | 46 | apfp_file = open("fp_files/pubchem_neg_sample.apfp", "w") 47 | 48 | for m in sup: 49 | if m is None: 50 | continue 51 | id_ = m.GetProp("PUBCHEM_COMPOUND_CID") 52 | apfps = GetAtomPairFingerprint(Chem.RemoveHs(m)).GetNonzeroElements() 53 | apfp_file.write("%s\t{%s}\n" % (id_, dict_2_str(apfps))) 54 | 55 | apfp_file.close() 56 | """ 57 | 58 | ## calculate ChemDiv apfp 59 | ChemDiv_dir = "/raid/xiaotaw/ChemDiv" 60 | fn_list = ["DC01_350000.sdf", "DC02_350000.sdf", "DC03_222773.sdf", "DC_saltdata_not-available_124145.sdf", "IC_non-excl_82693.sdf", "NC_340320.sdf"] 61 | 62 | for fn in fn_list: 63 | gzsup = Chem.SDMolSupplier(ChemDiv_dir + "/" + fn) 64 | molecules = [x for x in gzsup if x is not None] 65 | apfp_file = open(ChemDiv_dir + "/" + fn.replace("sdf", "apfp"), "w") 66 | for mol in molecules: 67 | id_ = mol.GetProp("IDNUMBER") 68 | apfps = GetAtomPairFingerprint(Chem.RemoveHs(mol)).GetNonzeroElements() 69 | apfp_file.write("%s\t{%s}\n" % (id_, dict_2_str(apfps))) 70 | apfp_file.close() 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /rf_model/chembl_rf_vs.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Dec 2016 3 | # Time Last Updated: Dec 2016 4 | # Addr: Shenzhen, China 5 | # Description: 6 | 7 | import os 8 | import sys 9 | import time 10 | import getpass 11 | import numpy as np 12 | from scipy import sparse 13 | from collections import defaultdict 14 | from matplotlib import pyplot as plt 15 | from sklearn.externals import joblib 16 | from sklearn.ensemble import RandomForestClassifier 17 | 18 | sys.path.append("/home/%s/Documents/chembl/data_files/" % getpass.getuser()) 19 | import chembl_input as ci 20 | 21 | # the newly picked-out 15 targets (listed below): 9 targets from 5 big groups, and 6 targets from others. 
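# (added sketch, not part of the original repo) each line of the .apfp files
# written by data_files/chembl_cal_fp.py has the form
#   "<mol_id>\t{<apfp_int>: <count>, ...}"
# and chembl_input presumably parses it back with something equivalent to:
import ast

def parse_apfp_line(line):
  # the tab-separated second field is a valid Python dict literal
  mol_id, fp_str = line.strip().split("\t")
  return mol_id, ast.literal_eval(fp_str)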
22 | target_list = ["CHEMBL279", "CHEMBL203", # Protein Kinases 23 | "CHEMBL217", "CHEMBL253", # GPCRs (Family A) 24 | "CHEMBL235", "CHEMBL206", # Nuclear Hormone Receptors 25 | "CHEMBL240", "CHEMBL4296", # Voltage Gated Ion Channels 26 | "CHEMBL4805", # Ligand Gated Ion Channels 27 | "CHEMBL204", "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 28 | ] 29 | 30 | 31 | target_list = ["CHEMBL206", "CHEMBL217", "CHEMBL235", "CHEMBL240", 32 | "CHEMBL253", "CHEMBL4296", 33 | ] 34 | 35 | def virtual_screening(target): 36 | # input dataset 37 | d = ci.DatasetVS(target) 38 | # read saved rf clf model 39 | clf = joblib.load("model_files/rf_%s.m" % target) 40 | # pred file 41 | pred_dir = "pred_files/%s" % target 42 | if not os.path.exists(pred_dir): 43 | os.mkdir(pred_dir) 44 | for part_num in range(13): 45 | t0 = time.time() 46 | pred_path = os.path.join(pred_dir, "vs_pubchem_%d.pred" % part_num) 47 | predfile = open(pred_path, "w") 48 | fp_dir = "/raid/xiaotaw/pubchem/fp_files/%d" % part_num 49 | for i in range(part_num * 10000000 + 1, (part_num + 1) * 10000000, 25000): 50 | fp_fn = os.path.join(fp_dir, "Compound_{:0>9}_{:0>9}.apfp".format(i, i + 24999)) 51 | if os.path.exists(fp_fn): 52 | d.reset(fp_fn) 53 | features = d.features_dense 54 | pred = clf.predict_proba(features) 55 | for id_, pred_v in zip(d.pubchem_id, pred[:, 1]): 56 | predfile.write("%s\t%f\n" % (id_, pred_v)) 57 | #print("%s\t%d\n" % (fp_fn, pred.shape[0])) 58 | t1 = time.time() 59 | print("%s %d: %.3f" % (target, part_num, t1-t0)) 60 | 61 | 62 | def analyse(target): 63 | vs_pred_file = "pred_files/%s/vs_pubchem.pred" % (target) 64 | if not os.path.exists(vs_pred_file): 65 | os.system("cat pred_files/%s/vs_pubchem_*.pred > pred_files/%s/vs_pubchem.pred" % (target, target)) 66 | aa = np.genfromtxt(vs_pred_file, delimiter="\t") 67 | a = aa[:, 1] 68 | test_pred_file = "pred_files/test_%s.pred" % (target) 69 | bb = np.genfromtxt(test_pred_file, delimiter="\t", usecols=[1,2]) 70 | b = bb[:, 0][bb[:, 1].astype(bool)] 71 | x = [] 72 | y = [] 73 | for i in range(10): 74 | mark = (i + 1) / 20.0 75 | xi = 1.0 * (b > mark).sum() / b.shape[0] 76 | yi = (a > mark).sum() 77 | x.append(xi) 78 | y.append(yi) 79 | plt.plot(x, y, "*") 80 | plt.xlabel("pos yield rate") 81 | plt.ylabel("vs pubchem false pos") 82 | plt.savefig("pred_files/%s/analyse.png" % (target)) 83 | 84 | 85 | target = target_list[int(sys.argv[1])] 86 | virtual_screening(target) 87 | analyse(target) 88 | 89 | 90 | """ 91 | for target in target_list: 92 | virtual_screening(target) 93 | #analyse(target) 94 | """ 95 | -------------------------------------------------------------------------------- /dnn_model/single_vs_chemdiv.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import time 10 | import datetime 11 | import numpy as np 12 | import pandas as pd 13 | import tensorflow as tf 14 | from matplotlib import pyplot as plt 15 | 16 | 17 | import dnn_model 18 | sys.path.append("/home/scw4750/Documents/chembl/data_files/") 19 | import chembl_input as ci 20 | 21 | vs_batch_size = 1024 22 | 23 | def virtual_screening_chemdiv(target, g_step, gpu_num=0): 24 | t_0 = time.time() 25 | 26 | # dataset 27 | d = ci.DatasetChemDiv(target) 28 | # batch size 29 | batch_size = 128 30 | # input vec_len 31 | input_vec_len = d.num_features 32 | # keep prob 33 | keep_prob = 0.8 34 
| # weight decay 35 | wd = 0.004 36 | # g_step 37 | #g_step = 2236500 38 | 39 | # virtual screen pred file 40 | pred_dir = "pred_files/%s" % target 41 | if not os.path.exists(pred_dir): 42 | os.makedirs(pred_dir) 43 | pred_path = os.path.join(pred_dir, "vs_chemdiv_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step)) 44 | predfile = open(pred_path, 'w') 45 | print("virtual screen ChemDiv starts at: %s\n" % datetime.datetime.now()) 46 | 47 | # checkpoint file 48 | ckpt_dir = "ckpt_files/%s" % target 49 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd)) 50 | 51 | # screening 52 | with tf.Graph().as_default(), tf.device("/gpu: %d" % gpu_num): 53 | # the input 54 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 55 | # the term 56 | base = dnn_model.term(input_placeholder, in_units=input_vec_len, wd=wd, keep_prob=1.0) 57 | # the branches 58 | softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0) 59 | # create a saver. 60 | saver = tf.train.Saver(tf.trainable_variables()) 61 | # Start screen 62 | config=tf.ConfigProto(allow_soft_placement=True) 63 | config.gpu_options.per_process_gpu_memory_fraction = 0.35 64 | 65 | 66 | with tf.Session(config=config) as sess: 67 | # Restores variables from checkpoint 68 | saver.restore(sess, ckpt_path + "-%d" % g_step) 69 | 70 | for ids, features in d.batch_generator_chemdiv(vs_batch_size): 71 | sm = sess.run(softmax, feed_dict = {input_placeholder: features}) 72 | for id_, sm_v in zip(ids, sm[:, 1]): 73 | predfile.write("%s\t%f\n" % (id_, sm_v)) 74 | """ 75 | try: 76 | while True: 77 | ids, features = d.generate_batch(vs_batch_size) 78 | sm = sess.run(softmax, feed_dict = {input_placeholder: features.toarray()}) 79 | for id_, sm_v in zip(ids, sm[:, 1]): 80 | predfile.write("%s\t%f\n" % (id_, sm_v)) 81 | except StopIteration: 82 | pass 83 | """ 84 | predfile.close() 85 | print("duration: %.3f" % (time.time() - t_0)) 86 | 87 | 88 | def analyse_sort_chemdiv(target, g_step): 89 | pred_file = "pred_files/%s/vs_chemdiv_%s_128_0.800_4.000e-03_%d.pred" % (target, target, g_step) 90 | pred = pd.read_csv(pred_file, sep="\t", names=("id", "pred")) 91 | pred.sort_values(by="pred", ascending=False, inplace=True) 92 | pred1000 = pred.iloc[:1000] 93 | pred1000.to_csv(pred_file.replace(".pred", ".pred1000"), header=False, sep="\t") 94 | 95 | 96 | if __name__ == "__main__": 97 | target_list = ["CHEMBL203", "CHEMBL204", "CHEMBL205", 98 | "CHEMBL206", "CHEMBL217", "CHEMBL235", "CHEMBL240", 99 | "CHEMBL244", "CHEMBL253", "CHEMBL279", "CHEMBL340", 100 | "CHEMBL4005", "CHEMBL4296", "CHEMBL4805", "CHEMBL4822", 101 | ] 102 | 103 | g_list = [2161371, 2236500, 2235600, 104 | 2091321, 2161661, 2086841, 2020411, 105 | 2161951, 2012041, 2161661, 2246400, 106 | 2235900, 2238000, 2168041, 1936221, 107 | ] 108 | 109 | #i = int(sys.argv[1]) 110 | #target = target_list[i] 111 | #g_step = g_list[i] 112 | virtual_screening_chemdiv(target="CHEMBL4005", g_step=2235900, gpu_num=1) 113 | analyse_sort_chemdiv("CHEMBL4005", g_step=2235900) 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /rf_model/chembl_rf.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Nov 2016 3 | # Time Last Updated: Dec 2016 4 | # Addr: Shenzhen, China 5 | # Description: 6 | 7 | import os 8 | import sys 9 | import math 10 | import time 11 | import 
getpass 12 | import numpy as np 13 | import pandas as pd 14 | from scipy import sparse 15 | from collections import defaultdict 16 | from matplotlib import pyplot as plt 17 | from sklearn.externals import joblib 18 | from sklearn.metrics import roc_curve, auc 19 | from sklearn.ensemble import RandomForestClassifier 20 | 21 | sys.path.append("/home/%s/Documents/chembl/data_files/" % getpass.getuser()) 22 | import chembl_input as ci 23 | 24 | 25 | 26 | 27 | 28 | # the newly picked out 15 targets, include 9 targets from 5 big group, and 6 targets from others. 29 | target_list = ["CHEMBL279", "CHEMBL203", # Protein Kinases 30 | "CHEMBL217", "CHEMBL253", # GPCRs (Family A) 31 | "CHEMBL235", "CHEMBL206", # Nuclear Hormone Receptors 32 | "CHEMBL240", "CHEMBL4296", # Voltage Gated Ion Channels 33 | "CHEMBL4805", # Ligand Gated Ion Channels 34 | "CHEMBL204", "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 35 | ] 36 | 37 | # the target 38 | target = "CHEMBL203" 39 | 40 | 41 | # 42 | model_dir = "model_files" 43 | if not os.path.exists(model_dir): 44 | os.mkdir(model_dir) 45 | 46 | # 47 | pred_dir = "pred_files" 48 | if not os.path.exists(pred_dir): 49 | os.mkdir(pred_dir) 50 | 51 | 52 | def train_pred(target, train_pos_multiply=0): 53 | # 54 | d = ci.Dataset(target, train_pos_multiply=train_pos_multiply) 55 | # random forest clf 56 | clf = RandomForestClassifier(n_estimators=100, max_features=1.0/3, n_jobs=10, max_depth=None, min_samples_split=5, random_state=0) 57 | # fit model 58 | clf.fit(d.train_features, d.train_labels) 59 | # save model 60 | joblib.dump(clf, model_dir + "/rf_%s.m" % target) 61 | # predict class probabilities 62 | #train_pred_proba = clf.predict_proba(d.train_features)[:, 1] 63 | test_pred_proba = clf.predict_proba(d.test_features)[:, 1] 64 | # save pred 65 | test_pred_file = open(pred_dir + "/test_%s.pred" % target, "w") 66 | for id_, pred_v, l_v in zip(d.target_ids_test, test_pred_proba, d.test_labels): 67 | test_pred_file.write("%s\t%f\t%f\n" % (id_, pred_v, l_v)) 68 | test_pred_file.close() 69 | # draw roc fig 70 | fpr, tpr, _ = roc_curve(d.test_labels, test_pred_proba) 71 | roc_auc = auc(fpr, tpr) 72 | plt.figure() 73 | plt.plot(fpr, tpr, color="r", lw=2, label="ROC curve (area = %.2f)" % roc_auc) 74 | plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--") 75 | plt.xlim([0.0, 1.0]) 76 | plt.ylim([0.0, 1.05]) 77 | plt.xlabel("False Positive Rate") 78 | plt.ylabel("True Positive Rate") 79 | plt.title("Receiver operating characteristic of RF model on %s" % target) 80 | plt.legend(loc="lower right") 81 | plt.savefig("%s.png" % target) 82 | #plt.show() 83 | 84 | 85 | 86 | 87 | 88 | target_list = ["CHEMBL206", "CHEMBL217", "CHEMBL235", "CHEMBL240", 89 | "CHEMBL253", "CHEMBL4296", 90 | ] 91 | 92 | 93 | for target in target_list: 94 | t0 = time.time() 95 | train_pred(target, train_pos_multiply=0) 96 | t1 = time.time() 97 | print("%s duration: %.3f" % (target, t1-t0)) 98 | 99 | 100 | """ 101 | 102 | t0 = time.time() 103 | train_pred("CHEMBL4805", train_pos_multiply=0) 104 | t1 = time.time() 105 | print("%s duration: %.3f" % (target, t1-t0)) 106 | 107 | """ 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | """ 120 | pns_pred = clf.predict(d.target_pns_features) 121 | cns_pred = clf.predict(d.target_cns_features_train) 122 | train_pred = clf.predict(d.train_features) 123 | test_pred = clf.predict(d.test_features) 124 | 125 | pns_result = ci.compute_performance(d.target_pns_mask.values.astype(int), pns_pred) 126 | cns_result = 
ci.compute_performance(d.target_cns_mask_train.values.astype(int), cns_pred) 127 | train_result = ci.compute_performance(d.train_labels, train_pred) 128 | test_result = ci.compute_performance(d.test_labels, test_pred) 129 | 130 | print(train_result) 131 | 132 | print(test_result) 133 | """ 134 | 135 | # load model 136 | #clf = joblib.load(model_dir + "/rf_%s.m" % target) 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /dnn_model/dnn_model.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Aug 2016 3 | # Time Last Updated: Nov 2016 4 | # Addr: Shenzhen, China 5 | # Description: dnn model for pk 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | 15 | def fcnn_layer(input_tensor, input_dim, output_dim, layer_name, 16 | wd=False, wd_collection=False, 17 | keep_prob=0.8, variable_collection=False): 18 | with tf.name_scope(layer_name): 19 | weights = tf.Variable(tf.truncated_normal([input_dim, output_dim], stddev=1.0 / np.sqrt(float(input_dim))), name="weights") 20 | if wd is not None: 21 | weight_decay = tf.mul(tf.nn.l2_loss(weights), wd, name="weight_loss") 22 | tf.add_to_collection(wd_collection, weight_decay) 23 | biases = tf.Variable(tf.zeros([output_dim]), name="biases") 24 | if variable_collection: 25 | tf.add_to_collection(variable_collection, weights) 26 | tf.add_to_collection(variable_collection, biases) 27 | relu = tf.nn.relu(tf.matmul(input_tensor, weights) + biases, name="relu") 28 | if keep_prob: 29 | dropout = tf.nn.dropout(relu, keep_prob, name="dropout") 30 | return dropout 31 | else: 32 | return relu 33 | 34 | def term_reg(in_layer, in_units=4852, th1_units=4096, th2_units=3072, th3_units=2048, 35 | wd=0.004, keep_prob=0.8): 36 | th1 = fcnn_layer(in_layer, in_units, th1_units, "term_layer1", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 37 | th2 = fcnn_layer(th1, th1_units, th2_units, "term_layer2", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 38 | th3 = fcnn_layer(th2, th2_units, th3_units, "term_layer3", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 39 | return th3 40 | 41 | def branch_reg(branch_name, base_layer, wd=0.004, keep_prob=0.8, 42 | base_units=2048, bh1_units=2048, bh2_units=1024, out_units = 1): 43 | var_collection="branch_"+branch_name 44 | with tf.name_scope(branch_name): 45 | bh1 = fcnn_layer(base_layer, base_units, bh1_units, "branch_layer1", wd=wd, wd_collection=branch_name+"_wd_loss", keep_prob=keep_prob, variable_collection=var_collection) 46 | bh2 = fcnn_layer(bh1, bh1_units, bh2_units, "branch_layer2", wd=wd, wd_collection=branch_name+"_wd_loss", keep_prob=keep_prob, variable_collection=var_collection) 47 | with tf.name_scope("out_relu"): 48 | weights = tf.Variable(tf.truncated_normal([bh2_units, out_units], stddev=1.0 / np.sqrt(float(bh2_units))), name="weights") 49 | biases = tf.Variable(tf.zeros([out_units]), name="biases") 50 | tf.add_to_collection(var_collection, weights) 51 | tf.add_to_collection(var_collection, biases) 52 | out_relu = tf.nn.relu(tf.matmul(bh2, weights) + biases, name="out_relu") 53 | return out_relu 54 | 55 | 56 | def term(in_layer, in_units = 9561, th1_units = 8192, th2_units = 6144, th3_units = 4096, 
57 | wd=0.004, keep_prob=0.8): 58 | th1 = fcnn_layer(in_layer, in_units, th1_units, "term_layer1", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 59 | th2 = fcnn_layer(th1, th1_units, th2_units, "term_layer2", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 60 | th3 = fcnn_layer(th2, th2_units, th3_units, "term_layer3", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 61 | #th4 = fcnn_layer(th3, th3_units, th4_units, "term_layer4", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 62 | return th3 63 | 64 | def branch(branch_name, base_layer, wd=0.004, keep_prob=0.8, 65 | base_units = 4096, bh1_units = 4096, bh2_units = 2048, bh3_units = 1024, out_units = 2): 66 | var_collection="branch_"+branch_name 67 | with tf.name_scope(branch_name): 68 | bh1 = fcnn_layer(base_layer, base_units, bh1_units, "branch_layer1", wd=wd, wd_collection=branch_name+"_wd_loss", keep_prob=keep_prob, variable_collection=var_collection) 69 | bh2 = fcnn_layer(bh1, bh1_units, bh2_units, "branch_layer2", wd=wd, wd_collection=branch_name+"_wd_loss", keep_prob=keep_prob, variable_collection=var_collection) 70 | bh3 = fcnn_layer(bh2, bh2_units, bh3_units, "branch_layer3", wd=wd, wd_collection=branch_name+"_wd_loss", keep_prob=keep_prob, variable_collection=var_collection) 71 | with tf.name_scope("softmax_linear"): 72 | weights = tf.Variable(tf.truncated_normal([bh3_units, out_units], stddev=1.0 / np.sqrt(float(bh3_units))), name="weights") 73 | biases = tf.Variable(tf.zeros([out_units]), name="biases") 74 | tf.add_to_collection(var_collection, weights) 75 | tf.add_to_collection(var_collection, biases) 76 | softmax = tf.nn.softmax(tf.matmul(bh3, weights) + biases, name="softmax") 77 | return softmax 78 | 79 | def x_entropy(softmax, labels, loss_name, neg_weight=1): 80 | with tf.name_scope(loss_name): 81 | weight = np.array([neg_weight, 1]).astype(np.float32) 82 | cross_entropy = -tf.reduce_sum(tf.reduce_mean(labels * tf.log(softmax) * weight, reduction_indices=[0]), name="x_entropy") 83 | return cross_entropy 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /dnn_model/pk_eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Author: xiaotaw@qq.com (Any bug report is welcome) 3 | # Time: Aug 2016 4 | # Addr: Shenzhen 5 | # Description: evaluate pk model 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import time 13 | import numpy 14 | import datetime 15 | import tensorflow as tf 16 | import dnn_model 17 | import pk_input 18 | 19 | def evaluate(target_list): 20 | """ evaluate the model 21 | """ 22 | # virtual screen log file 23 | log_dir = "log_files" 24 | logpath = os.path.join(log_dir, "pk_eval.log") 25 | logfile = open(logpath, "w") 26 | logfile.write("pk_eval starts at: %s\n" % datetime.datetime.now()) 27 | 28 | # get input dataset 29 | train_dataset_dict = dict() 30 | test_dataset_dict = dict() 31 | for target in target_list: 32 | train_dataset_dict[target] = pk_input.get_inputs_by_cpickle("data_files/pkl_files/" + target + "_train.pkl") 33 | test_dataset_dict[target] = pk_input.get_inputs_by_cpickle("data_files/pkl_files/" + target + "_test.pkl") 34 | 35 | neg_dataset = 
pk_input.get_inputs_by_cpickle("data_files/pkl_files/pubchem_neg_sample.pkl") 36 | 37 | 38 | 39 | with tf.Graph().as_default(), tf.device("/gpu:0"): 40 | 41 | # build the model 42 | input_placeholder = tf.placeholder(tf.float32, shape = (None, 8192)) 43 | label_placeholder = tf.placeholder(tf.float32, shape = (None, 2)) 44 | # build the "Tree" with a mutual "Term" and several "Branches" 45 | base = dnn_model.term(input_placeholder, keep_prob=1.0) 46 | softmax_dict = dict() 47 | wd_loss_dict = dict() 48 | x_entropy_dict = dict() 49 | loss_dict = dict() 50 | accuracy_dict = dict() 51 | for target in target_list: 52 | # compute softmax 53 | softmax_dict[target] = dnn_model.branch(target, base, keep_prob=1.0) 54 | # compute loss. 55 | wd_loss_dict[target] = tf.add_n(tf.get_collection("term_wd_loss") + tf.get_collection(target+"_wd_loss")) 56 | x_entropy_dict[target] = dnn_model.x_entropy(softmax_dict[target], label_placeholder, target) 57 | loss_dict[target] = tf.add(wd_loss_dict[target], x_entropy_dict[target]) 58 | # compute accuracy 59 | accuracy_dict[target] = dnn_model.accuracy(softmax_dict[target], label_placeholder, target) 60 | 61 | # create a saver. 62 | saver = tf.train.Saver(tf.trainable_variables()) 63 | 64 | # create session. 65 | config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 66 | config.gpu_options.per_process_gpu_memory_fraction = 0.5 67 | sess = tf.Session(config=config) 68 | 69 | # Restores variables from checkpoint 70 | saver.restore(sess, "ckpt_files/model.ckpt-40000") 71 | 72 | 73 | 74 | # eval train dataset 75 | for target in target_list: 76 | t0 = float(time.time()) 77 | compds = numpy.vstack([train_dataset_dict[target].compds, neg_dataset.compds]) 78 | labels = numpy.vstack([train_dataset_dict[target].labels, neg_dataset.labels]) 79 | t1 = float(time.time()) 80 | LV, XLV, ACC, prediction, label_dense = sess.run( 81 | [wd_loss_dict[target], 82 | x_entropy_dict[target], 83 | accuracy_dict[target], 84 | tf.argmax(softmax_dict[target], 1), 85 | tf.argmax(labels, 1)], 86 | feed_dict = { 87 | input_placeholder: compds, 88 | label_placeholder: labels, 89 | } 90 | ) 91 | t2 = time.time() 92 | TP, TN, FP, FN, SEN, SPE, MCC = dnn_model.compute_performance(label_dense, prediction) 93 | format_str = "%6d %6d %6.3f %6.3f %10.3f %5d %5d %5d %5d %6.3f %6.3f %6.3f %6.3f %5.3f %5.3f %s" 94 | logfile.write(format_str % (5000, 40000, LV, XLV, 0, TP, FN, TN, FP, SEN, SPE, ACC, MCC, t1-t0, t2-t1, target)) 95 | logfile.write('\n') 96 | print(format_str % (5000, 40000, LV, XLV, 0, TP, FN, TN, FP, SEN, SPE, ACC, MCC, t1-t0, t2-t1, target)) 97 | 98 | # eval test dataset 99 | for target in target_list: 100 | t0 = float(time.time()) 101 | compds = test_dataset_dict[target].compds 102 | labels = test_dataset_dict[target].labels 103 | t1 = float(time.time()) 104 | LV, XLV, ACC, prediction, label_dense = sess.run( 105 | [wd_loss_dict[target], 106 | x_entropy_dict[target], 107 | accuracy_dict[target], 108 | tf.argmax(softmax_dict[target], 1), 109 | tf.argmax(labels, 1)], 110 | feed_dict = { 111 | input_placeholder: compds, 112 | label_placeholder: labels, 113 | } 114 | ) 115 | t2 = time.time() 116 | TP, TN, FP, FN, SEN, SPE, MCC = dnn_model.compute_performance(label_dense, prediction) 117 | format_str = "%6d %6d %6.3f %6.3f %10.3f %5d %5d %5d %5d %6.3f %6.3f %6.3f %6.3f %5.3f %5.3f %s" 118 | logfile.write(format_str % (5000, 40000, LV, XLV, 0, TP, FN, TN, FP, SEN, SPE, ACC, MCC, t1-t0, t2-t1, target)) 119 | logfile.write('\n') 120 | print(format_str % (5000, 40000, LV, 
XLV, 0, TP, FN, TN, FP, SEN, SPE, ACC, MCC, t1-t0, t2-t1, target)) 121 | 122 | logfile.close() 123 | 124 | 125 | if __name__ == "__main__": 126 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", 127 | "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 128 | 129 | evaluate(target_list) 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /dnn_model/single_train.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Aug 2016 3 | # Time Last Updated: Dec 2016 4 | # Addr: Shenzhen, China 5 | # Description: train chembl model for a single target 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import sys 13 | import time 14 | import getpass 15 | import datetime 16 | import tensorflow as tf 17 | 18 | sys.path.append("/home/%s/Documents/chembl/data_files/" % getpass.getuser()) 19 | import dnn_model 20 | import chembl_input as ci 21 | 22 | 23 | def train(target, gpu_num=0, tpm=0, 24 | train_from=0, keep_prob=0.8, wd=0.004, batch_size=128): 25 | """""" 26 | # dataset 27 | d = ci.Dataset(target, train_pos_multiply=tpm) 28 | d.test_features_dense = d.test_features.toarray() 29 | # learning rate 30 | step_per_epoch = int(d.train_size / batch_size) # approximately equal to 7456 31 | start_learning_rate = 0.05 32 | decay_step = step_per_epoch * 10 33 | decay_rate = 0.9 34 | # max train steps 35 | max_step = 300 * step_per_epoch 36 | # input vec_len 37 | input_vec_len = d.num_features 38 | # checkpoint file 39 | ckpt_dir = "ckpt_files/%s" % target 40 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd)) 41 | if not os.path.exists(ckpt_dir): 42 | os.makedirs(ckpt_dir) 43 | # train log file 44 | log_dir = "log_files" 45 | if not os.path.exists(log_dir): 46 | os.mkdir(log_dir) 47 | log_path = os.path.join(log_dir, "train_%s_%d_%4.3f_%4.3e.log" % (target, batch_size, keep_prob, wd)) 48 | logfile = open(log_path, 'w') 49 | logfile.write("train starts at: %s\n" % datetime.datetime.now()) 50 | 51 | 52 | # build dnn model and train 53 | with tf.Graph().as_default(), tf.device('/gpu: %d' % gpu_num): 54 | # placeholders 55 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 56 | label_placeholder = tf.placeholder(tf.float32, shape = (None, 2)) 57 | # global step and learning rate 58 | global_step = tf.Variable(train_from, trainable=False) 59 | learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, decay_step, decay_rate) 60 | # build a Graph that computes the softmax predictions from the 61 | # inference model. 62 | base = dnn_model.term(input_placeholder, in_units=input_vec_len, wd=wd, keep_prob=keep_prob) 63 | # compute softmax 64 | softmax = dnn_model.branch(target, base, wd=wd, keep_prob=keep_prob) 65 | # compute loss. 66 | wd_loss = tf.add_n(tf.get_collection("term_wd_loss") + tf.get_collection(target+"_wd_loss")) 67 | x_entropy = dnn_model.x_entropy(softmax, label_placeholder, target, neg_weight=1) 68 | loss = tf.add(wd_loss, x_entropy) 69 | # train op 70 | train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step) 71 | # create a saver. 72 | saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=None) 73 | # start running operations on the Graph. 
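# (added note) with batch_size=128 and step_per_epoch ~= 7456, the schedule
# defined earlier in train() decays continuously (staircase defaults to False):
#   lr(g_step) = 0.05 * 0.9 ** (g_step / 74560)
# i.e. roughly a 10% drop every 10 epochs.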
74 | config=tf.ConfigProto(allow_soft_placement=True) 75 | config.gpu_options.per_process_gpu_memory_fraction = 0.3 76 | sess = tf.Session(config=config) 77 | # initialize all variables at first. 78 | sess.run(tf.initialize_all_variables()) 79 | if train_from != 0: 80 | saver.restore(sess, ckpt_path + "-%d" % train_from) 81 | # print title to screen and log file 82 | title_str = " step g_step wdloss xloss learn_rate TP FN TN FP SEN SPE ACC MCC t1-t0 t2-t1 t3-t2 target" 83 | print(title_str) 84 | logfile.write(title_str + "\n") 85 | 86 | # format str 87 | format_str = "%6d %6d %6.4f %7.5f %10.8f %5d %5d %5d %5d %6.4f %6.4f %6.4f %6.4f %5.3f %5.3f %5.3f %10s " 88 | 89 | # train the model 90 | for step in xrange(max_step): 91 | t0 = time.time() 92 | 93 | # get a batch sample 94 | perm = d.generate_perm_for_train_batch(batch_size) 95 | compds_batch = d.train_features[perm].toarray() 96 | labels_batch_one_hot = d.train_labels_one_hot[perm] 97 | t1 = time.time() 98 | # train once 99 | _ = sess.run([train_op],feed_dict = {input_placeholder: compds_batch, label_placeholder: labels_batch_one_hot}) 100 | t2 = time.time() 101 | 102 | # compute performance for the train batch 103 | if step % step_per_epoch == 0 or (step + 1) == max_step: 104 | g_step, wd_ls, x_ls, lr, pred = sess.run([global_step, wd_loss, x_entropy, learning_rate, tf.argmax(softmax, 1)], 105 | feed_dict = {input_placeholder: compds_batch, label_placeholder: labels_batch_one_hot}) 106 | tp, tn, fp, fn, sen, spe, acc, mcc = ci.compute_performance(d.train_labels[perm], pred) 107 | t3 = float(time.time()) 108 | logfile.write(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target) + "\n") 109 | print(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 110 | 111 | # save the model checkpoint periodically. 112 | if step % (10 * step_per_epoch) == 0 or (step + 1) == max_step: 113 | saver.save(sess, ckpt_path, global_step=global_step, write_meta_graph=False) 114 | 115 | # compute performance for the test data 116 | if step % (10 * step_per_epoch) == 0 or (step + 1) == max_step: 117 | x_ls, pred = sess.run([x_entropy, tf.argmax(softmax, 1)], 118 | feed_dict = {input_placeholder: d.test_features_dense, label_placeholder: d.test_labels_one_hot}) 119 | tp, tn, fp, fn, sen, spe, acc, mcc = ci.compute_performance(d.test_labels, pred) 120 | logfile.write(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, 0, 0, 0, target) + "\n") 121 | print(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, 0, 0, 0, target)) 122 | 123 | logfile.write("train ends at: %s\n" % datetime.datetime.now()) 124 | logfile.close() 125 | 126 | if __name__ == "__main__": 127 | 128 | # the newly picked out 15 targets, include 9 targets from 5 big group, and 6 targets from others. 
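  # (added note, a hedged usage sketch mirroring the commented-out pattern in
  # single_eval.py and single_vs_chemdiv.py; not part of the original script)
  # the target index could be taken from the command line and jobs spread
  # over the four GPUs, e.g.:
  #   i = int(sys.argv[1])
  #   train(target=target_list[i], gpu_num=i % 4, tpm=0)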
129 | target_list = ["CHEMBL279", 130 | "CHEMBL4805", # Ligand Gated Ion Channels 131 | "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 132 | ] 133 | 134 | 135 | #for target in target_list: 136 | train(target="CHEMBL4722", gpu_num=0, tpm=1) 137 | 138 | -------------------------------------------------------------------------------- /dnn_model/pk_virtual_screen.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Aug 2016 3 | # Time Last Updated: Oct 2016 4 | # Addr: Shenzhen, China 5 | # Description: apply pk model to pubchem dataset, to screen potential active substrate(drugs) 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import sys 13 | import glob 14 | import time 15 | import numpy 16 | import cPickle 17 | import datetime 18 | import tensorflow as tf 19 | from scipy import sparse 20 | 21 | import dnn_model 22 | 23 | 24 | def virtual_screening(target_list, part_num): 25 | 26 | # virtual screen log file 27 | log_dir = "log_files" 28 | logpath = os.path.join(log_dir, "virtual_screen_pubchem_%d.log" % part_num) 29 | logfile = open(logpath, "w") 30 | logfile.write("virtual screen %d starts at: %s\n" % (part_num, datetime.datetime.now())) 31 | 32 | # input and output dir 33 | pkl_dir = "/raid/xiaotaw/pubchem/pkl_files" 34 | prediction_dir = "/raid/xiaotaw/pubchem/prediction_files" 35 | if not os.path.exists(prediction_dir): 36 | os.mkdir(prediction_dir) 37 | 38 | # screening 39 | with tf.Graph().as_default(), tf.device("/gpu:%d" % (part_num // 3)): 40 | # the input 41 | input_placeholder = tf.placeholder(tf.float32, shape = (None, 8192)) 42 | 43 | # the term 44 | base = dnn_model.term(input_placeholder, keep_prob=1.0) 45 | 46 | # the branches 47 | softmax_dict = dict() 48 | for target in target_list: 49 | softmax_dict[target] = dnn_model.branch(target, base, keep_prob=1.0) 50 | 51 | # create a saver. 
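# (added note) tf.trainable_variables() below covers the shared "Term" weights
# plus one "Branch" head per target, so the single checkpoint
# ckpt_files/model.ckpt-40000 restored here serves all eight kinase targets.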
52 | saver = tf.train.Saver(tf.trainable_variables()) 53 | 54 | # Start screen 55 | prediction_dict = dict() 56 | config=tf.ConfigProto(allow_soft_placement=True) 57 | config.gpu_options.per_process_gpu_memory_fraction = 0.2 58 | with tf.Session(config=config) as sess: 59 | # Restores variables from checkpoint 60 | saver.restore(sess, "ckpt_files/model.ckpt-40000") 61 | 62 | 63 | #for i in xrange(1, 121225001, 25000): 64 | begin_num = part_num * 10000000 + 1 65 | if part_num == 11: 66 | end_num = 121225001 67 | else: 68 | end_num = (part_num + 1) * 10000000 + 1 69 | 70 | for i in xrange(begin_num, end_num, 25000): 71 | start_time = float(time.time()) 72 | # get input compounds 73 | in_file = "Compound_" + "{:0>9}".format(i) + "_" + "{:0>9}".format(i + 24999) + ".pkl" 74 | if not os.path.exists(os.path.join(pkl_dir, in_file)): 75 | logfile.write("%s\t0\tnot exists" % in_file) 76 | continue 77 | infile = open(os.path.join(pkl_dir, in_file), "rb") 78 | data = cPickle.load(infile) 79 | numpy.clip(data, 0, 1, out=data) 80 | compds = data.astype(numpy.float32) 81 | infile.close() 82 | for target in target_list: 83 | prediction_dict[target] = sess.run(tf.argmax(softmax_dict[target], 1), feed_dict = {input_placeholder: compds}) 84 | 85 | # stack prediction result into a matrix with shape = (num_compds, num_targets) 86 | prediction = numpy.vstack([prediction_dict[k] for k in target_list]).T 87 | logfile.write("%s\t%s\t%d\n" % (in_file, prediction.sum(axis=0), compds.shape[0])) 88 | # convert into sparse matrix 89 | if not prediction.sum()==0: 90 | sparse_prediction = sparse.csr_matrix(prediction) 91 | # save result into file 92 | out_file = in_file.replace("pkl", "prediction") 93 | outfile = open(os.path.join(prediction_dir, out_file), "wb") 94 | cPickle.dump(sparse_prediction, outfile, protocol=2) 95 | outfile.close() 96 | #logfile.write(str(sparse_prediction)+"\n") 97 | print("%s\t%s\t%d\t%.3f" % (in_file, prediction.sum(axis=0), compds.shape[0], time.time()-start_time)) 98 | logfile.write("virtual screen %d ends at: %s\n" % (part_num, datetime.datetime.now())) 99 | logfile.close() 100 | 101 | 102 | # analyse vs result 103 | def analyse_vs_result(): 104 | prediction_dir = "/raid/xiaotaw/pubchem/prediction_files" 105 | mgfp_dir = "/raid/xiaotaw/pubchem/morgan_fp" 106 | 107 | cid_list = [] 108 | result_list = [] 109 | 110 | for i in xrange(1, 121225001, 25000): 111 | 112 | #for i in xrange(1, 125001, 25000): 113 | 114 | # load data from prediction file 115 | pre_file = "Compound_" + "{:0>9}".format(i) + "_" + "{:0>9}".format(i + 24999) + ".prediction" 116 | pre_filepath = os.path.join(prediction_dir, pre_file) 117 | if not os.path.exists(pre_filepath): 118 | continue 119 | prefile = open(pre_filepath, "rb") 120 | sp = cPickle.load(prefile) 121 | prefile.close() 122 | 123 | # get potential hit compounds' index 124 | index, _ = sp.nonzero() 125 | index = sorted(list(set(index))) 126 | # get potential hit compounds' prediction result 127 | result = sp.toarray()[index] 128 | 129 | # get potential hit compounds' cids from mgfp file 130 | mgfp_file = pre_file.replace("prediction", "mgfp") 131 | mgfp_filepath = os.path.join(mgfp_dir, mgfp_file) 132 | mgfpfile = open(mgfp_filepath, "r") 133 | lines = mgfpfile.readlines() 134 | mgfpfile.close() 135 | cid = [lines[x].split("\t")[0] for x in index] 136 | 137 | # append each file to 138 | cid_list.extend(cid) 139 | result_list.append(result) 140 | 141 | print("%s\t%d" % (pre_file, len(index))) 142 | 143 | results_pre = numpy.vstack(result_list) 144 | 
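# (added note) results_pre has shape (num_hits, len(target_list)): one 0/1
# prediction column per target, in target_list order (cdk2 ... vegfr2); the
# compound CIDs are prepended below to give rows of
# [cid, pred_cdk2, ..., pred_vegfr2].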
results_cid = numpy.array(cid_list, dtype=numpy.int) 145 | results = numpy.hstack([results_cid.reshape(len(cid_list), 1), results_pre]) 146 | 147 | outfile = open("vs_pubchem.result", "wb") 148 | cPickle.dump(results, outfile, protocol=2) 149 | outfile.close() 150 | 151 | return results 152 | 153 | 154 | 155 | def get_chembl_pos(target_list): 156 | mgfp_dir = "data_files/mgfp_files/" 157 | cid_dir = "data_files/id_files/" 158 | 159 | def get_cids(target): 160 | tmp_list = list() 161 | infile = open(mgfp_dir + target + ".mgfp6", "r") 162 | lines = infile.readlines() 163 | infile.close() 164 | lines = [x.split("\t") for x in lines] 165 | infile = open(cid_dir + target + ".cids", "r") 166 | cids = [x.split("\t")[1] for x in infile.readlines()] 167 | 168 | for i in range(len(lines)): 169 | line = lines[i] 170 | if line[1] == "1": 171 | tmp_list.append(cids[i]) 172 | return tmp_list 173 | 174 | 175 | pos_cid_dict = dict() 176 | for target in target_list: 177 | pos_cid_dict[target] = set(get_cids(target)) 178 | 179 | return pos_cid_dict 180 | 181 | 182 | 183 | 184 | if __name__ == "__main__": 185 | 186 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", 187 | "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 188 | 189 | #virtual_screening(target_list, int(sys.argv[1])) 190 | 191 | 192 | 193 | 194 | 195 | 196 | """ 197 | import virtual_screen_pubchem as vsp 198 | import cPickle 199 | 200 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", 201 | "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 202 | 203 | f = open("vs_pubchem.result", "r") 204 | results = cPickle.load(f) 205 | f.close() 206 | 207 | pos_cid_dict = vsp.get_chembl_pos(target_list) 208 | 209 | # test cdk2 210 | cdk2_vs = [results[i, 0] for i in range(results.shape[0]) if results[i, 1]==1] 211 | vs = set(cdk2_vs) 212 | cdk2_re = [int(x) for x in pos_cid_dict["cdk2"]] 213 | re = set(cdk2_re) 214 | len(list(vs | re)) 215 | 216 | 217 | 218 | 219 | 220 | 221 | """ 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /dnn_model/single_eval.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Aug 2016 3 | # Time Last Updated: Oct 2016 4 | # Addr: Shenzhen, China 5 | # Description: evaluate pk model for a single target 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import sys 13 | import time 14 | import datetime 15 | import numpy as np 16 | import tensorflow as tf 17 | 18 | from matplotlib import pyplot as plt 19 | from sklearn.metrics import roc_curve, auc 20 | 21 | import dnn_model 22 | sys.path.append("/home/scw4750/Documents/chembl/data_files/") 23 | import chembl_input as ci 24 | 25 | 26 | eval_batch_size = 1024 27 | 28 | 29 | def evaluate(target, g_step_list=None, gpu_num=0, 30 | keep_prob=0.8, wd=0.004, batch_size=128): 31 | """ evaluate the model 32 | """ 33 | # dataset 34 | d = ci.Dataset(target) 35 | # learning rate 36 | step_per_epoch = int(d.train_size / batch_size) 37 | # input vec_len 38 | input_vec_len = d.num_features 39 | # checkpoint file 40 | ckpt_dir = "ckpt_files/%s" % target 41 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd)) 42 | 43 | # pred file 44 | pred_dir = 
"pred_files/%s" % target 45 | if not os.path.exists(pred_dir): 46 | os.mkdir(pred_dir) 47 | 48 | print("%s eval starts at: %s\n" % (target, datetime.datetime.now())) 49 | 50 | # g_step_list 51 | #g_step_list = range(1, 2235900, 10 * step_per_epoch) 52 | #g_step_list.append(2235900) 53 | 54 | with tf.Graph().as_default(), tf.device("/gpu: %d" % gpu_num): 55 | # build the model 56 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 57 | label_placeholder = tf.placeholder(tf.float32, shape = (None, 2)) 58 | # build the "Tree" with a mutual "Term" and several "Branches" 59 | base = dnn_model.term(input_placeholder, in_units=input_vec_len, wd=wd, keep_prob=1.0) 60 | # compute softmax 61 | softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0) 62 | 63 | # create a saver. 64 | saver = tf.train.Saver(tf.trainable_variables()) 65 | # create session. 66 | config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 67 | config.gpu_options.per_process_gpu_memory_fraction = 0.2 68 | sess = tf.Session(config=config) 69 | 70 | 71 | for g_step in g_step_list: 72 | # Restores variables from checkpoint 73 | saver.restore(sess, ckpt_path + "-%d" % g_step) 74 | 75 | # the whole pns 76 | pns_pred_file = open(pred_dir + "/pns_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step), "w") 77 | for ids, features, mask in d.batch_generator_pns(eval_batch_size): 78 | sm = sess.run(softmax, feed_dict={input_placeholder: features}) 79 | for i, s, m in zip(ids, sm[:, 1], mask): 80 | pns_pred_file.write("%s\t%f\t%d\n" % (i, s, m)) 81 | pns_pred_file.close() 82 | 83 | # the whole cns 84 | cns_pred_file = open(pred_dir + "/cns_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step), "w") 85 | for ids, features, mask in d.batch_generator_cns(eval_batch_size): 86 | sm = sess.run(softmax, feed_dict={input_placeholder: features}) 87 | for i, s, m in zip(ids, sm[:, 1], mask): 88 | cns_pred_file.write("%s\t%f\t%d\n" % (i, s, m)) 89 | cns_pred_file.close() 90 | 91 | # the target's train 92 | train_pred_file = open(pred_dir + "/train_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step), "w") 93 | sm = sess.run(softmax, feed_dict={input_placeholder: d.target_features_train.toarray()}) 94 | for i, s, m in zip(d.target_ids_train, sm[:, 1], d.target_labels_train): 95 | train_pred_file.write("%s\t%f\t%d\n" % (i, s, m)) 96 | train_pred_file.close() 97 | 98 | # the target's test 99 | test_pred_file = open(pred_dir + "/test_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step), "w") 100 | sm = sess.run(softmax, feed_dict={input_placeholder: d.target_features_test.toarray()}) 101 | for i, s, m in zip(d.target_ids_test, sm[:, 1], d.target_labels_test): 102 | test_pred_file.write("%s\t%f\t%d\n" % (i, s, m)) 103 | test_pred_file.close() 104 | 105 | print("eval ends at: %s\n" % datetime.datetime.now()) 106 | 107 | 108 | def test(target, g_step): 109 | # dataset 110 | d = ci.DatasetTarget(target) 111 | # batch size 112 | batch_size = 128 113 | # keep prob 114 | keep_prob = 0.8 115 | # weight decay 116 | wd = 0.004 117 | # checkpoint file 118 | ckpt_dir = "ckpt_files/%s" % target 119 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd)) 120 | # input vec_len 121 | input_vec_len = d.num_features 122 | 123 | with tf.Graph().as_default(), tf.device("/gpu:3"): 124 | # build the model 125 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 126 | 
label_placeholder = tf.placeholder(tf.float32, shape = (None, 2)) 127 | # build the "Tree" with a mutual "Term" and several "Branches" 128 | base = dnn_model.term(input_placeholder, in_units=input_vec_len, wd=wd, keep_prob=1.0) 129 | # compute softmax 130 | softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0) 131 | # compute loss. 132 | wd_loss = tf.add_n(tf.get_collection("term_wd_loss") + tf.get_collection(target+"_wd_loss")) 133 | x_entropy = dnn_model.x_entropy(softmax, label_placeholder, target) 134 | loss = tf.add(wd_loss, x_entropy) 135 | # create a saver. 136 | saver = tf.train.Saver(tf.trainable_variables()) 137 | # create session. 138 | config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 139 | config.gpu_options.per_process_gpu_memory_fraction = 0.2 140 | sess = tf.Session(config=config) 141 | 142 | saver.restore(sess, ckpt_path + "-%d" % g_step) 143 | sm = sess.run(softmax, feed_dict = {input_placeholder: d.target_features_test.toarray()}) 144 | 145 | fpr, tpr, _ = roc_curve(d.target_labels_test, sm[:, 1]) 146 | roc_auc = auc(fpr, tpr) 147 | plt.figure() 148 | plt.plot(fpr, tpr, color="r", lw=2, label="ROC curve (area = %.2f)" % roc_auc) 149 | plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--") 150 | plt.xlim([0.0, 1.0]) 151 | plt.ylim([0.0, 1.05]) 152 | plt.xlabel("False Positive Rate") 153 | plt.ylabel("True Positive Rate") 154 | plt.title("Receiver operating characteristic of DNN model on %s" % target) 155 | plt.legend(loc="lower right") 156 | plt.savefig("%s.png" % target) 157 | #plt.show() 158 | 159 | 160 | 161 | if __name__ == "__main__": 162 | # the newly picked out 15 targets, include 9 targets from 5 big group, and 6 targets from others. 163 | target_list = ["CHEMBL279", "CHEMBL203", # Protein Kinases 164 | "CHEMBL217", "CHEMBL253", # GPCRs (Family A) 165 | "CHEMBL235", "CHEMBL206", # Nuclear Hormone Receptors 166 | "CHEMBL240", "CHEMBL4296", # Voltage Gated Ion Channels 167 | "CHEMBL4805", # Ligand Gated Ion Channels 168 | "CHEMBL204", "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 169 | ] 170 | 171 | target_list = ["CHEMBL203", "CHEMBL204", "CHEMBL205", 172 | "CHEMBL206", "CHEMBL217", "CHEMBL235", "CHEMBL240", 173 | "CHEMBL244", "CHEMBL253", "CHEMBL279", "CHEMBL340", 174 | "CHEMBL4005", "CHEMBL4296", "CHEMBL4805", "CHEMBL4822", 175 | ] 176 | 177 | g_list = [2161371, 2236500, 2235600, 178 | 2091321, 2161661, 2086841, 2020411, 179 | 2161951, 2012041, 2161661, 2246400, 180 | 2235900, 2238000, 2168041, 1936221 181 | ] 182 | 183 | #i = int(sys.argv[1]) 184 | #target = target_list[i] 185 | #g_step = g_list[i] 186 | #evaluate(target=target, g_step_list=[g_step], gpu_num=i % 4) 187 | evaluate(target="CHEMBL4722", g_step_list=[2242500], gpu_num=0) 188 | #test(target, g_step, ) 189 | 190 | 191 | 192 | -------------------------------------------------------------------------------- /dnn_model/pk_train.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Aug 2016 3 | # Time Last Updated: Nov 2016 4 | # Addr: Shenzhen, China 5 | # Description: train pk model 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import time 13 | import datetime 14 | import math 15 | import numpy 16 | import random 17 | import tensorflow as tf 18 | 19 | import pk_input as pki 20 | import dnn_model 21 | 22 | 23 | def 
train(target_list, train_from = 0): 24 | 25 | # dataset 26 | d = pki.Datasets(target_list) 27 | 28 | # batch size. 29 | # note: the mean number of neg sample is 25.23 times as many as pos's. 30 | neg_batch_size = 512 31 | pos_batch_size_dict = {} 32 | pos_sum = 0 33 | for target in target_list: 34 | pos_sum += d.pos[target].size 35 | pos_batch_size = int(neg_batch_size * pos_sum / d.neg.size) 36 | for target in target_list: 37 | pos_batch_size_dict[target] = int(neg_batch_size * d.pos[target].size / d.neg.size) 38 | #pos_batch_size_dict[target] = pos_batch_size 39 | # learning rate 40 | step_per_epoch = int(d.neg.size / neg_batch_size) 41 | start_learning_rate = 0.05 42 | decay_step = step_per_epoch * 10 * 8 43 | decay_rate = 0.9 44 | # max train steps 45 | max_step = 50 * step_per_epoch 46 | # input vec_len 47 | input_vec_len = d.neg.features.shape[1] 48 | # keep prob 49 | keep_prob = 0.8 50 | # weight decay 51 | wd = 0.001 52 | # checkpoint file 53 | ckpt_dir = "ckpt_files_big_tree/pk" 54 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (neg_batch_size, keep_prob, wd)) 55 | if not os.path.exists(ckpt_dir): 56 | os.makedirs(ckpt_dir) 57 | # train log file 58 | log_dir = "log_files_big_tree" 59 | if not os.path.exists(log_dir): 60 | os.mkdir(log_dir) 61 | log_path = os.path.join(log_dir, "train_pk_%d_%4.3f_%4.3e.log" % (neg_batch_size, keep_prob, wd)) 62 | logfile = open(log_path, 'w') 63 | logfile.write("train starts at: %s\n" % datetime.datetime.now()) 64 | 65 | 66 | # train the model 67 | with tf.Graph().as_default(), tf.device("/gpu:0"): 68 | 69 | # exponential decay learning rate 70 | global_step = tf.Variable(train_from, trainable=False) 71 | learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, decay_step, decay_rate) 72 | 73 | # build the model 74 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 75 | label_placeholder = tf.placeholder(tf.float32, shape = (None, 2)) 76 | # build the "Tree" with a mutual "Term" and several "Branches" 77 | base = dnn_model.term(input_placeholder, wd=wd, keep_prob=keep_prob) 78 | softmax_dict = dict() 79 | wd_loss_dict = dict() 80 | x_entropy_dict = dict() 81 | loss_dict = dict() 82 | accuracy_dict = dict() 83 | train_op_dict = dict() 84 | for target in target_list: 85 | # compute softmax 86 | softmax_dict[target] = dnn_model.branch(target, base, wd=wd, keep_prob=keep_prob) 87 | # compute loss. 88 | wd_loss_dict[target] = tf.add_n(tf.get_collection("term_wd_loss") + tf.get_collection(target+"_wd_loss")) 89 | x_entropy_dict[target] = dnn_model.x_entropy(softmax_dict[target], label_placeholder, target) 90 | loss_dict[target] = tf.add(wd_loss_dict[target], x_entropy_dict[target]) 91 | # compute accuracy 92 | accuracy_dict[target] = dnn_model.accuracy(softmax_dict[target], label_placeholder, target) 93 | # train op 94 | train_op_dict[target] = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss_dict[target], global_step=global_step) 95 | # create a saver. 96 | saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=None) 97 | # start running operations on the Graph. 98 | config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 99 | config.gpu_options.per_process_gpu_memory_fraction = 0.8 100 | sess = tf.Session(config=config) 101 | # initialize all variables at first. 
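# (added note) tf.initialize_all_variables() here and tf.mul() in dnn_model.py
# are the pre-1.0 TensorFlow API; on TF >= 1.0 they were renamed
# tf.global_variables_initializer() and tf.multiply().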
102 | sess.run(tf.initialize_all_variables()) 103 | if train_from != 0: 104 | saver.restore(sess, ckpt_path + "-%d" % train_from) 105 | # print title to screen and log file 106 | title_str = " step g_step wdloss xloss learn_rate TP FN TN FP SEN SPE ACC MCC t1-t0 t2-t1 t3-t2 target" 107 | print(title_str) 108 | logfile.write(title_str + "\n") 109 | 110 | # format str 111 | format_str = "%6d %6d %6.4f %7.5f %10.8f %5d %5d %5d %5d %6.4f %6.4f %6.4f %6.4f %5.3f %5.3f %5.3f %10s " 112 | 113 | # train with max step 114 | for step in xrange(max_step): 115 | for target in target_list: 116 | t0 = time.time() 117 | 118 | # get a batch sample 119 | compds_batch, labels_batch = d.next_train_batch(target, pos_batch_size_dict[target], neg_batch_size) 120 | t1 = float(time.time()) 121 | 122 | _ = sess.run(train_op_dict[target], feed_dict={input_placeholder: compds_batch, label_placeholder: labels_batch}) 123 | t2 = float(time.time()) 124 | 125 | # compute performance 126 | # (once per epoch, and at the final step) 127 | if step % step_per_epoch == 0 or (step + 1) == max_step: 128 | g_step, wd_ls, x_ls, lr, acc, pred, label_dense = sess.run([global_step, wd_loss_dict[target], x_entropy_dict[target], learning_rate, accuracy_dict[target], tf.argmax(softmax_dict[target], 1), tf.argmax(labels_batch, 1)], 129 | feed_dict = {input_placeholder: compds_batch, label_placeholder: labels_batch}) 130 | tp, tn, fp, fn, sen, spe, mcc = dnn_model.compute_performance(label_dense, pred) 131 | t3 = float(time.time()) 132 | # print to file and screen 133 | 134 | logfile.write(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 135 | logfile.write('\n') 136 | print(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 137 | 138 | 139 | # save the model checkpoint periodically.
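# note: given step_per_epoch = d.neg.size // neg_batch_size, the condition
# below saves a checkpoint every 10 epochs and once at the final step; since
# the saver was created with max_to_keep=None, all of these checkpoints are
# kept on disk.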
140 | if step % (10 * step_per_epoch) == 0 or (step + 1) == max_step: 141 | saver.save(sess, ckpt_path, global_step=global_step, write_meta_graph=False) 142 | 143 | if (step > 3 * 10 * step_per_epoch) and (step % (10 * step_per_epoch) == 0 or (step + 1) == max_step): 144 | for target in target_list: 145 | # evaluate on the whole train set 146 | t0 = time.time() 147 | compds_batch = numpy.vstack([d.pos[target].features[d.pos[target].train_perm], d.neg.features[d.neg.train_perm]]) 148 | labels_batch = numpy.vstack([d.pos[target].labels[d.pos[target].train_perm], d.neg.mask_dict[target][d.neg.train_perm]]) 149 | t1 = time.time() 150 | t2 = time.time() 151 | g_step, wd_ls, x_ls, lr, acc, pred, label_dense = sess.run([global_step, wd_loss_dict[target], x_entropy_dict[target], learning_rate, accuracy_dict[target], tf.argmax(softmax_dict[target], 1), tf.argmax(labels_batch, 1)], 152 | feed_dict = {input_placeholder: compds_batch, label_placeholder: labels_batch}) 153 | t3 = float(time.time()) 154 | tp, tn, fp, fn, sen, spe, mcc = dnn_model.compute_performance(label_dense, pred) 155 | # print to file and screen 156 | logfile.write(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 157 | logfile.write('\n') 158 | print(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 159 | 160 | # evaluate on the whole test set 161 | t0 = time.time() 162 | compds_batch = numpy.vstack([d.pos[target].features[d.pos[target].test_perm], d.neg.features[d.neg.test_perm]]) 163 | labels_batch = numpy.vstack([d.pos[target].labels[d.pos[target].test_perm], d.neg.mask_dict[target][d.neg.test_perm]]) 164 | t1 = time.time() 165 | t2 = time.time() 166 | g_step, wd_ls, x_ls, lr, acc, pred, label_dense = sess.run([global_step, wd_loss_dict[target], x_entropy_dict[target], learning_rate, accuracy_dict[target], tf.argmax(softmax_dict[target], 1), tf.argmax(labels_batch, 1)], 167 | feed_dict = {input_placeholder: compds_batch, label_placeholder: labels_batch}) 168 | t3 = float(time.time()) 169 | tp, tn, fp, fn, sen, spe, mcc = dnn_model.compute_performance(label_dense, pred) 170 | # print to file and screen 171 | logfile.write(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 172 | logfile.write('\n') 173 | print(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 174 | 175 | 176 | logfile.write("train ends at: %s\n" % datetime.datetime.now()) 177 | logfile.close() 178 | 179 | 180 | 181 | if __name__ == "__main__": 182 | 183 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", 184 | "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 185 | 186 | train(target_list, train_from=0) 187 | 188 | 189 | 190 | -------------------------------------------------------------------------------- /dnn_model/single_vs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Author: xiaotaw@qq.com (Any bug report is welcome) 3 | # Time: Aug 2016 4 | # Addr: Shenzhen 5 | # Description: apply the pk model to the pubchem dataset, to screen for potentially active substrates (drugs) 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import sys 13 | import time 14 | import numpy as np 15 | import datetime 16 | import tensorflow as tf 17 | from matplotlib import pyplot as plt 18 | 19 | import dnn_model 20 |
sys.path.append("/home/scw4750/Documents/chembl/data_files/") 21 | import chembl_input as ci 22 | 23 | 24 | 25 | 26 | 27 | def virtual_screening_single(target, g_step, part_num, gpu_num): 28 | t_0 = time.time() 29 | 30 | # dataset 31 | d = ci.DatasetVS(target) 32 | # batch size 33 | batch_size = 128 34 | # input vec_len 35 | input_vec_len = d.num_features 36 | # keep prob 37 | keep_prob = 0.8 38 | # weight decay 39 | wd = 0.004 40 | # g_step 41 | #g_step = 2236500 42 | 43 | # virtual screen pred file 44 | pred_dir = "pred_files/%s" % target 45 | if not os.path.exists(pred_dir): 46 | os.makedirs(pred_dir) 47 | pred_path = os.path.join(pred_dir, "vs_pubchem_%s_%d_%4.3f_%4.3e_%d_%d.pred" % (target, batch_size, keep_prob, wd, g_step, part_num)) 48 | predfile = open(pred_path, 'w') 49 | print("virtual screen %d starts at: %s\n" % (part_num, datetime.datetime.now())) 50 | 51 | # checkpoint file 52 | ckpt_dir = "ckpt_files/%s" % target 53 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd)) 54 | 55 | # input fp dir 56 | fp_dir = "/raid/xiaotaw/pubchem/fp_files/%d" % part_num 57 | 58 | # screening 59 | with tf.Graph().as_default(), tf.device("/gpu:%d" % gpu_num): 60 | #with tf.Graph().as_default(), tf.device("/gpu:%d" % (part_num % 4)): 61 | # the input 62 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 63 | # the term 64 | base = dnn_model.term(input_placeholder, in_units=input_vec_len, wd=wd, keep_prob=1.0) 65 | # the branches 66 | softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0) 67 | # create a saver. 68 | saver = tf.train.Saver(tf.trainable_variables()) 69 | # start screening 70 | config=tf.ConfigProto(allow_soft_placement=True) 71 | config.gpu_options.per_process_gpu_memory_fraction = 0.35 72 | with tf.Session(config=config) as sess: 73 | # Restores variables from checkpoint 74 | saver.restore(sess, ckpt_path + "-%d" % g_step) 75 | for i in xrange(part_num * 10000000 + 1, (part_num + 1) * 10000000, 25000): 76 | in_file = "Compound_" + "{:0>9}".format(i) + "_" + "{:0>9}".format(i + 24999) + ".apfp" 77 | fp_fn = os.path.join(fp_dir, in_file) 78 | if not os.path.exists(fp_fn): 79 | print("%s does not exist" % fp_fn) 80 | continue 81 | d.reset(fp_fn) 82 | compds = d.features_dense 83 | sm = sess.run(softmax, feed_dict = {input_placeholder: compds}) 84 | for id_, sm_v in zip(d.pubchem_id, sm[:, 1]): 85 | predfile.writelines("%s\t%f\n" % (id_, sm_v)) 86 | print("%s\t%d\n" % (fp_fn, len(d.pubchem_id))) 87 | 88 | print("duration: %.3f" % (time.time() - t_0)) 89 | 90 | 91 | """ 92 | def predict(target, g_step_list=None): 93 | # dataset 94 | d = ci.Dataset(target) 95 | # batch size 96 | batch_size = 128 97 | # learning rate 98 | step_per_epoch = int(d.train_size / batch_size) 99 | # input vec_len 100 | input_vec_len = d.train_features.shape[1] 101 | # keep prob 102 | keep_prob = 0.8 103 | # weight decay 104 | wd = 0.004 105 | # checkpoint file 106 | ckpt_dir = "ckpt_files/%s" % target 107 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd)) 108 | # pred file 109 | pred_dir = "pred_files/%s" % target 110 | if not os.path.exists(pred_dir): 111 | os.makedirs(pred_dir) 112 | 113 | # g_step_list 114 | #g_step_list = range(1, 2235900, 10 * step_per_epoch) 115 | #g_step_list = [2161371] 116 | 117 | with tf.Graph().as_default(), tf.device("/gpu:3"): 118 | 119 | # build the model 120 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 121 | label_placeholder =
tf.placeholder(tf.float32, shape = (None, 2)) 122 | # build the "Tree" with a mutual "Term" and several "Branches" 123 | base = dnn_model.term(input_placeholder, in_units=input_vec_len, wd=wd, keep_prob=1.0) 124 | # compute softmax 125 | softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0) 126 | # compute loss. 127 | wd_loss = tf.add_n(tf.get_collection("term_wd_loss") + tf.get_collection(target+"_wd_loss")) 128 | x_entropy = dnn_model.x_entropy(softmax, label_placeholder, target) 129 | loss = tf.add(wd_loss, x_entropy) 130 | # create a saver. 131 | saver = tf.train.Saver(tf.trainable_variables()) 132 | # create session. 133 | config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 134 | config.gpu_options.per_process_gpu_memory_fraction = 0.9 135 | sess = tf.Session(config=config) 136 | 137 | # target test 138 | test_chemblid = d.time_split_test["CMPD_CHEMBLID"] 139 | test_compds = d.test_features.toarray() 140 | test_labels_dense = d.test_labels 141 | 142 | # target train 143 | time_split_train = d.target_clf_label[d.target_clf_label["YEAR"] <= 2014] 144 | target_train_chemblid = time_split_train["CMPD_CHEMBLID"] 145 | m = d.target_cns_mask.index.isin(time_split_train["CMPD_CHEMBLID"]) 146 | target_train_features = d.target_cns_features[m].toarray() 147 | target_train_labels_dense = d.target_cns_mask[m].values.astype(int) 148 | 149 | for g_step in g_step_list: 150 | # Restores variables from checkpoint 151 | saver.restore(sess, ckpt_path + "-%d" % g_step) 152 | 153 | # the target's test 154 | sm = sess.run(softmax, feed_dict = {input_placeholder: test_compds}) 155 | 156 | test_pred_path = os.path.join(pred_dir, "test_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step)) 157 | test_pred_file = open(test_pred_path, 'w') 158 | 159 | for id_, sm_v, l_v in zip(test_chemblid, sm[:, 1], test_labels_dense): 160 | test_pred_file.writelines("%s\t%f\t%f\n" % (id_, sm_v, l_v)) 161 | 162 | test_pred_file.close() 163 | 164 | # the target's train 165 | sm = sess.run(softmax, feed_dict = {input_placeholder: target_train_features}) 166 | 167 | train_pred_path = os.path.join(pred_dir, "train_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step)) 168 | train_pred_file = open(train_pred_path, 'w') 169 | 170 | for id_, sm_v, l_v in zip(target_train_chemblid, sm[:, 1], target_train_labels_dense): 171 | train_pred_file.writelines("%s\t%f\t%f\n" % (id_, sm_v, l_v)) 172 | 173 | train_pred_file.close() 174 | """ 175 | 176 | def analyse(target, g_step): 177 | vs_pred_file = "pred_files/%s/vs_pubchem_%s_128_0.800_4.000e-03_%d.pred" % (target, target, g_step) 178 | aa = np.genfromtxt(vs_pred_file, delimiter="\t") 179 | a = aa[:, 1] 180 | 181 | test_pred_file = "pred_files/%s/test_%s_128_0.800_4.000e-03_%d.pred" % (target, target, g_step) 182 | bb = np.genfromtxt(test_pred_file, delimiter="\t", usecols=[1,2]) 183 | b = bb[:, 0][bb[:, 1].astype(bool)] 184 | 185 | """ 186 | train_pred_file = "pred_files/%s/train_%s_128_0.800_4.000e-03_%d.pred" % (target, target, g_step) 187 | cc = np.genfromtxt(train_pred_file, delimiter="\t", usecols=[1,2]) 188 | c = cc[:, 0][cc[:, 1].astype(bool)] 189 | 190 | bhist = plt.hist(b, bins=100, range=(0, 1), cumulative=False, histtype="stepfilled", ) 191 | plt.hist(b, bins=100, range=(0, 1), cumulative=True, histtype="step", ) 192 | plt.show() 193 | 194 | 195 | chist = plt.hist(c, bins=100, cumulative=False, histtype="stepfilled", ) 196 | plt.hist(c, bins=100, cumulative=True, histtype="step", ) 197 | plt.show() 198 | 
valid histtype values: "bar", "barstacked", "step", "stepfilled" 200 | 201 | 202 | """ 203 | 204 | x = [] 205 | y = [] 206 | for i in range(10): 207 | mark = (i + 1) / 20.0 208 | xi = 1.0 * (b > mark).sum() / b.shape[0] 209 | yi = (a > mark).sum() 210 | x.append(xi) 211 | y.append(yi) 212 | 213 | plt.plot(x, y, "*") 214 | plt.xlabel("pos yield rate") 215 | plt.ylabel("vs pubchem false pos") 216 | 217 | plt.savefig("pred_files/%s/%d.png" % (target, g_step)) 218 | 219 | 220 | 221 | 222 | 223 | if __name__ == "__main__": 224 | # the newly picked out 15 targets, including 9 targets from 5 big groups and 6 targets from others. 225 | target_list = ["CHEMBL279", "CHEMBL203", # Protein Kinases 226 | "CHEMBL217", "CHEMBL253", # GPCRs (Family A) 227 | "CHEMBL235", "CHEMBL206", # Nuclear Hormone Receptors 228 | "CHEMBL240", "CHEMBL4296", # Voltage Gated Ion Channels 229 | "CHEMBL4805", # Ligand Gated Ion Channels 230 | "CHEMBL204", "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 231 | ] 232 | 233 | # the target 234 | target = "CHEMBL4722" 235 | 236 | # part_num ranges from 0 to 12 (inclusive) 237 | #for i in range(9, 13): 238 | # virtual_screening_single(target, 2260800, i, 3) 239 | 240 | #predict(target, g_step_list=[2252100]) 241 | 242 | 243 | analyse(target, g_step=2242500) 244 | 245 | 246 | 247 | -------------------------------------------------------------------------------- /data_files/chembl_cal_mask.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Dec 2016 3 | # Time Last Updated: Dec 2016 4 | # Addr: Shenzhen, China 5 | # Description: calculate the mask (label) of chembl molecules for specific targets 6 | 7 | import os 8 | import sys 9 | import math 10 | import time 11 | import datetime 12 | import multiprocessing 13 | import numpy as np 14 | from scipy import sparse 15 | from collections import defaultdict 16 | 17 | # folders 18 | fp_dir = "fp_files" 19 | structure_dir = "structure_files" 20 | mask_dir = "mask_files" 21 | if not os.path.exists(mask_dir): 22 | os.mkdir(mask_dir) 23 | log_dir = "log_files" 24 | if not os.path.exists(log_dir): 25 | os.mkdir(log_dir) 26 | 27 | 28 | # the newly picked out 15 targets, including 9 targets from 5 big groups and 6 targets from others.
29 | target_list = ["CHEMBL279", "CHEMBL203", # Protein Kinases 30 | "CHEMBL217", "CHEMBL253", # GPCRs (Family A) 31 | "CHEMBL235", "CHEMBL206", # Nuclear Hormone Receptors 32 | "CHEMBL240", "CHEMBL4296", # Voltage Gated Ion Channels 33 | "CHEMBL4805", # Ligand Gated Ion Channels 34 | "CHEMBL204", "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 35 | ] 36 | 37 | # the target 38 | #target = target_list[int(sys.argv[1])] 39 | 40 | # read chembl id and apfp 41 | chembl_id = [] 42 | chembl_apfp = {} 43 | f = open(os.path.join(fp_dir, "chembl.apfp"), "r") 44 | for line in f: 45 | id_, fps_str = line.split("\t") 46 | id_ = id_.strip() 47 | fps_str = fps_str.strip() 48 | chembl_id.append(id_) 49 | chembl_apfp[id_] = fps_str 50 | 51 | f.close() 52 | 53 | # read pns (pubchem negative sample) apfp, and count the fps that appear in pns compounds 54 | pns_id = [] 55 | pns_apfp = {} 56 | pns_count = defaultdict(lambda : 0) 57 | f = open(os.path.join(fp_dir, "pubchem_neg_sample.apfp"), "r") 58 | for line in f: 59 | id_, fps_str = line.split("\t") 60 | id_ = id_.strip() 61 | fps_str = fps_str.strip() 62 | pns_id.append(id_) 63 | pns_apfp[id_] = fps_str 64 | for fp in fps_str[1:-1].split(","): 65 | if ":" in fp: 66 | k, _ = fp.split(":") 67 | pns_count[int(k)] += 1 68 | 69 | f.close() 70 | 71 | 72 | # read top 79 targets' label 73 | clf_label_79 = np.genfromtxt(os.path.join(structure_dir, "chembl_top79.label"), usecols=[0, 2, 3], delimiter="\t", skip_header=1, dtype=str) 74 | 75 | def cal_mask(target): 76 | ################################################################################ 77 | # generate sparse matrix for target features 78 | 79 | # target compounds' chembl_id and clf label. 80 | target_clf_label = clf_label_79[clf_label_79[:, 0] == target] 81 | 82 | # remove compounds whose apfp cannot be calculated 83 | m = [] 84 | for cmpd_id in target_clf_label[:, 1]: 85 | if cmpd_id in chembl_id: 86 | m.append(True) 87 | else: 88 | m.append(False) 89 | target_clf_label = target_clf_label[np.array(m)] 90 | 91 | # target fps 92 | target_fps = [chembl_apfp[x] for x in target_clf_label[:, 1]] 93 | 94 | # count the fps that appeared in the compounds of the target 95 | target_count = defaultdict(lambda : 0) 96 | for fps_str in target_fps: 97 | for fp in fps_str[1:-1].split(","): 98 | if ":" in fp: 99 | k, _ = fp.split(":") 100 | target_count[int(k)] += 1 101 | 102 | target_count.update(pns_count) 103 | 104 | # save target apfp count 105 | count_file = open(os.path.join(mask_dir, "%s_apfp.count" % target), "w") 106 | for k in target_count.keys(): 107 | count_file.write("%d\t%d\n" % (k, target_count[k])) 108 | 109 | count_file.close() 110 | 111 | # pick out the fps that appeared more than 10 times. 112 | # Here we assume that the more frequently a fp appears, the more important it is. 113 | v = np.array([[k, target_count[k]] for k in target_count.keys()]) 114 | m = v[:, 1] > 10 115 | target_apfp_picked = v[m][:, 0] 116 | 117 | # according to the apfps that were picked out, define the columns of the feature sparse matrix 118 | # Note: a defaultdict is used. 119 | # Its purpose is to assign a default value (the length of target_apfp_picked) to any apfp 120 | # that is not included in target_apfp_picked. This column (the last column) is ultimately 121 | # not used at all.
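# a tiny worked example of the mapping built below, assuming
# target_apfp_picked = [3, 17, 42]: columns_dict[17] -> 1, while an unseen key
# such as columns_dict[999] -> 3, i.e. the extra overflow column that the
# [:, :-1] slices drop later.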
122 | columns_dict = defaultdict(lambda : len(target_apfp_picked)) 123 | for i, apfp in enumerate(target_apfp_picked): 124 | columns_dict[apfp] = i 125 | 126 | # define the function that constructs a feature sparse matrix according to the columns_dict 127 | def sparse_features(fps_list): 128 | data = [] 129 | indices = [] 130 | indptr = [0] 131 | for fps_str in fps_list: 132 | n = indptr[-1] 133 | for fp in fps_str[1:-1].split(","): 134 | if ":" in fp: 135 | k, v = fp.split(":") 136 | indices.append(columns_dict[int(k)]) 137 | data.append(int(v)) 138 | n += 1 139 | indptr.append(n) 140 | a = sparse.csr_matrix((np.array(data), indices, indptr), shape=(len(fps_list), len(target_apfp_picked) + 1)) 141 | return a 142 | 143 | # pick out target compounds with pos labels 144 | # normally, abs(clf_label) > 0.5 (refer to chembl_preparation.py), 145 | # so it also works when using the following line: 146 | # target_pos_id = target_clf_label[target_clf_label[:, 2].astype(float) > 0.5][:, 1] 147 | target_pos_id = target_clf_label[target_clf_label[:, 2].astype(float) > 0][:, 1] 148 | target_pos_fps = [chembl_apfp[x] for x in target_pos_id] 149 | 150 | # generate feature sparse matrix for target's pos compounds 151 | target_pos_features = sparse_features(target_pos_fps)[:, :-1].toarray() 152 | 153 | # generate feature sparse matrix for pns compounds 154 | target_pns_features = sparse_features([pns_apfp[k] for k in pns_id])[:, :-1] 155 | 156 | # generate feature sparse matrix for cns (chembl negative sample) compounds 157 | target_cns_features = sparse_features([chembl_apfp[k] for k in chembl_id])[:, :-1] 158 | 159 | 160 | ################################################################################ 161 | # generate mask for pns and cns 162 | 163 | # define a task function for a sub-process: 164 | # it compares a part of the negative samples (cns or pns) with the pos samples, 165 | # and returns the mask of those samples back to the main process.
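# note: sub_compare below marks a negative sample as True exactly when its
# picked-apfp feature vector is identical to that of some positive compound,
# i.e. (neg_f != pos_f).sum() == 0, so the mask flags negatives that are
# indistinguishable from actives; each sub-process sends its partial mask and
# log lines back to the parent through a multiprocessing.Pipe.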
166 | def sub_compare(sub_neg_id, sub_neg_features, conn): 167 | mask = {} 168 | log_str = [] 169 | for neg_k, neg_f in zip(sub_neg_id, sub_neg_features): 170 | for pos_k, pos_f in zip(target_pos_id, target_pos_features): 171 | if (neg_f != pos_f).sum() == 0: 172 | mask[neg_k] = True 173 | log_str.append("%s\t%s\n" % (neg_k, pos_k)) 174 | conn.send((mask, log_str)) 175 | conn.close() 176 | 177 | # the number of sub-processes for the computation 178 | n_jobs = 6 179 | 180 | 181 | # compute the mask for pns using multiprocessing 182 | t1 = time.time() 183 | date1 = datetime.datetime.now() 184 | 185 | num_per_job = int(math.ceil(target_pns_features.shape[0] / float(n_jobs))) 186 | thread_list = [] 187 | conn_list = [] 188 | for i in range(0, n_jobs): 189 | begin = i * num_per_job 190 | end = (i + 1) * num_per_job 191 | if end > target_pns_features.shape[0]: 192 | end = target_pns_features.shape[0] 193 | p_conn, c_conn = multiprocessing.Pipe() 194 | conn_list.append((p_conn, c_conn)) 195 | t = multiprocessing.Process(target=sub_compare, args=(pns_id[begin: end], target_pns_features[begin: end], c_conn)) 196 | thread_list.append(t) 197 | 198 | for i in range(n_jobs): 199 | thread_list[i].start() 200 | 201 | for i in range(n_jobs): 202 | thread_list[i].join() 203 | 204 | t2 = time.time() 205 | 206 | target_pns_mask = defaultdict(lambda : False) 207 | 208 | log = open(log_dir + "/" + target + "_gen_pns_mask.log", "w") 209 | log.write("%s generate mask for pubchem neg sample, begins at %s\n" % (target, str(date1))) 210 | 211 | for i in range(n_jobs): 212 | p_conn = conn_list[i][0] 213 | mask, log_str = p_conn.recv() 214 | target_pns_mask.update(mask) 215 | log.writelines(log_str) 216 | 217 | log.write("generate mask for pns, duration: %.3f\n" % (t2 - t1)) 218 | log.close() 219 | 220 | mask_file = open(os.path.join(mask_dir, "%s_pns.mask" % target), "w") 221 | mask_file.writelines(["%s\t%s\n" % (x, target_pns_mask[x]) for x in pns_id]) 222 | mask_file.close() 223 | 224 | print("generate mask for pns, duration: %.3f" % (t2 - t1)) 225 | 226 | 227 | # compute the mask for cns using multiprocessing 228 | t2 = time.time() 229 | date2 = datetime.datetime.now() 230 | 231 | num_per_job = int(math.ceil(target_cns_features.shape[0] / float(n_jobs))) 232 | thread_list = [] 233 | conn_list = [] 234 | for i in range(0, n_jobs): 235 | begin = i * num_per_job 236 | end = (i + 1) * num_per_job 237 | if end > target_cns_features.shape[0]: 238 | end = target_cns_features.shape[0] 239 | p_conn, c_conn = multiprocessing.Pipe() 240 | conn_list.append((p_conn, c_conn)) 241 | t = multiprocessing.Process(target=sub_compare, args=(chembl_id[begin: end], target_cns_features[begin: end], c_conn)) 242 | thread_list.append(t) 243 | 244 | for i in range(n_jobs): 245 | thread_list[i].start() 246 | 247 | for i in range(n_jobs): 248 | thread_list[i].join() 249 | 250 | t3 = time.time() 251 | 252 | target_cns_mask = defaultdict(lambda : False) 253 | 254 | log = open(log_dir + "/" + target + "_gen_cns_mask.log", "w") 255 | log.write("%s generate mask for chembl neg sample, begins at %s\n" % (target, str(date2))) 256 | 257 | for i in range(n_jobs): 258 | p_conn = conn_list[i][0] 259 | mask, log_str = p_conn.recv() 260 | target_cns_mask.update(mask) 261 | log.writelines(log_str) 262 | 263 | log.write("generate mask for cns, duration: %.3f\n" % (t3 - t2)) 264 | log.close() 265 | 266 | mask_file = open(os.path.join(mask_dir, "%s_cns.mask" % target), "w") 267 | mask_file.writelines(["%s\t%s\n" % (x, target_cns_mask[x]) for x in chembl_id]) 268 |
mask_file.close() 269 | 270 | print("generate mask for cns, duration: %.3f" % (t3 - t2)) 271 | 272 | 273 | # the remaining targets to compute masks for (a subset of the newly picked out 15 targets). 274 | target_list = [ 275 | "CHEMBL4805", # Ligand Gated Ion Channels 276 | "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 277 | ] 278 | 279 | 280 | #for target in target_list: 281 | # cal_mask(target) 282 | cal_mask(sys.argv[1]) 283 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner.
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 203 | -------------------------------------------------------------------------------- /dnn_model/pk_input.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Aug 2016 3 | # Time Last Updated: Nov 2016 4 | # Addr: Shenzhen, China 5 | # Description: define functions and parameters related to input data 6 | 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import os 13 | import h5py 14 | import time 15 | import random 16 | 17 | import numpy as np 18 | import pandas as pd 19 | 20 | from scipy import sparse 21 | 22 | vec_len = 9561 23 | data_dir = "../data_files" 24 | h5_dir = os.path.join(data_dir, "h5_files") 25 | 26 | 27 | def dense_to_one_hot(labels_dense, num_classes=2, dtype=np.int): 28 | """Convert class labels from scalars to one-hot vectors. 29 | Args: 30 | labels_dense: dense label 31 | num_classes: the number of classes in one hot label 32 | dtype: data type 33 | Return: 34 | labels_one_hot: one hot label 35 | """ 36 | num_labels = labels_dense.shape[0] 37 | index_offset = np.arange(num_labels) * num_classes 38 | labels_one_hot = np.zeros((num_labels, num_classes)) 39 | labels_one_hot.flat[index_offset + labels_dense.ravel().astype(dtype)] = 1 40 | return labels_one_hot 41 | 42 | 43 | class Dataset(object): 44 | """Base dataset class 45 | """ 46 | def __init__(self, size, is_shuffle=False, fold=10): 47 | """Constructor, create a dataset container. 48 | Args: 49 | size: the number of samples 50 | is_shuffle: whether to shuffle samples when the dataset is created 51 | fold: how many folds to split samples into 52 | Return: 53 | None 54 | """ 55 | self.size = size 56 | self.perm = np.array(range(self.size)) 57 | if is_shuffle: 58 | random.shuffle(self.perm) 59 | 60 | self.train_size = int(self.size * (1.0 - 1.0 / fold)) 61 | self.train_perm = self.perm[range(self.train_size)] 62 | self.train_begin = 0 63 | self.train_end = 0 64 | 65 | self.test_perm = self.perm[range(self.train_size, self.size)] 66 | 67 | def generate_perm_for_train_batch(self, batch_size): 68 | """Create the permutation for a batch of train samples 69 | Args: 70 | batch_size: the number of samples in the batch 71 | Return: 72 | perm: the permutation of samples which form a batch 73 | """ 74 | self.train_begin = self.train_end 75 | self.train_end += batch_size 76 | if self.train_end > self.train_size: 77 | random.shuffle(self.train_perm) 78 | self.train_begin = 0 79 | self.train_end = batch_size 80 | perm = self.train_perm[self.train_begin: self.train_end] 81 | return perm 82 | 83 | 84 | class PosDataset(Dataset): 85 | """Positive dataset class 86 | """ 87 | def __init__(self, target, one_hot=True, dtype=np.float32): 88 | """Create a positive dataset for a protein kinase target. 89 | The data is read from hdf5 files.
90 | Args: 91 | target: the protein kinase target name, also the name of the hdf5 file 92 | one_hot: whether to convert labels from dense to one_hot 93 | dtype: data type of features 94 | Return: 95 | None 96 | """ 97 | # open h5 file 98 | self.h5_fn = os.path.join(h5_dir, target + ".h5") 99 | self.h5 = h5py.File(self.h5_fn, "r") 100 | # read ids 101 | self.ids = self.h5["chembl_id"].value 102 | # read fingerprints as the feature (only ap is used; mg and tt are commented out) 103 | ap = sparse.csr_matrix((self.h5["ap"]["data"], self.h5["ap"]["indices"], self.h5["ap"]["indptr"]), shape=[len(self.h5["ap"]["indptr"]) - 1, vec_len]) 104 | #mg = sparse.csr_matrix((self.h5["mg"]["data"], self.h5["mg"]["indices"], self.h5["mg"]["indptr"]), shape=[len(self.h5["mg"]["indptr"]) - 1, vec_len]) 105 | #tt = sparse.csr_matrix((self.h5["tt"]["data"], self.h5["tt"]["indices"], self.h5["tt"]["indptr"]), shape=[len(self.h5["tt"]["indptr"]) - 1, vec_len]) 106 | #self.features = sparse.hstack([ap, mg, tt]).toarray() 107 | self.features = ap.toarray() 108 | # label 109 | self.labels = self.h5["label"].value 110 | if one_hot == True: 111 | self.labels = dense_to_one_hot(self.labels) 112 | # year 113 | if "year" in self.h5.keys(): 114 | self.years = self.h5["year"].value 115 | else: 116 | self.years = None 117 | # close h5 file 118 | self.h5.close() 119 | # dtype 120 | self.dtype = dtype 121 | # pre_process 122 | #self.features = np.log10(1.0 + self.features).astype(self.dtype) 123 | self.features = np.clip(self.features, 0, 1).astype(self.dtype) 124 | # 125 | Dataset.__init__(self, self.features.shape[0]) 126 | 127 | 128 | def next_train_batch(self, batch_size): 129 | """Generate the next batch of samples 130 | Args: 131 | batch_size: the number of samples in the batch 132 | Return: 133 | A tuple of features and labels of the samples in the batch 134 | """ 135 | perm = self.generate_perm_for_train_batch(batch_size) 136 | return self.features[perm], self.labels[perm] 137 | 138 | 139 | class NegDataset(Dataset): 140 | """Negative dataset class 141 | """ 142 | def __init__(self, target_list, one_hot=True, dtype=np.float32): 143 | """Create a negative dataset shared by all protein kinase targets. 144 | The data is read from a hdf5 file, pubchem_neg_sample.h5. 145 | Note that for each target, these samples have the corresponding labels, 146 | and I use a mask_dict to store these labels, i.e.
mask_dict[target] = labels for target 147 | Args: 148 | target_list: the list of protein kinase targets 149 | one_hot: whether to convert labels from dense to one_hot 150 | dtype: data type of features 151 | Return: 152 | None 153 | """ 154 | # open h5 file 155 | self.h5_fn = os.path.join(h5_dir, "pubchem_neg_sample.h5") 156 | self.h5 = h5py.File(self.h5_fn, "r") 157 | # read ids 158 | self.ids = self.h5["chembl_id"].value 159 | # read fingerprints as the feature (only ap is used; mg and tt are commented out) 160 | ap = sparse.csr_matrix((self.h5["ap"]["data"], self.h5["ap"]["indices"], self.h5["ap"]["indptr"]), shape=[len(self.h5["ap"]["indptr"]) - 1, vec_len]) 161 | #mg = sparse.csr_matrix((self.h5["mg"]["data"], self.h5["mg"]["indices"], self.h5["mg"]["indptr"]), shape=[len(self.h5["mg"]["indptr"]) - 1, vec_len]) 162 | #tt = sparse.csr_matrix((self.h5["tt"]["data"], self.h5["tt"]["indices"], self.h5["tt"]["indptr"]), shape=[len(self.h5["tt"]["indptr"]) - 1, vec_len]) 163 | #self.features = sparse.hstack([ap, mg, tt]).toarray() 164 | self.features = ap.toarray() 165 | # label (mask) 166 | self.mask_dict = {} 167 | for target in target_list: 168 | #mask = self.h5["mask"][target].value 169 | mask = self.h5["cliped_mask"][target].value 170 | if one_hot == True: 171 | self.mask_dict[target] = dense_to_one_hot(mask) 172 | else: 173 | self.mask_dict[target] = mask 174 | # close h5 file 175 | self.h5.close() 176 | # dtype 177 | self.dtype = dtype 178 | # pre_process 179 | #self.features = np.log10(1.0 + self.features).astype(self.dtype) 180 | self.features = np.clip(self.features, 0, 1).astype(self.dtype) 181 | # 182 | Dataset.__init__(self, self.features.shape[0]) 183 | 184 | def next_train_batch(self, target, batch_size): 185 | """Generate the next batch of samples 186 | Args: 187 | target: the target whose mask supplies the labels; batch_size: the number of samples in the batch 188 | Return: 189 | A tuple of features and labels of the samples in the batch 190 | """ 191 | perm = self.generate_perm_for_train_batch(batch_size) 192 | return self.features[perm], self.mask_dict[target][perm] 193 | 194 | 195 | class Datasets(object): 196 | """dataset class, contains several positive datasets and one negative dataset.
197 | """ 198 | def __init__(self, target_list, one_hot=True): 199 | """ 200 | Args: 201 | target_list: the protein kinase targets' list 202 | one_hot: whether to convert labels from dense to one_hot 203 | return: 204 | None 205 | """ 206 | # read neg dataset 207 | self.neg = NegDataset(target_list, one_hot=one_hot) 208 | # read pos datasets 209 | self.pos = {} 210 | for target in target_list: 211 | self.pos[target] = PosDataset(target, one_hot=one_hot) 212 | 213 | def next_train_batch(self, target, pos_batch_size, neg_batch_size): 214 | """Generate the next batch of samples 215 | Args: 216 | target: the positive target name 217 | pos_batch_size: the number of samples in the batch from positive target dataset 218 | neg_batch_size: the number of samples in the batch from negative target dataset 219 | Return: 220 | A tuple of features and labels of the samples in the batch 221 | """ 222 | pos_feature_batch, pos_label_batch = self.pos[target].next_train_batch(pos_batch_size) 223 | neg_feature_batch, neg_label_batch = self.neg.next_train_batch(target, neg_batch_size) 224 | return np.vstack([pos_feature_batch, neg_feature_batch]), np.vstack([pos_label_batch, neg_label_batch]) 225 | 226 | 227 | def test_dataset(): 228 | """A simple test 229 | """ 230 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 231 | d = Datasets(target_list) 232 | print("test for batching") 233 | print("batch_num target feature_min feature_max label_min label_max") 234 | for step in range(2 * 500): 235 | for target in target_list: 236 | compds_batch, labels_batch = d.next_train_batch(target, 128, 128) 237 | if np.isnan(compds_batch).sum() > 0: 238 | print("warning: nan in feature"), 239 | print("%9d %10s %11.2f %11.2f %9.2f %9.2f" % (step, target, compds_batch.min(), compds_batch.max(), labels_batch.min(), labels_batch.max())) 240 | if (step % 500) == 0: 241 | print("%9d %10s %11.2f %11.2f %9.2f %9.2f" % (step, target, compds_batch.min(), compds_batch.max(), labels_batch.min(), labels_batch.max())) 242 | 243 | # from data_files/fp_2_code.py 244 | def read_fp(filename, dtype=int): 245 | """ read fingerprint from file 246 | Args: 247 | filename: 248 | Return: 249 | chembl_id_list: , a list of str 250 | fps_list: , a list of dict. 251 | """ 252 | chembl_id_list = [] 253 | fps_list = [] 254 | infile = open(filename, "r") 255 | line_num = 0 256 | for line in infile: 257 | line_num += 1 258 | chembl_id = line.split("\t")[0].strip() 259 | fps_str = line.split("\t")[1].strip() 260 | fps = {} 261 | fps_str = fps_str[1:-1].split(",") 262 | for fp in fps_str: 263 | if ":" in fp: 264 | k, v = fp.split(":") 265 | k = dtype(k.strip()) 266 | v = dtype(v.strip()) 267 | assert k not in fps.keys(), ("error in fp_file %s at line %d: dict's keys duplicated" % (filename, line_num)) 268 | fps[k] = v 269 | chembl_id_list.append(chembl_id) 270 | fps_list.append(fps) 271 | infile.close() 272 | return chembl_id_list, fps_list 273 | 274 | 275 | class Dataset_reg(object): 276 | def __init__(self, target, train_year_up_limit = 2013): 277 | """ 278 | """ 279 | fp_dir = "../data_files/fp_files" 280 | # all apfps that were picked out. 
281 | apfp_picked_fn = os.path.join(fp_dir, target + "_apfp.picked_all") 282 | self.apfp_picked_all = list(np.genfromtxt(apfp_picked_fn, dtype=str)) 283 | self.apfp_picked_all.sort() 284 | # read response 285 | response_df = pd.read_csv(os.path.join(fp_dir, target + ".response"), delimiter="\t", names=["CHEMBL_ID", "YEAR", "LABEL", "TYPE", "RELATION", "VALUE"], index_col=0) 286 | # read apfp as features 287 | apfp_fn = os.path.join(fp_dir, target + ".apfp") 288 | id_list, apfps_list = read_fp(apfp_fn, dtype=str) 289 | features_df = pd.DataFrame(index=id_list, data=apfps_list, columns=self.apfp_picked_all, dtype=float) 290 | # merge response and features 291 | df = pd.concat([response_df, features_df], axis=1) 292 | # pick out records with explicit values 293 | df = df[df["RELATION"] == "="] 294 | df = df[["YEAR", "VALUE"] + self.apfp_picked_all] 295 | # remove duplicates, keep the mean "VALUE" and mean "YEAR". 296 | df.reset_index(drop=False, inplace=True) 297 | df = df.fillna(0).groupby(by=["CHEMBL_ID"]).mean() 298 | # apply log to "VALUE" and the features 299 | df["LOG_VALUE"] = np.log(df["VALUE"]) 300 | df[self.apfp_picked_all] = np.log(1 + df[self.apfp_picked_all]) 301 | 302 | self.df = df 303 | 304 | # batch related 305 | mask = self.df["YEAR"] <= train_year_up_limit 306 | self.tr_ids = self.df.index[mask].values 307 | self.te_ids = self.df.index[~mask].values 308 | 309 | self.tr_size = self.tr_ids.shape[0] 310 | self.tr_begin = 0 311 | self.tr_end = 0 312 | 313 | 314 | def next_batch(self, batch_size): 315 | """ 316 | """ 317 | self.tr_begin = self.tr_end 318 | self.tr_end += batch_size 319 | if self.tr_end > self.tr_size: 320 | random.shuffle(self.tr_ids) 321 | self.tr_begin = 0 322 | self.tr_end = batch_size 323 | batch = self.df.ix[self.tr_ids[self.tr_begin: self.tr_end]] 324 | return batch[self.apfp_picked_all].values, batch["LOG_VALUE"].values 325 | 326 | def test_batch(self): 327 | batch = self.df.ix[self.te_ids] 328 | return batch[self.apfp_picked_all].values, batch["LOG_VALUE"].values 329 | 330 | def train_batch(self): 331 | batch = self.df.ix[self.tr_ids] 332 | return batch[self.apfp_picked_all].values, batch["LOG_VALUE"].values 333 | 334 | 335 | 336 | if __name__ == "__main__": 337 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 338 | test_dataset() 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | -------------------------------------------------------------------------------- /data_files/chembl_input.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Nov 2016 3 | # Time Last Updated: Dec 2016 4 | # Addr: Shenzhen, China 5 | # Description: dataset classes and input utilities for the chembl data 6 | 7 | import os 8 | import getpass 9 | import numpy as np 10 | import pandas as pd 11 | from scipy import sparse 12 | from collections import defaultdict 13 | 14 | data_dir = "/home/%s/Documents/chembl/data_files/" % getpass.getuser() 15 | fp_dir = os.path.join(data_dir, "fp_files") 16 | mask_dir = os.path.join(data_dir, "mask_files") 17 | structure_dir = os.path.join(data_dir, "structure_files") 18 | 19 | # the newly picked out 15 targets, including 9 targets from 5 big groups and 6 targets from others.
20 | target_list = ["CHEMBL279", "CHEMBL203", # Protein Kinases 21 | "CHEMBL217", "CHEMBL253", # GPCRs (Family A) 22 | "CHEMBL235", "CHEMBL206", # Nuclear Hormone Receptors 23 | "CHEMBL240", "CHEMBL4296", # Voltage Gated Ion Channels 24 | "CHEMBL4805", # Ligand Gated Ion Channels 25 | "CHEMBL204", "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 26 | ] 27 | 28 | 29 | def dense_to_one_hot(labels_dense, num_classes=2, dtype=np.int): 30 | """Convert class labels from scalars to one-hot vectors. 31 | Args: 32 | labels_dense: dense label 33 | num_classes: the number of classes in one hot label 34 | dtype: data type 35 | Return: 36 | labels_one_hot: one hot label 37 | """ 38 | num_labels = labels_dense.shape[0] 39 | index_offset = np.arange(num_labels) * num_classes 40 | labels_one_hot = np.zeros((num_labels, num_classes)) 41 | labels_one_hot.flat[index_offset + labels_dense.ravel().astype(dtype)] = 1 42 | return labels_one_hot 43 | 44 | 45 | def sparse_features(fps_list, target_columns_dict, num_features, is_log=True): 46 | """construct a sparse matrix (csr_matrix) for features according to target_columns_dict. 47 | Args: 48 | fps_list: a list of apfps for the molecules 49 | target_columns_dict, num_features: define the feature columns; is_log: flag whether to apply np.log to the data, default is True 50 | Return: 51 | features: the sparse matrix of features 52 | """ 53 | data = [] 54 | indices = [] 55 | indptr = [0] 56 | for fps_str in fps_list: 57 | n = indptr[-1] 58 | for fp in fps_str[1:-1].split(","): 59 | if ":" in fp: 60 | k, v = fp.split(":") 61 | indices.append(target_columns_dict[int(k)]) 62 | data.append(int(v)) 63 | n += 1 64 | indptr.append(n) 65 | data = np.array(data) 66 | if is_log: 67 | data = np.log(data).astype(np.float32) 68 | # here we add one to num_features, because any apfp not found in target_apfp_picked will be mapped 69 | # to the last column of the features matrix, though the last column will not be used ultimately. 70 | features = sparse.csr_matrix((data, indices, indptr), shape=(len(fps_list), num_features + 1)) 71 | return features 72 | 73 | 74 | class DatasetBase(object): 75 | def __init__(self, target): 76 | # read count and the apfps that were picked out 77 | counts = np.genfromtxt(mask_dir + "/%s_apfp.count" % target, delimiter="\t", dtype=int) 78 | self.target_apfp_picked = counts[counts[:, 1] > 10][:, 0] 79 | self.target_apfp_picked.sort() 80 | self.num_features = len(self.target_apfp_picked) 81 | # columns and sparse features 82 | # here we use a defaultdict, where any apfp not found in target_apfp_picked will be mapped 83 | # to the last column of the features matrix, though the last column will not be used ultimately.
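# note: this mirrors the columns_dict trick in chembl_cal_mask.py: every picked
# apfp gets a column index below, any unknown apfp falls through to column
# num_features, and every caller discards that overflow column with a [:, :-1]
# slice.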
84 | self.target_columns_dict = defaultdict(lambda : self.num_features) 85 | for i, apfp in enumerate(self.target_apfp_picked): 86 | self.target_columns_dict[apfp] = i 87 | 88 | def batch_generator_base(self, size, batch_size): 89 | begin = 0 90 | end = 0 91 | while True: 92 | begin = end 93 | if begin >= size: 94 | raise StopIteration() 95 | end += batch_size 96 | if end > size: 97 | end = size 98 | yield begin, end 99 | 100 | 101 | class DatasetTarget(DatasetBase): 102 | def __init__(self, target, year_split=2014): 103 | DatasetBase.__init__(self, target) 104 | # read chembl id and apfp 105 | self.chembl_id = [] 106 | self.chembl_apfp = {} 107 | f = open(fp_dir + "/chembl.apfp", "r") 108 | for line in f: 109 | id_, fps_str = line.split("\t") 110 | id_ = id_.strip() 111 | fps_str = fps_str.strip() 112 | self.chembl_id.append(id_) 113 | self.chembl_apfp[id_] = fps_str 114 | f.close() 115 | # read top 79 targets' label data, and get the specific target's label data 116 | clf_label_79 = pd.read_csv(structure_dir + "/chembl_top79.label", usecols=[0, 2, 3, 4], delimiter="\t") 117 | self.target_clf_label = clf_label_79[clf_label_79["TARGET_CHEMBLID"] == target] 118 | # remove compounds whose apfp cannot be calculated 119 | m = self.target_clf_label["CMPD_CHEMBLID"].isin(self.chembl_id) 120 | self.target_clf_label = self.target_clf_label[m.values] 121 | # time split 122 | time_mask = self.target_clf_label["YEAR"] > year_split 123 | time_split_train = self.target_clf_label[~time_mask] 124 | time_split_test = self.target_clf_label[time_mask] 125 | # ids 126 | self.target_ids_train = time_split_train["CMPD_CHEMBLID"].values 127 | self.target_ids_test = time_split_test["CMPD_CHEMBLID"].values 128 | # features 129 | self.target_features_train = sparse_features([self.chembl_apfp[k] for k in self.target_ids_train], self.target_columns_dict, self.num_features)[:, :-1] 130 | self.target_features_test = sparse_features([self.chembl_apfp[k] for k in self.target_ids_test], self.target_columns_dict, self.num_features)[:, :-1] 131 | # labels 132 | self.target_labels_train = (time_split_train["CLF_LABEL"] > 0).astype(int).values 133 | self.target_labels_test = (time_split_test["CLF_LABEL"] > 0).astype(int).values 134 | 135 | 136 | class DatasetCNS(DatasetTarget): 137 | def __init__(self, target, year_split=2014): 138 | DatasetTarget.__init__(self, target, year_split=year_split) 139 | # read mask 140 | self.cns_mask = pd.Series.from_csv(mask_dir + "/%s_cns.mask" % target, header=None, sep="\t") 141 | # features 142 | self.cns_features = sparse_features([self.chembl_apfp[k] for k in self.chembl_id], self.target_columns_dict, self.num_features)[:, :-1] 143 | # exclude the test-set compounds from the cns train split 144 | m = self.cns_mask.index.isin(self.target_ids_test) 145 | self.cns_features_train = self.cns_features[~m] 146 | self.cns_mask_train = self.cns_mask[~m] 147 | 148 | def batch_generator_cns(self, batch_size): 149 | for begin, end in self.batch_generator_base(self.cns_features.shape[0], batch_size): 150 | ids = self.chembl_id[begin: end] 151 | features = self.cns_features[begin: end].toarray() 152 | mask = self.cns_mask[begin: end].values 153 | yield ids, features, mask 154 | 155 | 156 | class DatasetPNS(DatasetBase): 157 | def __init__(self, target): 158 | DatasetBase.__init__(self, target) 159 | # read pns apfp 160 | self.pns_id = [] 161 | self.pns_apfp = {} 162 | f = open(fp_dir + "/pubchem_neg_sample.apfp", "r") 163 | for line in f: 164 | id_, fps_str = line.split("\t") 165 | id_ = id_.strip() 166 | fps_str = fps_str.strip() 167 |
self.pns_id.append(id_) 168 | self.pns_apfp[id_] = fps_str 169 | f.close() 170 | # read mask 171 | self.pns_mask = pd.Series.from_csv(mask_dir + "/%s_pns.mask" % target, header=None, sep="\t") 172 | # features 173 | self.pns_features = sparse_features([self.pns_apfp[k] for k in self.pns_id], self.target_columns_dict, self.num_features)[:, :-1] 174 | 175 | def batch_generator_pns(self, batch_size): 176 | for begin, end in self.batch_generator_base(self.pns_features.shape[0], batch_size): 177 | ids = self.pns_id[begin: end] 178 | features = self.pns_features[begin: end].toarray() 179 | mask = self.pns_mask[begin: end].values 180 | yield ids, features, mask 181 | 182 | 183 | class Dataset(DatasetCNS, DatasetPNS): 184 | """Base dataset class for chembl inhibitors 185 | """ 186 | def __init__(self, target, one_hot=True, is_shuffle_train=True, train_pos_multiply=0): 187 | """Constructor, create a dataset container. 188 | Args: 189 | target: the chemblid of the target, e.g. "CHEMBL203". 190 | one_hot: flag whether to create one_hot labels, default is True. 191 | is_shuffle_train: flag whether to shuffle the train samples when the dataset is created. 192 | train_pos_multiply: how many extra times the cns train positives are appended to the train set. 193 | (note: the time split year is fixed at 2014 here; a molecule with year > 2014 goes into the test data, 194 | otherwise it goes into the train data.) 195 | Return: 196 | None 197 | """ 198 | DatasetCNS.__init__(self, target, year_split=2014) 199 | DatasetPNS.__init__(self, target) 200 | # cns train pos 201 | self.cns_features_train_pos = self.cns_features_train[self.cns_mask_train.values] 202 | self.cns_mask_train_pos = self.cns_mask_train[self.cns_mask_train.values] 203 | # train set: if train_pos_multiply > 0, cns_train_pos is appended an extra train_pos_multiply times. 204 | tf_list = [self.cns_features_train, self.pns_features] 205 | tl_list = [self.cns_mask_train, self.pns_mask] 206 | for _ in range(train_pos_multiply): 207 | tf_list.append(self.cns_features_train_pos) 208 | tl_list.append(self.cns_mask_train_pos) 209 | self.train_features = sparse.vstack(tf_list) 210 | self.train_labels = np.hstack(tl_list).astype(int) 211 | # test 212 | self.test_features = self.target_features_test 213 | self.test_labels = self.target_labels_test 214 | # one_hot 215 | if one_hot: 216 | self.train_labels_one_hot = dense_to_one_hot(self.train_labels) 217 | self.test_labels_one_hot = dense_to_one_hot(self.test_labels) 218 | # batch related 219 | self.train_size = self.train_features.shape[0] # (954049, 9412) 220 | self.train_perm = np.arange(self.train_size) 221 | if is_shuffle_train: 222 | np.random.shuffle(self.train_perm) 223 | self.train_begin = 0 224 | self.train_end = 0 225 | 226 | 227 | def generate_perm_for_train_batch(self, batch_size): 228 | """Create the permutation for a batch of train samples 229 | Args: 230 | batch_size: the number of samples in the batch 231 | Return: 232 | perm: the permutation of samples which form a batch 233 | """ 234 | self.train_begin = self.train_end 235 | self.train_end += batch_size 236 | if self.train_end > self.train_size: 237 | np.random.shuffle(self.train_perm) 238 | self.train_begin = 0 239 | self.train_end = batch_size 240 | perm = self.train_perm[self.train_begin: self.train_end] 241 | return perm 242 | 243 | def generate_train_batch(self, batch_size): 244 | perm = self.generate_perm_for_train_batch(batch_size) 245 | return self.train_features[perm].toarray().astype(np.float32), self.train_labels_one_hot[perm] 246 | 247 | def reset_begin_end(self): 248 | self.train_begin = 0 249 | self.train_end
= 0 250 | 251 | def generate_train_batch_once(self, batch_size): 252 | self.train_begin = self.train_end 253 | self.train_end += batch_size 254 | if self.train_end > self.train_size: 255 | self.train_end = self.train_size 256 | perm = self.train_perm[self.train_begin: self.train_end] 257 | return self.train_features[perm].toarray().astype(np.float32), self.train_labels_one_hot[perm] 258 | 259 | 260 | 261 | # dataset for virtual screening (vs) 262 | class DatasetVS(DatasetBase): 263 | def __init__(self, target): 264 | DatasetBase.__init__(self, target) 265 | 266 | def reset(self, fp_fn): 267 | # read pubchem ids and apfps 268 | self.pubchem_id = [] 269 | self.pubchem_apfp = {} 270 | f = open(fp_fn, "r") 271 | for line in f: 272 | id_, fps_str = line.split("\t") 273 | id_ = id_.strip() 274 | fps_str = fps_str.strip() 275 | self.pubchem_id.append(id_) 276 | self.pubchem_apfp[id_] = fps_str 277 | f.close() 278 | # generate features 279 | self.features = sparse_features([self.pubchem_apfp[k] for k in self.pubchem_id], self.target_columns_dict, self.num_features)[:, :-1] 280 | self.features_dense = self.features.toarray() 281 | 282 | 283 | class DatasetChemDiv(DatasetBase): 284 | def __init__(self, target): 285 | DatasetBase.__init__(self, target) 286 | # read ids and apfps 287 | ChemDiv_dir = "/raid/xiaotaw/ChemDiv" 288 | fn_list = ["DC01_350000.apfp", "DC02_350000.apfp", "DC03_222773.apfp", "DC_saltdata_not-available_124145.apfp", "IC_non-excl_82693.apfp", "NC_340320.apfp"] 289 | self.chemdiv_ids = [] 290 | self.chemdiv_apfps = {} 291 | for fn in fn_list: 292 | f = open(ChemDiv_dir + "/" + fn, "r") 293 | for line in f: 294 | id_, fps_str = line.split("\t") 295 | id_ = id_.strip() 296 | fps_str = fps_str.strip() 297 | self.chemdiv_ids.append(id_) 298 | self.chemdiv_apfps[id_] = fps_str 299 | f.close() 300 | # batch related 301 | self.begin = 0 302 | self.end = 0 303 | self.size = len(self.chemdiv_ids) 304 | 305 | def generate_batch(self, batch_size): 306 | self.begin = self.end 307 | if self.begin >= self.size: 308 | raise StopIteration() # all samples have been consumed 309 | self.end += batch_size 310 | if self.end > self.size: 311 | self.end = self.size 312 | ids = self.chemdiv_ids[self.begin: self.end] 313 | apfp_list = [self.chemdiv_apfps[k] for k in ids] 314 | features = sparse_features(apfp_list, self.target_columns_dict, self.num_features)[:, :-1] 315 | return ids, features 316 | 317 | def batch_generator_chemdiv(self, batch_size): 318 | for begin, end in self.batch_generator_base(self.size, batch_size): 319 | ids = self.chemdiv_ids[begin: end] 320 | apfp_list = [self.chemdiv_apfps[k] for k in ids] 321 | features = sparse_features(apfp_list, self.target_columns_dict, self.num_features)[:, :-1].toarray() 322 | yield ids, features 323 | 324 | def compute_performance(label, prediction): 325 | """Compute sensitivity(SEN), specificity(SPE), accuracy(ACC) and matthews correlation coefficient(MCC) from dense 0/1 label and prediction vectors. 326 | """ 327 | assert label.shape[0] == prediction.shape[0], "label number should be equal to prediction number" 328 | N = label.shape[0] 329 | APP = sum(prediction) 330 | ATP = sum(label) 331 | TP = sum(prediction * label) 332 | FP = APP - TP 333 | FN = ATP - TP 334 | TN = N - TP - FP - FN 335 | SEN = float(TP) / (ATP) if ATP != 0 else np.nan 336 | SPE = float(TN) / (N - ATP) if N != ATP else np.nan # guard against a label set without negatives 337 | ACC = float(TP + TN) / N 338 | MCC = (TP * TN - FP * FN) / (np.sqrt(long(N - APP) * long(N - ATP) * APP * ATP)) if not (N - APP) * (N - ATP) * APP * ATP == 0 else 0.0 # long() avoids integer overflow (Python 2) 339 | return TP, TN, FP, FN, SEN, SPE, ACC, MCC 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 
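A minimal usage sketch of the classes above, for orientation (not part of the original chembl_input.py): it assumes a scikit-learn-style classifier as a stand-in for the repo's models, and that labels and predictions are the dense 0/1 vectors compute_performance expects; the model choice and variable names are illustrative.

# usage sketch (illustrative only; Python 2, matching the repo)
import numpy as np
from sklearn.ensemble import RandomForestClassifier

import chembl_input as ci

# assemble train/test data for one target, appending the positives two extra times
d = ci.Dataset("CHEMBL203", one_hot=False, is_shuffle_train=True, train_pos_multiply=2)

clf = RandomForestClassifier(n_estimators=100)  # hypothetical model choice
clf.fit(d.train_features, d.train_labels)       # sparse csr features are accepted here

pred = clf.predict(d.test_features)
TP, TN, FP, FN, SEN, SPE, ACC, MCC = ci.compute_performance(d.test_labels, pred)
print "SEN=%.3f SPE=%.3f ACC=%.3f MCC=%.3f" % (SEN, SPE, ACC, MCC)

# for the DNN path, dense float32 minibatches with one-hot labels:
# d = ci.Dataset("CHEMBL203", one_hot=True)
# batch_features, batch_labels = d.generate_train_batch(128)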
-------------------------------------------------------------------------------- /data_files/3_chembl_analyse_fp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from scipy import sparse\n", 13 | "from collections import Counter\n", 14 | "from collections import defaultdict\n", 15 | "from matplotlib import pyplot as plt\n", 16 | "%matplotlib inline" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## calculate counts for apfp" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "pns_apfp = pd.Series.from_csv(\"fp_files/pns_apfp.csv\", sep=\"\\t\", header=0) \n", 33 | "\n", 34 | "pns_apfp_counter = Counter()\n", 35 | "for apfp_str in pns_apfp:\n", 36 | "    apfp = json.loads(apfp_str)\n", 37 | "    pns_apfp_counter.update(apfp.keys())\n", 38 | "    \n", 39 | "pns_apfp_count = pd.Series(pns_apfp_counter)\n", 40 | "pns_apfp_count.index.name = \"APFP\"\n", 41 | "pns_apfp_count.name = \"COUNT\"\n", 42 | "pns_apfp_count.to_csv(\"fp_files/pns_apfp_count.csv\", header=True)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "cancer_approved_target = [\"CHEMBL279\", \"CHEMBL203\", \"CHEMBL333\", \"CHEMBL325\", \"CHEMBL267\", \"CHEMBL2842\"]\n", 63 | "cancer_clinical_target = [\"CHEMBL340\", \"CHEMBL4005\", \"CHEMBL332\"]\n", 64 | "target_list = cancer_approved_target + cancer_clinical_target" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "inh_apfp = pd.Series.from_csv(\"fp_files/inhibitor_apfp.csv\", sep=\"\\t\", header=0)\n", 76 | "\n", 77 | "for target in target_list:\n", 78 | "    clf_label = pd.read_csv(\"chembl_source/%s_clf_label.csv\" % target)\n", 79 | "    target_apfp = inh_apfp.loc[clf_label[\"CMPD_CHEMBLID\"].values]\n", 80 | "    target_apfp_counter = Counter()\n", 81 | "    for apfp_str in target_apfp:\n", 82 | "        apfp = json.loads(apfp_str)\n", 83 | "        target_apfp_counter.update(apfp.keys())\n", 84 | "    target_apfp_count = pd.Series(target_apfp_counter)\n", 85 | "    target_apfp_count.index.name = \"APFP\"\n", 86 | "    target_apfp_count.name = \"COUNT\"\n", 87 | "    target_apfp_count.to_csv(\"fp_files/%s_apfp_count.csv\" % target, header=True)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "source": [ 96 | "## pick a threshold for the minimum count of apfp" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "cancer_approved_target = [\"CHEMBL279\", \"CHEMBL203\", \"CHEMBL333\", \"CHEMBL325\", \"CHEMBL267\", \"CHEMBL2842\"]\n", 108 | "cancer_clinical_target = [\"CHEMBL340\", \"CHEMBL4005\", \"CHEMBL332\"]\n", 109 | "target_list = cancer_approved_target + cancer_clinical_target" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | 
"metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "inh_apfp = pd.Series.from_csv(\"fp_files/inhibitor_apfp.csv\", sep=\"\\t\", header=0)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 7, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "pns_count = pd.Series.from_csv(\"fp_files/pns_apfp_count.csv\", header=0)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 8, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "def sparse_features(fps_series, target_apfp_picked):\n", 143 | " columns_dict = defaultdict(lambda : len(target_apfp_picked))\n", 144 | " for i, apfp in enumerate(target_apfp_picked):\n", 145 | " columns_dict[apfp] = i\n", 146 | " data = []\n", 147 | " indices = []\n", 148 | " indptr = [0]\n", 149 | " for _, fps in fps_series.iteritems():\n", 150 | " n = indptr[-1]\n", 151 | " for k, v in fps.items():\n", 152 | " indices.append(columns_dict[k])\n", 153 | " data.append(v)\n", 154 | " n += 1\n", 155 | " indptr.append(n)\n", 156 | " a = sparse.csr_matrix((np.array(data), indices, indptr), shape=(len(fps_series), len(target_apfp_picked) + 1))\n", 157 | " return a" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 9, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "target = \"CHEMBL279\"" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 10, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "target_clf_label = pd.read_csv(\"chembl_source/%s_clf_label.csv\" % target)\n", 180 | "target_apfp_str = inh_apfp.loc[target_clf_label[\"CMPD_CHEMBLID\"].values]\n", 181 | "target_apfp = target_apfp_str.apply(json.loads)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 20, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "target_count = pd.Series.from_csv(\"fp_files/%s_apfp_count.csv\" % target, header=0)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": true 200 | }, 201 | "outputs": [], 202 | "source": [] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 22, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "count_threshold = 50" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 24, 218 | "metadata": { 219 | "scrolled": true 220 | }, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "10\n", 227 | "(21160,) 11504\n", 228 | "CHEMBL279 168\n", 229 | "0.9 52006.0\n", 230 | "0.95 11212.0\n", 231 | "0.99 149.0\n", 232 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 233 | "\n", 234 | "\n", 235 | "30\n", 236 | "(21160,) 8803\n", 237 | "CHEMBL279 161\n", 238 | "0.9 51948.0\n", 239 | "0.95 11216.0\n", 240 | "0.99 147.0\n", 241 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 242 | "\n", 243 | "\n", 244 | "50\n", 245 | "(21160,) 7661\n", 246 | "CHEMBL279 178\n", 247 | "0.9 52027.0\n", 248 | "0.95 11238.0\n", 249 | "0.99 147.0\n", 250 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 251 | "\n", 252 | "\n", 253 | "70\n", 254 | "(21160,) 6916\n", 255 | "CHEMBL279 158\n", 256 | "0.9 52269.0\n", 257 | "0.95 11286.0\n", 258 | "0.99 148.0\n", 259 | "('CHEMBL511563', 
1.0) ('CHEMBL502351', -1.0)\n", 260 | "\n", 261 | "\n", 262 | "90\n", 263 | "(21160,) 6363\n", 264 | "CHEMBL279 152\n", 265 | "0.9 52449.0\n", 266 | "0.95 11347.0\n", 267 | "0.99 148.0\n", 268 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 269 | "\n", 270 | "\n", 271 | "110\n", 272 | "(21160,) 5927\n", 273 | "CHEMBL279 148\n", 274 | "0.9 52492.0\n", 275 | "0.95 11352.0\n", 276 | "0.99 149.0\n", 277 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 278 | "\n", 279 | "\n", 280 | "130\n", 281 | "(21160,) 5583\n", 282 | "CHEMBL279 145\n", 283 | "0.9 52548.0\n", 284 | "0.95 11373.0\n", 285 | "0.99 152.0\n", 286 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 287 | "\n", 288 | "\n", 289 | "150\n", 290 | "(21160,) 5310\n", 291 | "CHEMBL279 141\n", 292 | "0.9 52617.0\n", 293 | "0.95 11384.0\n", 294 | "0.99 152.0\n", 295 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 296 | "\n", 297 | "\n", 298 | "170\n", 299 | "(21160,) 5093\n", 300 | "CHEMBL279 126\n", 301 | "0.9 52722.0\n", 302 | "0.95 11433.0\n", 303 | "0.99 152.0\n", 304 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 305 | "\n", 306 | "\n", 307 | "190\n", 308 | "(21160,) 4893\n", 309 | "CHEMBL279 117\n", 310 | "0.9 52875.0\n", 311 | "0.95 11479.0\n", 312 | "0.99 157.0\n", 313 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 314 | "\n", 315 | "\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "for count_threshold in range(10, 200, 20):\n", 321 | " print count_threshold\n", 322 | " pns_m = pns_count > count_threshold\n", 323 | " print pns_m.shape, pns_m.sum()\n", 324 | "\n", 325 | " count = target_count.add(pns_count, fill_value=0)\n", 326 | " m = count > count_threshold\n", 327 | " picked = count.loc[m]\n", 328 | " print target, picked.shape[0] - pns_m.sum()\n", 329 | " target_apfp_picked = picked.index.astype(str)\n", 330 | "\n", 331 | " a = sparse_features(target_apfp, target_apfp_picked)\n", 332 | "\n", 333 | " aa = a.toarray()[:, :-1]\n", 334 | "\n", 335 | " b = np.corrcoef(aa)\n", 336 | "\n", 337 | " c = (abs(b) > 0.9).astype(int) - np.eye(a.shape[0], dtype=int)\n", 338 | " print 0.9, c.sum() / 2.0\n", 339 | " c = (abs(b) > 0.95).astype(int) - np.eye(a.shape[0], dtype=int)\n", 340 | " print 0.95, c.sum() / 2.0\n", 341 | " c = (abs(b) > 0.99).astype(int) - np.eye(a.shape[0], dtype=int)\n", 342 | " print 0.99, c.sum() / 2.0\n", 343 | " c = (abs(b) > 0.999999).astype(int) - np.eye(a.shape[0], dtype=int)\n", 344 | " \n", 345 | " id_list = []\n", 346 | " for i, j in zip(*c.nonzero()):\n", 347 | " if i <= j:\n", 348 | " continue\n", 349 | " li = target_clf_label.iloc[i][\"CLF_LABEL\"]\n", 350 | " lj = target_clf_label.iloc[j][\"CLF_LABEL\"]\n", 351 | " if (li>0) != (lj>0):\n", 352 | " idi = target_clf_label.iloc[i][\"CMPD_CHEMBLID\"]\n", 353 | " idj = target_clf_label.iloc[j][\"CMPD_CHEMBLID\"]\n", 354 | " id_list.append(idi)\n", 355 | " id_list.append(idj)\n", 356 | " print (idi, li), (idj, lj)\n", 357 | " print \"\\n\"" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": { 363 | "collapsed": true 364 | }, 365 | "source": [ 366 | "## check molecules' collision " 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 25, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "from rdkit import Chem\n", 378 | "from rdkit.Chem import Draw\n", 379 | "from rdkit.Chem.Draw import IPythonConsole" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 28, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 
| "inh_smi = pd.Series.from_csv(\"structure_files/inhibitor_smiles.csv\", header=0)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 29, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "ms = [Chem.MolFromSmiles(inh_smi.loc[id_]) for id_ in id_list]" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 30, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "data": { 407 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZAAAADICAIAAABJdyC1AAAYwElEQVR4nO3de1xT5/0H8CcIJKIh\nAVQoKAhtABHBC10VfTlUhuKsuilqh1asFa1TbLUW3fwNh/NnbF330q0iirWorDW1rU6nRdRCRbyL\nSjvAeb9bLoIhCITk+/vjuMgPkUtyAjzs8/4rkHP5ql8/OTnnOc+REBEDAOCBTVsXAADQXAgsAOAG\nAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACAGwgsAOAG\nAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACAGwgsAOAG\nAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACAGwgsAOAG\nAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACAGwgsAOAG\nAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACAGwgsAOAG\nAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCS2TFxUwiYUlJT38MDGRlZW1Z\nD3QM6CsBAkt8vXuzTz5hVVVtXQd0LOgrhsCyBoWCTZ367MMQQBToK4bAspJ332UpKUyna+s6oGNB\nXyGwrEIuZzNnsr/+lTHGDIa2rgY6CvQVAstaFixgO3YwrZaVlLCQELZ+PSspaeuagH//5X2FwBJT\nTc2z1w4ObO5cdusWc3ZmI0cytZq98cauOXPm5OTktF2BwCX01TME4omLe+Fbej3t25c+YcIEOzs7\nf3//tWvX3rt3rxVLIyLKzs7esmXL6dOnW3m/YCH0lQkCSzRJSeTr28Dvy8rou+/IaHz644MHD9at\nWxcQEGBrazt16tRWKOzhw4cfffRRnz597Ozsfv7zn0ul0i1btrTCfkEU6Ku6EFiiCQujAwca+P2j\nRySVUmTkmsTExBs3bph+f+LECZVKlZaWZqV6qqurNRpNeHi4jY3NwIEDk5OTy8rKiGj//v0KhSI6\nOrqystJKuwYRoa/qQmCJZtOmF75VXEwbNvx14MCBNjY2I0eOTE1NraioICJvb+/du3eLXsnFixdj\nY2OdnJxcXFzi4+N/+OEH01sGg4GICgsL+/TpM3DgwJs3b4q+dxAX+qouBJY5ioqIMdq48emPffvS\no0dkMDS94qVLlxYvXuzq6iocPHfr1i09PV2sqrRabXJy8qBBgyQSSXh4uEajefLkSd0F9u3b9+qr\nr96+fZuIysvLJ0yY0L1798zMTLEKAAuhr5qEwDJHURH17k19+5LwDyc0VvPp9frq6moikkqlOTk5\nopR07Ngx4aNv0aJFeXl5DS6j1WonT57s7Ox86NAhIjIajQkJCVKpdPPmzaLUABZCXzUJgWWOoiIK\nDqbERPr4Y6KWN5agurqaMXbp0iVRSgoICHjnnXeqqqoaX8xoNKrVant7e7VaLfxm165dXbp0iY2N\nFXod2hD6qkkIrIbp9frS0tIbN278+OOPZ86cycjI2L//p9RU2rCB/vQnOn+egoPp8WMKCKCKCjMb\nq6SkhDF2/fp1Uaq1t7c/fvw4EV28eLG0tLTx5Q8cOKBUKt944w2dTies4u3tPXTo0Pv371teDDQC\nfWUhBFZ9+fn5kZGR9UarOTk5jR37VUAADRlCo0fTyZMUHExEtHYtrVljZmPduHGDMVZcXGx5zYWF\nhYyxkpISInrllVd27NjR5CqXL18OCAjo37+/cIGpuLg4PDzcw8Pj1KlTltcDz0NfWV4PEdlad1iq\nqCoqKioqKrRa7ePHj8vKyoTX9vb248ePl8lkYu1l+fLl9vb2ubm5jo6Ocrm8S5cuDg4O9ZYpLn76\nYsEC9uqrrKLCnB1ptVrGmFwut6hcxhhjBQUFPXr0cHZ2rq6uvn79ur+/f5OrqFSqkydPvvnmmyEh\nIRqNZsSIEf/85z/nzZu3bNmyb7/91t7e3vKqeIG+epH22FeixJ7oCgsLx48fP3jw4L59+3p5eTk5\nOUkkkrplKxQKDw8Pf3//4ODgn/3sZ7W1taLs9/Tp07a2toWFhY0vJpxrEKxfT4yZ80mYk5Njb2/f\n4tUasnbt2uHDhxNRXl6eRCLRarXNXLG2tnbJkiUymezvf/87ER05cqRr165G02DEDgd91SJi9dXR\no0fF6qv2eIRVU1MTERExePDg6OhouVzetWtXhUKhUCiE13K5XKFQmBbW6XRBQUFqtfr3v/+95bte\ntWrVtGnTfH19i4uLZ82a9dlnn7m4uDy/WLdu7MKFp6/j4lhcnDn70mq1onwMMsaE8S+MsYKCgp49\ne3bt2rWZK3bq1GndunUDBgzw8PAQVvf19a33f7jDQF+1lIh9pVKpROkrcQJrzpw5r7/++vjx40XZ\nWnJystFoTE1NlUqljS9ZVlamVCq3bNkyduzYiRMn9u3b15L9njp16uDBg/n5+YyxNWvWFBUVNdhV\nL6LVsub0SX5+fl5e3pQpU0RsrIKCgsmTJwsvmnPcXk90dLTworCw0IzVrQd9xdBX9Vh+kEZEarXa\nzc1NODlnIZ1O5+bmtnHjRiLKysratWvXi5bcvn27n5+fcB9ATEzMa6+9ZuEBfGRk5MyZM4nozp07\nMpns22+/beaKRiMVFRk2bfrnixZ48OBBWlpaTExMz549GWMBAQF6vX7btm39+vWzpGATZ2fnAwcO\nENH06dMXLlxo9nYiIiL++Mc/ilKSKNBXHaOvRo8evXLlSlFKEiew9Hp9SEiI8K9ioT//+c/e3t41\nNTVGo3HAgAHLli1rZKcDBgx47733iKisrMzDw+NjYfiKWU6cOGFnZ3flyhUiWrhwofDVvflOnz5t\nZ2dXd3RvaWmpRqOJjY318fFhjHl6esbGxmo0mp9++klYYMOGD
aGhoWYXbPLw4UPG2LVr14goJCTk\nk08+MXtTnp6ejfxPbn3oq47RV15eXl988YXlJZGIwxouXrxob29/8OBBSzby+PHjbt26JScnE9HX\nX3/t6OjY+NVZ4Z/z2LFjRLR7924HB4d///vf5u06IiLirbfeIqJbt25JpdIjR460dAsrVqzw8vIS\nTkzOnDlTKpXKZLJRo0ap1eqzZ88a6txhUV5evmfPnrCwsNDQ0CaH5DUpKyvLwcHBYDAYjUa5XG5G\n5QKtViuRSC5evGhhPeJCX/HeVzqdzsbG5sKFCxbWIxDzKuGKFSs8PT0fP35s9hbUarWPj09NTY3B\nYAgMDFyxYkWTq7z//vt+fn7CzU2TJk0KCwsz42JEVlaWvb29MHJk3rx5YWFhZhRfXV0dGBi4aNEi\nItq9e3d6enrd29b1ev3x48dXrlw5dOhQW1tbNze3qKgoNze3IUOGWDiBUXJycv/+/Yno9u3bjDGz\nt3bu3LlOnTrVu02sPUBfcd1Xly6VjxjxvxUVOkuKMREzsCorK319ff/ngw/MW728vNzFxWXr1q1E\npNFolErlo2Zc1K2srFSpVMuXLyei+/fvOzs7b2rk9vYXGDVqVGxsLBFdu3bNzs4uOzu7xdUTEdGp\nU6dMn8xEZDQaz549q1arw8PDO3fu7OTkFBUVlZycfPXqVWEBYVhdt27djh49at4eiWjx4sXTpk0j\nooyMDKVSafZ20tLSfHx8zF7detBXXPfVgwdk7p+7ASKPw/ohO9vw0kvNusH8OatWrfL19a2trTUY\nDAEBAQkJCc1cMTMz087O7syZM0S0bds2R0fHW7duNX+/mZmZUqlUmBBj9uzZv/jFL1pe+zNLly5V\nqVQ7d+58++23vby8bGxsBg0atGzZsiNHjjR4lF5bWxsfH29ra7t+/Xrz9jh27Fjh7+rw4cPz5883\nu/I//OEPY8eONXt1q0Jf8dtX4rLCwNFFi6jlXytKS0uVSuVnn31GRJ9//rmzs7MwK1gzzZs3Lygo\nSLjNcsyYMZGRkc1fd9iwYe+88w4RFRYW2traWjjTa1FR0csvvxwYGBgfH5+RkSHMT9SktLS0zp07\nt/RO0fPnzy9YsMDX13fy5Mk1NTXmlvzUlClTFi9ebOFGrAh9xVVfNThVjuWsEFg6HT15Qj/+SJGR\nFBFBw4dTamqTKyUkJPj5+dXW1ur1el9f38TExBbts7y8vFevXsIl+Zs3b8rl8u3btxORwWAoLS29\nfv16Xl5eTk5Oenq6RqNJSUn5y1/+kpiYuHTp0qlTp8pkMuEsw5w5cyz8GCSidevW9erVy4xTnufP\nn/fy8ho6dOiDBw8aX7KkpGTDhg39+/eXSqVTpkxJSkrq3bv38OHDHz582KI96nS61NTU2bNnCz8G\nBQUJp6XbKfQVN31Fqal08KBFU+W8iHVuzdFq6ZVXSJjgQqej0FA6fLiRxUtKShwdHYVbK3fu3Nm9\ne/fm3wRg8s0338hksvz8fCJatmyZSqXq0qVL3RFndnZ2zs7OPj4+wcHBw4YNi4yMjIqKio6Otre3\nz8jIIKL79++bfTFIoNVqu3XrlpSUZN7qP/30U1hYWM+ePYWvIfXU1NQIU9N26tQpPDw8NTW1vLxc\neKu8vHz8+PHNnzXt9OnTc+fOVSgUrq6uS5cura6urq2tlclkWVlZ5lXeStBXZmnFvqK5c0mhIFdX\nOnJEhKlynmedwPr6a3rjjWc/7tlD/5kVX/hounbtWm5ublZW1j/+8Y+dO3dOmzZNpVLp9Xq9Xq9S\nqVavXm3ebrdu3Sp05MSJE3/5y19mZmaeO3fu6tWrRUVFjVz8SkhIcHd3b86J2CZ9+OGHwtUos7eg\n1+vj4uJkMpnwSS64cOGCMDVt9+7d4+Pj//Wvfz2/omnWtEYeBHDz5s2EhASVStW5c+cZM2ZkZGSY\nLoqfO3eOMdbSz9LWhr4yl1X7qqSkZP369VOmbO/UicaNo6+/ppqap/dFWjhVzvOsE1jr11PdgXm5\nuTRkCL3yCsnlJU5Opo8mBweHl156yc/PLygoSBjqVllZmZiYaAp48wg3ml6+fLmZy1dXV/fr12/e\nvHmW7JT+M9gnJSXFwu0QUXJyslQqXbx4cWpqalhYmEQiGTp06LZt25o8c9HgrGl1P0KDg4OTk5NN\nExsZDIb09PSpU6dKpdIFCxZYXrl1oa8sI25fGQyGjIyMadOmSaXSl19+ec2aj+/cefpWbS3l5Ykw\nVc7zrBNYX31F0dHPfty7l6ZMoX376OhR47lzV69eLS4u1uv1pveLiop69Ogh1gmUcePGTZ8+vUWr\nCL0oHMCbTa1WC1ejLNmIyeHDhz09Pd3d3dVqtTBbdjM9P2vaRx99JJPJoqOjjx49ahpMlJeXFxcX\n5+bm5uLiEhcXd/bsWVHKti70lcWs0VdHjhwx9VVuLsXGkpMTbd36NLB0OgoIIE/P9hxYWi35+JDw\nUA2djoYNo6bun9q5c2dLLxs36OTJk3Z2dmacMli6dKm3t7cZ5zgElZW0ZMmnGs1X5q3eoO7du+/Z\ns8eMFevNmlZaWmo6pKqoqGj8gQLtGvpKDNboqydPnuzYsWPy5A8kEhoyhLZsoXv3RJgq53lWmw/r\n0iUaM+bp1ZxPP23OGhMmTLB8HJDpRtOWqqqq6tOnj4XX9UWcSKq4uJgxJnwBWbFixYkTJ1q0ujAM\nRyaTbdu2TfjNsWPHYmNjHR0dvb29W/rp2o6grywjel9dvHhx4cKFTk5Orq6u77//fn6+dZ8M0I4m\n8Lt7965SqWzONKwvcuzYMdONpmbIycmxtbU1eziyuLKzs6VSqfBFQKFQNP8W/7qSkpKkUumMGTMG\nDx5sa2s7fvz4vXv3Wj5iiy/oq7rE7avXXnutlfuqHQUWEW3atMnFxaXJASMvEhERYRpVZJ7f/va3\n/v7+wrekysrK5ORktVq9fPny+fPn/+Y3v5k/P2/ECBo0iFQqUqutNTROkJKS0rdvXyK6d+8eY6zu\no31b5PPPP/f09FyzZs1/8wMm0FcmvPdV+5pxNDY2dvfu3QsXLtRoNC1d9/vvv8/MzLx8+bIlBaxd\nuzYoKGj16tWrVq0yGo2bN29WKpXCvJSurq5eXrW9ezOFgsnlrH9/xhjr3Zt98gmbNYuJN/f3U6Y5\nzwoKCrp06eLp6WnedmxtbRljy5YtE7M43qCvTETsK4lE0vp91b4CSyKRbN68uV+/fl999dWkSZMY\nYw8fPrxx44ZWq3306JFWqxWeFKDVasvLy8vLy7VabURExKJFixhjiYmJMTExXl5elhTQpUuXLVu2\njBkzZuLEiYMGDTp79mwjCxcXM4WCTZrEkpLYe+9ZstsGFBQUBAUFMYunLTZvrsgOBn1lIlZf5efn\nt0lfta/AYox5e3uvXLlywYIFI0aMcHZ2/vLLL999911HR0elUimXy+VyufDQEaVS2atXL7lcrlKp\nGGOZmZnZ2dmffvqp
5QWMHDly+vTps2bN2r17d0VFRWVlpU6nE7qZseD79weVl7PycjZiBAsPZ4yx\nd99lgwez2FjL9/z/FBQUREVFMYunl0VgCdBXArH6qs1m027N75/NZDAYhgwZ8uabbzZ/laFDhwo3\nmoqiuLh43Lhxwt+PTCZzcXHx8fEJCgqaPv2z0aNpyhSKjaWDB58940TcoXFEVFVV1alTJ+F2WQun\nlx00aJAlc0V2JOgrcftqo+k0Wytqd0dYjDEbG5vk5OSQkJBJkyaZHkAgHKhrtdqKigrTcbvwCLkr\nV66cOXMmLS1NrAJcXFy++eYbrVarUChsbGxetJgoT5Fr0JUrV4xGo+lcw6xZs8zbDhHhCMsEfdUB\n+qo9BhZjrF+/fkuWLFm5cuX8+fOF8wumtyQSiVKpFA7ghaczKZXK77//3sKzDPXY2to61bnbo3EO\nDmzuXLZokWh7LygocHd3l8vlOp3u1q1bZnfG3bt3dTqdr6+vaJVxDn0lSl/duXNHp9O1SWBJiKj1\n99ocRqPx/Pnzd+/eFR4YJ3SS8Mjcti7N6lavXv3dd98dPnw4Nzc3JCSkoqKic+fOZmwnO5t98EHp\n8eP1Hxf63wx9ZXlfHTp0aOrUqY8ePRK9wia10yMsxpiNjU1ISEhISEhbF9IG6l579vLyMq+rGGPD\nhrGcHGdRS+Me+opZ3FcFBQV+fn6iltZcL/wiDW0oPz9feNJcaGjoxo0b27oc6CDE6qs2fOAuAqs9\nioyM/Nvf/ibMFTlmzJiWrn7vHpNIWFLS0x8DA1lZmbgFApcs7Kvi/1wOyM/PxxEWPJOQkDB9+vTQ\n0NCUlJTmr6XVspQUFhrKDh9+OlS6qsp6NQJ/zO0rbUpKSmhoaEBAQG1tLWvbwX2tP5ICmumLL754\nfta0Bp08WfrWW9S1K3l40O9+RzduWGV2WugYmtlXRqMxIyMjKipKmJ9PrVbfvXuXiIRz7Q1OT9oK\nEFjt2oULF+rNmlbXw4cP161bFxAQEBg4c+JE2rePhFnerDQ7LXQYjffV9evX4+Pje/bs6eDgEBsb\ne+zYMdNU2g8ePFi6dKm7u3uLHsMjIgRWe1dcXDxq1CgPD4+6z4k6ceLEpEmT7O3t/fz81q5da2o7\ng4HS0+nAAWsNlYYOo8m++vDDD03zW9TW1u7fv/9Xv/qVnZ2dv7+/hXOoWgKBxQG9Xl9v1jSNRjNz\n5kzTo4CJ6O7dylWrqHdvcnSkQ4esMjstdDAN9lVMTEzdvjI9qMLFxSU+Pv4HYbrXtoPA4obpoZh1\nZ0qrrq7+8ssvx44d2727/6hRxtRU0ume3YxGos5OCx1Sg331+PHjelNpm/FIRGtovyPd4Xm5ubkT\nJ0709vbWaDSXLl3avn373r17XVxc5s6dO2PGDHd397YuELhk6qtdu3bl5eVt3rx5//79PXr0iImJ\niY6OFiauaCcQWJy5ffv2r3/9a71en5eXN3r06Lfffvv111+3s7Nr67qAb3X7Kjw8fPbs2RMmTJBK\npW1dV30ILP5UVVXl5eW5urqaPV0kwPOEvurRo4e493uLC4EFANzASHcA4AYCCwC4gcACAG4gsACA\nGwgsAOAGAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACA\nGwgsAOAGAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACA\nGwgsAOAGAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACA\nGwgsAOAGAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACA\nGwgsAOAGAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACA\nGwgsAOAGAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4Mb/AelZiqmuND8TAAAA\nAElFTkSuQmCC\n", 408 | "text/plain": [ 409 | "" 410 | ] 411 | }, 412 | "execution_count": 30, 413 | "metadata": {}, 414 | "output_type": "execute_result" 415 | } 416 | ], 417 | "source": [ 418 | "Draw.MolsToGridImage(ms, molsPerRow=2)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": { 425 | "collapsed": true 426 | }, 427 | "outputs": [], 428 | "source": [] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": { 434 | "collapsed": true 435 | }, 436 | "outputs": [], 437 | "source": [] 438 | } 439 | ], 440 | "metadata": { 441 | "kernelspec": { 442 | "display_name": "Python 2", 443 | "language": "python", 444 | "name": "python2" 445 | }, 446 | "language_info": { 447 | "codemirror_mode": { 448 | "name": "ipython", 449 | "version": 2 450 | }, 451 | "file_extension": ".py", 452 | "mimetype": "text/x-python", 453 | "name": "python", 454 | "nbconvert_exporter": "python", 455 | "pygments_lexer": "ipython2", 456 | "version": "2.7.6" 457 | } 458 | }, 459 | "nbformat": 4, 460 | "nbformat_minor": 1 461 | } 462 | -------------------------------------------------------------------------------- /data_files/3_fingerprint_analyse_additional.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import 
numpy as np" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 6, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 8, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "pns_count = pd.Series.from_csv(\"fp_files/pns_apfp_count.csv\", header=0)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 10, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "(21160,)" 39 | ] 40 | }, 41 | "execution_count": 10, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "pns_count.shape" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 13, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "APFP\n", 59 | "10552354 5\n", 60 | "10552355 16\n", 61 | "10552356 7\n", 62 | "10552357 6\n", 63 | "10552358 2\n", 64 | "Name: COUNT, dtype: int64" 65 | ] 66 | }, 67 | "execution_count": 13, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "pns_count.head()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 11, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "import matplotlib.pyplot as plt" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 19, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "a = pns_count.sort_values(ascending=False)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 28, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYcAAAD8CAYAAACcjGjIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHYJJREFUeJzt3XuQXGed3vHv090zo/t9JBRJRgJU\n9hoSfJnIIkuRjb3IspNaOQlQplKRylEhspgUVOWyJvuHN7BUYJMsiyqstwxWkCgWYbxQVu3KaLXC\nXmoTZGsMRr5bY2FHUiRrpNH9PjO//NHvmPZMd5+WNDM9o/N8qrr69O+85/R7jmbm0bkrIjAzM6tU\naHYHzMxs7HE4mJnZEA4HMzMbwuFgZmZDOBzMzGwIh4OZmQ3hcDAzsyEyw0HS9ZKeq3idlPR5SbMk\nbZe0J73PTO0lab2kLkm7Jd1SMa81qf0eSWsq6rdKej5Ns16SRmZxzcysEZnhEBGvRsRNEXETcCtw\nFvgR8ACwIyKWAjvSZ4C7gKXptQ54CEDSLOBB4DZgGfDgQKCkNp+qmG7lsCydmZldkdJltr8DeD0i\n3pS0CvitVN8IPAX8HrAK2BTlS693SpohaX5quz0iegAkbQdWSnoKmBYRO1N9E3AP8ES9jsyZMycW\nL158md03M8uvZ5999khEtDfS9nLD4V7ge2l4XkQcTMOHgHlpeAGwr2Ka/alWr76/Sr2uxYsX09nZ\neZndNzPLL0lvNtq24QPSklqB3wF+MHhc2koY8Zs0SVonqVNSZ3d390h/nZlZbl3O2Up3AT+PiLfS\n57fS7iLS++FUPwAsqphuYarVqy+sUh8iIh6OiI6I6Ghvb2jLyMzMrsDlhMMn+fUuJYAtwMAZR2uA\nxyvqq9NZS8uBE2n30zZghaSZ6UD0CmBbGndS0vJ0ltLqinmZmVkTNHTMQdJk4KPApyvKXwEelbQW\neBP4RKpvBe4Guiif2XQfQET0SPoSsCu1++LAwWngM8C3gYmUD0TXPRhtZmYjS+P1eQ4dHR3hA9Jm\nZo2T9GxEdDTS1ldIm5nZEA4HMzMbInfhsH7HHv72NZ8Ga2ZWT+7C4aGnXud/dx1pdjfMzMa03IWD\nBP394/MgvJnZaMldOBSkkb+U28xsnMtdOAjoH6en75qZjZbchQMCZ4OZWX25C4eCnyNkZpYpd+Eg\nebeSmVmW3IVDQfJuJTOzDLkLBx+QNjPLlr9w8KmsZmaZchgOMF7vRGtmNlryFw74VFYzsyy5Cwcf\nkDYzy5a7cPCprGZm2XIXDr63kplZttyFA3jLwcwsS+7CoVAAbzqYmdXXUDhImiHpMUmvSHpZ0ock\nzZK0XdKe9D4ztZWk9ZK6JO2WdEvFfNak9nskramo3yrp+TTNemnkboAk5C0HM7MMjW45fB34cUTc\nAHwQeBl4ANgREUuBHekzwF3A0vRaBzwEIGkW8CBwG7AMeHAgUFKbT1VMt/LqFqs2yRsOZmZZMsNB\n0nTgI8AjABFxMSKOA6uAjanZRuCeNLwK2BRlO4EZkuYDdwLbI6InIo4B24GVady0iNgZ5avTNlXM\na9j5VFYzs2yNbDksAbqB/yXpF5K+JWkyMC8iDqY2h4B5aXgBsK9i+v2pVq++v0p9RPjeSmZm2RoJ\nhxJwC/BQRNwMnOHXu5AASP/jH/G/uJLWSeqU1Nnd3X2F8/BuJTOzLI2Ew35gf0Q8nT4/Rjks3kq7\nhEjvh9P4A8CiiukXplq9+sIq9SEi4uGI6IiIjvb29ga6PpQk31vJzCxDZjhExCFgn6TrU+kO4CVg\nCzBwxtEa4PE0vAVYnc5aWg6cSLuftgErJM1MB6JXANvSuJOSlqezlFZXzGvYFfyYUDOzTKUG2/07\n4LuSWoG9wH2Ug+VRSWuBN4FPpLZbgbuBLuBsaktE9Ej6ErArtftiRPSk4c8A3wYmAk+k14jwqaxm\nZtkaCoeIeA7oqDLqjiptA7i/xnw2ABuq1DuBDzTSl6slbzmYmWXK3RXSftiPmVm2/IUDftiPmVmW\n3IVDoeDdSmZmWXIXDj4gbWaWLXfhUPBFcGZmmXIXDkj0Ox3MzOrKXTj4gLSZWbbchUNhxJ4UYWZ2\n7chdOEg+IG1mliV34eB7K5mZZctdOPhUVjOzbLkLB7zlYGaWKXfh4OsczMyy5S4chB/2Y2aWJX/h\n4N1KZmaZchcOBd+y28wsU+7CQcJnK5mZZchdOIB3K5mZZcldOPhJcGZm2XIXDoXynfea3Q0zszEt\nd+Eg8C27zcwyNBQOkt6Q9Lyk5yR1ptosSdsl7UnvM1NdktZL6pK0W9ItFfNZk9rvkbSmon5rmn9X\nmnbE7p1a3q3kdDAzq+dythz+SUTcFBEd6fMDwI6IWArsSJ8B7gKWptc64CEohwnwIHAbsAx4cCBQ\nUptPVUy38oqXKINvvGdmlu1qdiutAjam4Y3APRX1TVG2E5ghaT5wJ7A9Inoi4hiwHViZxk2LiJ1R\nvnR5U8W8RoCfBGdmlqXRcAjgryU9K2ldqs2LiINp+BAwLw0vAPZVTLs/1erV91epj4jyFdJOBzOz\nekoNtvtwRByQNBfYLumVypEREZJG/C9uCqZ1ANddd92VzWM4O2Rmdo1qaMshIg6k98PAjygfM3gr\n7RIivR9OzQ8AiyomX5hq9eoLq9Sr9ePhiOiIiI729vZGuj5EQfIxBzOzDJnhIGmypKkDw8AK4AVg\nCzBwxtEa4PE0vAVYnc5aWg6cSLuftgErJM1MB6JXANvSuJOSlqezlFZXzGvY+fYZZmbZGtmtNA/4\nUTq7tAT8eUT8WNIu4FFJa4E3gU+k9luBu4Eu4CxwH0BE9Ej6ErArtftiRPSk4c8A3wYmAk+k14iQ\nn+dgZpYpMxwiYi/wwSr1o8AdVeoB3F9jXhuADVXqncAHGujvVZP8PAczsyy5vELa2WBmVl/+wsE3\n3jMzy5S7cCj4Ogczs0y5CwffeM/MLFv+wsE33jMzy5S/cMAHpM3MsuQvHHyFtJlZphyGgw9Im5ll\nyV04FOQD0mZmWXIXDi3FAr39/c3uhpnZmJbLcLjY63AwM6snd+FQLPhJcGZmWXIXDqWC6HM6mJnV\nlbtwKDgczMwy5S4cSgX5gLSZWYbchcPAMQdf62BmVlv+wqH8RDvvWjIzqyN/4VBM4eAtBzOzmvIX\nDt5yMDPLlL9wKJTDodfhYGZWU8PhIKko6ReS/jJ9XiLpaUldkr4vqTXV29LnrjR+ccU8vpDqr0q6\ns6K+MtW6JD0wfIs31EA49DsczMxqupwth88BL1d8/irwtYh4H3AMWJvqa4Fjqf611A5JNwL3Au8H\nVgJ/mgKnCHwDuAu4EfhkajsiSt5yMDPL1FA4SFoI/FPgW+mzgNuBx1KTjcA9aXhV+kwaf0dqvwrY\nHBEXIuJXQBewLL26ImJvRFwENqe2I6JYKC+ytxzMzGpr
dMvhT4D/BAxcPTYbOB4RvenzfmBBGl4A\n7ANI40+k9m/XB01Tqz4iimmJveVgZlZbZjhI+mfA4Yh4dhT6k9WXdZI6JXV2d3df0TwGthx8tpKZ\nWW2NbDn8JvA7kt6gvMvnduDrwAxJpdRmIXAgDR8AFgGk8dOBo5X1QdPUqg8REQ9HREdEdLS3tzfQ\n9aEGthwcDmZmtWWGQ0R8ISIWRsRiygeUfxIR/wp4EvhYarYGeDwNb0mfSeN/EuV7VWwB7k1nMy0B\nlgLPALuApensp9b0HVuGZemqGNhy8G4lM7PaStlNavo9YLOkPwR+ATyS6o8A35HUBfRQ/mNPRLwo\n6VHgJaAXuD8i+gAkfRbYBhSBDRHx4lX0q66Bi+D6fYW0mVlNlxUOEfEU8FQa3kv5TKPBbc4DH68x\n/ZeBL1epbwW2Xk5frtTbF8H1ORzMzGrJ3RXSA9c5+JiDmVltuQuHgS0H33jPzKy2/IaDH/hjZlZT\njsOhyR0xMxvDchsOflSomVltuQ0HH5A2M6vN4WBmZkPkLhx8KquZWbbchUPBjwk1M8uUu3AoFR0O\nZmZZchcOA/dW8o33zMxqy184FHzjPTOzLLkNB994z8ysttyGg++tZGZWW+7CoeTHhJqZZcpdOKRs\n8AFpM7M6chcOA1sO/Q4HM7OachcOPpXVzCxb/sIhXQTnLQczs9ryFw7ecjAzy5S/cPCT4MzMMmWG\ng6QJkp6R9EtJL0r6L6m+RNLTkrokfV9Sa6q3pc9dafziinl9IdVflXRnRX1lqnVJemD4F/PXWtJu\npUu+CM7MrKZGthwuALdHxAeBm4CVkpYDXwW+FhHvA44Ba1P7tcCxVP9aaoekG4F7gfcDK4E/lVSU\nVAS+AdwF3Ah8MrUdEZIoFuQnwZmZ1ZEZDlF2On1sSa8AbgceS/WNwD1peFX6TBp/hySl+uaIuBAR\nvwK6gGXp1RUReyPiIrA5tR0xpYJ8+wwzszoaOuaQ/of/HHAY2A68DhyPiN7UZD+wIA0vAPYBpPEn\ngNmV9UHT1KpX68c6SZ2SOru7uxvpelUtxYJ3K5mZ1dFQOEREX0TcBCyk/D/9G0a0V7X78XBEdERE\nR3t7+xXPp1T0biUzs3ou62yliDgOPAl8CJghqZRGLQQOpOEDwCKANH46cLSyPmiaWvURUyp4y8HM\nrJ5GzlZqlzQjDU8EPgq8TDkkPpaarQEeT8Nb0mfS+J9ERKT6velspiXAUuAZYBewNJ391Er5oPWW\n4Vi4WlqKorfPWw5mZrWUspswH9iYzioqAI9GxF9KegnYLOkPgV8Aj6T2jwDfkdQF9FD+Y09EvCjp\nUeAloBe4PyL6ACR9FtgGFIENEfHisC1hFeXdSt5yMDOrJTMcImI3cHOV+l7Kxx8G188DH68xry8D\nX65S3wpsbaC/w6KlUOCStxzMzGrK3RXSkLYcfMzBzKymXIZD+VRWbzmYmdWSy3BoKxW40OtwMDOr\nJafhUORCb1+zu2FmNmblMhwmtBQ4f8lbDmZmteQ0HIqcu+QtBzOzWnIZDpNaS5y76HAwM6slp+FQ\n5OzF3uyGZmY5lc9waCtyxlsOZmY15TMcWkpc7O33/ZXMzGrIZThMbisCcNYHpc3MqsplOExsLYeD\nD0qbmVWXy3CYlMLhzAUflDYzqyaX4TClrQWA0w4HM7OqchkO0yeWw+H42UtN7omZ2diUy3B4+4C0\njzmYmVWVy3CY1Fp+xtG5S96tZGZWTU7DYeCAtLcczMyqyWU4TG5LWw7erWRmVlUuw2FiS9py8P2V\nzMyqygwHSYskPSnpJUkvSvpcqs+StF3SnvQ+M9Ulab2kLkm7Jd1SMa81qf0eSWsq6rdKej5Ns16S\nRmJhBxQLYmJL0QekzcxqaGTLoRf49xFxI7AcuF/SjcADwI6IWArsSJ8B7gKWptc64CEohwnwIHAb\nsAx4cCBQUptPVUy38uoXrb7JbUVOnfeWg5lZNZnhEBEHI+LnafgU8DKwAFgFbEzNNgL3pOFVwKYo\n2wnMkDQfuBPYHhE9EXEM2A6sTOOmRcTOiAhgU8W8RsyMSa0cP3txpL/GzGxcuqxjDpIWAzcDTwPz\nIuJgGnUImJeGFwD7Kibbn2r16vur1EfU5FY/Dc7MrJaGw0HSFOAvgM9HxMnKcel//DHMfavWh3WS\nOiV1dnd3X9W8JviYg5lZTQ2Fg6QWysHw3Yj4YSq/lXYJkd4Pp/oBYFHF5AtTrV59YZX6EBHxcER0\nRERHe3t7I12vaWJrkfPecjAzq6qRs5UEPAK8HBF/XDFqCzBwxtEa4PGK+up01tJy4ETa/bQNWCFp\nZjoQvQLYlsadlLQ8fdfqinmNmPKjQh0OZmbVlBpo85vAvwael/Rcqv1n4CvAo5LWAm8Cn0jjtgJ3\nA13AWeA+gIjokfQlYFdq98WI6EnDnwG+DUwEnkivETWhpeiL4MzMasgMh4j4O6DWdQd3VGkfwP01\n5rUB2FCl3gl8IKsvw2lii3crmZnVkssrpMG7lczM6sltOExsKZ/KWt7QMTOzSvkNh3Tbbm89mJkN\nldtwGHga3MnzfhqcmdlguQ2HKRPKWw5n/BxpM7Mh8hsO6VGhvvmemdlQuQ2H6RNbATh+zruVzMwG\ny204zJvWBsChE+eb3BMzs7Ent+Ewd+oEAI6cutDknpiZjT25DYfWUoGpE0ocOe1wMDMbLLfhADB3\nahuHveVgZjZErsNhxqRWX+dgZlZFrsNhSluJ0z6V1cxsiFyHw9QJJV/nYGZWRa7DYdbkVo6eudjs\nbpiZjTm5DoeBYw59/b4zq5lZpXyHw8QWIuCkr5I2M3uHXIfD7CnlW2j4Wgczs3fKdTi0Ty3fQqPb\n4WBm9g75DocpKRx8IZyZ2TvkOxzSlsOR0z5jycysUmY4SNog6bCkFypqsyRtl7Qnvc9MdUlaL6lL\n0m5Jt1RMsya13yNpTUX9VknPp2nWS9JwL2Qt0ye20FKUtxzMzAZpZMvh28DKQbUHgB0RsRTYkT4D\n3AUsTa91wENQDhPgQeA2YBnw4ECgpDafqphu8HeNGEnMmdLmA9JmZoNkhkNE/BToGVReBWxMwxuB\neyrqm6JsJzBD0nzgTmB7RPRExDFgO7AyjZsWETsjIoBNFfMaFXOmtHnLwcxskCs95jAvIg6m4UPA\nvDS8ANhX0W5/qtWr769Sr0rSOkmdkjq7u7uvsOvv1O47s5qZDXHVB6TT//hH5RLjiHg4IjoioqO9\nvX1Y5nndrEn836NnKC+GmZnBlYfDW2mXEOn9cKofABZVtFuYavXqC6vUR83i2ZM4c7HPZyyZmVW4\n0nDYAgyccbQGeLyivjqdtbQcOJF2P20DVkiamQ5ErwC2pXEnJS1PZymtrpjXqHj3nMkAvHH0zGh+\nrZnZmNbIqazfA34GXC9pv6S1wFeAj0raA/x2+gywFdgLdAHfBD4DEBE9wJeAXen1xVQjtflWmuZ1\n4InhWbTGvHf
OFABeP3x6NL/WzGxMK2U1iIhP1hh1R5W2AdxfYz4bgA1V6p3AB7L6MVIWzJxIW6lA\nl8PBzOxtub5CGqBYEO9tn0JXt8PBzGxA7sMB4H1zp7DnLYeDmdkAhwOwdO4UDhw/R4+fCmdmBjgc\nAPit6+cC8OQrhzNampnlg8MB+I35U5naVuKp14bnqmszs/HO4QCUigXuXbaIv9r9/zh86nyzu2Nm\n1nQOh+STy65DEt/86d5md8XMrOkcDsl72qdw5/vnsfmZfZy50Nvs7piZNZXDocLaDy/h1IVeHnt2\nf3ZjM7NrmMOhwq3vnsXN183gW3+3l96+/mZ3x8ysaRwOg3z6I+9hX885tr5wqNldMTNrGofDIL/9\nG/N4b/tk/tu2V3zswcxyy+EwSKlY4L/+i3/A/mPn+KMfv9Ls7piZNYXDoYplS2ax5kOL2fizN+l8\nY/Djs83Mrn0Ohxr+453X8/emT+D+P/85+3rONrs7ZmajyuFQw+S2Ehvu+4ecv9TPx//sZ3QdPtXs\nLpmZjRqHQx03vGsa3//0cnr7g3/50M94/LkD9PdHs7tlZjbiHA4ZbnjXNH74u/+IJXMm87nNz/GP\n//uT/MnfvOZdTWZ2TVP5yZ7jT0dHR3R2do7a913q6+evdh/kB8/u4/+8fpQIuPm6Gfz9BdO54V3T\nuGH+VK6fN5XJbZlPXjUzawpJz0ZER0NtHQ6Xb/+xs/zw5wf429e6efXQKU5XXA8xZ0obi2dP4rrZ\nk1g4YyKzp7QxY1IL7VPbWDBjIgtnTqJYUFP6bWb5Ni7DQdJK4OtAEfhWRHylXvtmhkOl/v7gwPFz\nvHLoFK8eOsm+nnO8cfQMbx49y+FT5xl8iKK1VGDJ7Mm8e/Yk5k5rY9bkNuZMaWX25DZmTW4tD09p\nY/rEFoeImQ2rywmHMbEPRFIR+AbwUWA/sEvSloh4qbk9y1YoiEWzJrFo1iQ+euO8d4zr7evnxLlL\nHDt7kcMnL7Dv2Fn2dp/h9e7TvHH0DLve6OH4uUtUy2cJZk5qZdbkVuZPn8C7pk3gXdMnMHdqG7On\nlINk4DV1Qom2UnGUltjM8mBMhAOwDOiKiL0AkjYDq4AxHw71lIoFZk8p/zF/39ypVdv09QdHz1zg\n2JlLHD19gSNnLtJz+gI9Zy/Rc+YCR05d5OCJc7x66BRHTl8YsiUyoK1UYEpbiQktRSa2FpnYUmRC\nS4EJLUXaSuX3CQO1UpG2lgJtpSKtpQKtxQKtpQKlgigVC7QURalQoFTU28PFgt750q+HSwVRKIiC\nREFQkJCgmGpKtYHxGtROpPfKYcrtyu/lYTMbPWMlHBYA+yo+7wdua1JfRlWxIOZOncDcqROA6gEy\noLevn56zFzly6iLHzl7k6JmLHDtzkVPnL3Hi3CXOXuzj3KU+zqX3C5f6OX2hlyOn+7lwqY/zl/o4\n39tffr/UVzNoxqrK0CikIOHtWgobLi9U6o2uN2W9+WbFWP0u1Zlvxozr97fedCP1nVce6HX7O8aW\nJXMph/lnbNakVh79tx/K+tarNlbCoSGS1gHrAK677rom92b0lYqFiiC5er19/Vzs6+dib/nV2x/0\n9gWX+vvL730DtX76+oO+iPL7oFdvf9AfQQT0R9Cf3qNiuD8of+6vrA3UIShPH2k+QZX6oFp/Gubt\n9kPnV0+94231Jq0336g7Zda0VzZd1tR1v3MMLkvd773i78xYlrrTXtl0Wd9bd9o6I6dOGJ0/22Ml\nHA4Aiyo+L0y1d4iIh4GHoXxAenS6du0qFQuUigUmtTa7J2Y21oyVi+B2AUslLZHUCtwLbGlyn8zM\ncmtMbDlERK+kzwLbKJ/KuiEiXmxyt8zMcmtMhANARGwFtja7H2ZmNnZ2K5mZ2RjicDAzsyEcDmZm\nNoTDwczMhnA4mJnZEGPmrqyXS1I38OYVTj4HODKM3bnWeP3U5/VTn9dPbc1eN++OiPZGGo7bcLga\nkjobvW1tHnn91Of1U5/XT23jad14t5KZmQ3hcDAzsyHyGg4PN7sDY5zXT31eP/V5/dQ2btZNLo85\nmJlZfXndcjAzszpyFQ6SVkp6VVKXpAea3Z/RJOkNSc9Lek5SZ6rNkrRd0p70PjPVJWl9Wk+7Jd1S\nMZ81qf0eSWuatTxXS9IGSYclvVBRG7b1IenWtL670rTj6jmnNdbPH0g6kH6GnpN0d8W4L6RlfVXS\nnRX1qr9z6fb8T6f699Ot+scNSYskPSnpJUkvSvpcql87P0ORnth1rb8o3wr8deA9QCvwS+DGZvdr\nFJf/DWDOoNofAQ+k4QeAr6bhu4EnKD/FcDnwdKrPAvam95lpeGazl+0K18dHgFuAF0ZifQDPpLZK\n097V7GUehvXzB8B/qNL2xvT71AYsSb9nxXq/c8CjwL1p+M+A3232Ml/m+pkP3JKGpwKvpfVwzfwM\n5WnLYRnQFRF7I+IisBlY1eQ+NdsqYGMa3gjcU1HfFGU7gRmS5gN3AtsjoicijgHbgZWj3enhEBE/\nBXoGlYdlfaRx0yJiZ5R/yzdVzGtcqLF+alkFbI6ICxHxK6CL8u9b1d+59D/g24HH0vSV63pciIiD\nEfHzNHwKeBlYwDX0M5SncFgA7Kv4vD/V8iKAv5b0bHoWN8C8iDiYhg8B89JwrXV1ra/D4VofC9Lw\n4Pq14LNpt8iGgV0mXP76mQ0cj4jeQfVxSdJi4Gbgaa6hn6E8hUPefTgibgHuAu6X9JHKkel/Jz51\nLfH6qOoh4L3ATcBB4H80tzvNJ2kK8BfA5yPiZOW48f4zlKdwOAAsqvi8MNVyISIOpPfDwI8ob/K/\nlTZfSe+HU/Na6+paX4fDtT4OpOHB9XEtIt6KiL6I6Ae+SflnCC5//RylvFulNKg+rkhqoRwM342I\nH6byNfMzlKdw2AUsTWdJtAL3Alua3KdRIWmypKkDw8AK4AXKyz9wdsQa4PE0vAVYnc6wWA6cSJvK\n24AVkmamXQorUu1aMSzrI407KWl52r++umJe49bAH73kn1P+GYLy+rlXUpukJcBSygdTq/7Opf9R\nPwl8LE1fua7HhfTv+gjwckT8ccWoa+dnqNlH/UfzRfmMgdcon0Hx+83uzygu93sonynyS+DFgWWn\nvO93B7AH+BtgVqoL+EZaT88DHRXz+jeUDzh2Afc1e9muYp18j/KukUuU9+euHc71AXRQ/uP5OvA/\nSRecjpdXjfXznbT8uyn/sZtf0f7307K+SsVZNbV+59LP5DNpvf0AaGv2Ml/m+vkw5V1Gu4Hn0uvu\na+lnyFdIm5nZEHnarWRmZg1yOJiZ2RAOBzMzG8LhYGZmQzgczMxsCIeDmZkN4XAwM7MhHA5mZjbE\n/wf3xMIpnDEfowAAAABJRU5ErkJggg==\n", 109 | "text/plain": [ 110 | "
" 111 | ] 112 | }, 113 | "metadata": {}, 114 | "output_type": "display_data" 115 | } 116 | ], 117 | "source": [ 118 | "plt.plot(a.values)\n", 119 | "plt.show()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 32, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHItJREFUeJzt3Xl8VOW9x/HPbyYbCQlrCEsImyCC\nyBYWAbUuIFqXutTlal2qpdZatdqF2vZqr7223lqtS22lSt1LtcXrUiwigmyCBGRHCCAICEnYw06S\n5/6RgRuRkEmYyTln8n2/XnnlcObM5HceZ76eec5zzmPOOUREJDhCXhcgIiK1o+AWEQkYBbeISMAo\nuEVEAkbBLSISMApuEZGAUXCLiASMgltEJGCSotnIzNYCpUA5UOacy49nUSIiUr2ogjvibOfclmg2\nbNmypevYsWPdKhIRaYDmzZu3xTmXHc22tQnuqHXs2JGCgoJ4vLSISEIys3XRbhttH7cD3jOzeWY2\nqm5liYhILER7xD3MObfRzFoBk8zsU+fctKobRAJ9FEBeXl6MyxQRkcOiOuJ2zm2M/C4G3gAGHmOb\nMc65fOdcfnZ2VN00IiJSBzUGt5llmFnm4WVgBLAk3oWJiMixRdNVkgO8YWaHt3/VOffvuFYlIiLV\nqjG4nXNrgN71UIuIiERBV06KiASMr4L7icmFfLiyxOsyRER8zVfB/cyHq5m6otjrMkREfM1XwZ2Z\nlszu/WVelyEi4ms+C+4kShXcIiLH5b/gPnDI6zJERHzNZ8GdrCNuEZEa+Cy41VUiIlITnwV3MqX7\n1VUiInI8vgrurLQkdumIW0TkuPwV3I2SOVhWwf5D5V6XIiLiW74K7laZqQAU7zrgcSUiIv7lq+DO\nyUoDoKh0v8eViIj4lz+De5eCW0SkOj4L7squkiJ1lYiIVMtXwd2kUTIpSSGKdcQtIlItXwW3mZGT\nlcpmBbeISLV8FdwArbPS1MctInIcvgvuVllpGg4oInIcvgvu1llpbNixj4oK53UpIiK+5Lvg7pyd\nwcGyCtZv3+t1KSIivuS74O6X1wyABet3eFyJiIg/+S64u2Q3JilkfLq51OtSRER8yXfBnZIUIq9F\nOmu37PG6FBERX/JdcAO0b5auPm4RkWr4MrjzmqezbutenNPIEhGRo/kyuLtkZ1C6v4yS3RrPLSJy\nNF8Gd7ecTACWb9IJShGRo/kyuE/NbQLAIg0JFBH5Cl8Gd1ZaMl2yM1i4YafXpYiI+I4vgxugR9sm\nrCja5XUZIiK+49vg7taqMeu37WPvQc36LiJSVdTBbWZhM/vEzN6JZ0GHdY2coCws2l0ff05EJDBq\nc8R9F7A8XoUcrVtOYwBWFmlkiYhIVVEFt5nlAl8Hno1vOf+vQ4sMUpJCFBbriFtEpKpoj7j/APwE\nqIhjLV8SDhldshvriFtE5Cg1BreZXQQUO+fm1bDdKDMrMLOCkpKSmBR3ck5jlm/SyBIRkaqiOeIe\nClxiZmuBccA5Zvby0Rs558Y45/Kdc/nZ2dkxKa5P+6YU7TrAxh37YvJ6IiKJoMbgds79zDmX65zr\nCFwDfOCcuz7ulQH5HZsDULB2W338ORGRQPDtOG6A7q0zSU8JM3/ddq9LERHxjaTabOycmwpMjUsl\nx5AUDtE7tynzPldwi4gc5usjboD+HZqxfFOprqAUEYnwfXAP7NSc8grHzFVbvS5FRMQXfB/cp3dp\nQXpKmHcXb/K6FBERX/B9cCeHQ1yV3563Fn5B0a79XpcjIuI53wc3wM1DO1LuHK/MXud1KSIingtE\ncHdokcG53VvxypzPOVBW7nU5IiKeCkRwA9xweke27jnIe0uLvC5FRMRTgQnuoSe1pG2TNF6ft8Hr\nUkREPBWY4A6HjCv65zK9sIRNO3XvEhFpuAIT3ABX9s/FORg/f6PXpYiIeCZQwd2hRQYDOzbn9YL1\nOOe8LkdExBOBCm6AK/NzWbt1L3PX6v4lItIwBS64v96rDekpYf4+d73XpYiIeCJwwZ2RmsSlfdry\nzqIvdJJSRBqkwAU3wK1ndOZAWQXvLNT9S0Sk4QlkcHfJbkx+h2a8NHsd5RU6SSkiDUsggxvg5qGd\n+HzbXiYt05WUItKwBDa4z++ZQ/vmjfjL9DVelyIiUq8CG9xJ4RC3DO3EvHXbmbdOkwmLSMMR2OAG\n+GZ+e5qlJ/ObCZ/qghwRaTACHdwZqUn8+PzuFKzbzjuLNMJERBqGQAc3wNUD2tMtpzFPflDIofIK\nr8sREYm7wAd3OGTcM/xkVhbt5qkPVnldjohI3AU+uAFGntqay/q244kPCvn3ks1elyMiElcJEdwA\nD13Wi965Tblz3CcaZSIiCS1hgrtRSpi/3jSAVpmp3P33BZSUHvC6JBGRuEiY4AZolpHCk9f2paT0\nAJc9PZPlm3Z5XZKISMwlVHAD9M1rxrhRp3OovILLnp7JmGmrdT8TEUkoCRfcAH3aN+XtHwxj2Ekt\neWjCp1z+9ExWFpV6XZaISEwkZHADtMpM4y835PP4NX3YsH0fFz85g99MWM7+Q+VelyYickISNrgB\nzIxL+7Tj3bvP4MJebXhm2houeHw6M1dt0SXyIhJYCR3ch7XKTOOxq/vw8i2DKK9wXPfsHG7661w+\nWr1V/d8iEjhW05GnmaUB04BUIAn4h3Pu/uM9Jz8/3xUUFMSsyFjad7Ccl2ev4/HJhew+UEbbJml8\n58zOXDeoAylJDeL/YyLiQ2Y2zzmXH9W2UQS3ARnOud1mlgzMAO5yzs2u7jl+Du7D9h4s4/3lxbwy\nex1zPttGt5zGPHfjANo3T/e6NBFpgGoT3DUeYrpKuyP/TI78BL5/IT0liUt6t2XcqME8d2M+m3fu\n54o/zWLJxp1elyYiclxR9Q2YWdjMFgDFwCTn3Jz4llV/zIxzT8lh3KjTCYeMq575iKkrir0uS0Sk\nWlEFt3Ou3DnXB8gFBprZqUdvY2ajzKzAzApKSkpiXWfc9WibxZvfH0pe83RueaGAX729lKJd+70u\nS0TkK2p1Ns45twOYAow8xmNjnHP5zrn87OzsWNVXr1plpfH6badzZb9cXpi1lrMfmcofp6ziYJnu\n8y0i/lFjcJtZtpk1jSw3AoYDn8a7MK
9kpiXz8JWn8f49Z3FG15b8buIKhvz2AyYs1gw7IuIP0Rxx\ntwGmmNkiYC6VfdzvxLcs73XObswz38rnpVsG0rZpGre/Mp/bXprHjr0HvS5NRBq4GocD1kUQhgPW\nxsGyCsZMW80Tk1eR1SiZe4Z34+oB7QmHzOvSRCRBxHQ4oEBKUog7zunK67edTscW6dz3xmKuf3YO\nxTp5KSIeUHDXQu/2TXn9ttP5zeW9KFi3jeGPTaNgrWbbEZH6peCuJTPj2oF5TLjzDJpnpHDriwUU\n6paxIlKPFNx11DUnk7E3DSApZFwzZjafb93rdUki0kAouE9Ap5YZjBs1mEPlFdz0/Me6YEdE6oWC\n+wSd1CqT524aQNHO/dz817ns3HfI65JEJMEpuGNgQMfmPHVdP1YWlfIff5mt8BaRuFJwx8jZJ7fi\nT9f3Z8XmUq4dM5udexXeIhIfCu4YGt4jh7/ckM/KolKu/PMshbeIxIWCO8bO7t6K528eyGdb9nD7\nq/Mo3a/wFpHYUnDHwbCuLfntFacxe802Ln96lkabiEhMKbjj5Mr+ufz1pgFs3LGPa8bMZvNOhbeI\nxIaCO47O7JbN2JsGsGnnPi57eiYzCrd4XZKIJAAFd5wN7tyCv31nMMnhENc/N4cbx37M1t0HvC5L\nRAJMwV0P+uY14927zuCnI7vz0ZqtjHx8OhOXbva6LBEJKAV3PclITeJ7X+vCG7cPIT0lzHdfmscd\nr85n2x5NzCAitaPgrmc92zZh4t1n8oNzTuLdJZs5+5GpfLR6q9dliUiAKLg9kJYc5t4RJ/PG7UNo\nlp7Mdc/OZvz8DV6XJSIBoeD20Gm5TXn9tiH079CMe15byCMTV1BREfup5EQksSi4PZadmcpLtwzi\n0j5teWrKKv7j2dls0agTETkOBbcPpCWH+cPVffjF10+hYO12LnlyBnM1JZqIVEPB7RNmxq1ndOaV\nWwdR7hxXP/MRf5yyCufUdSIiX6bg9plBnVvw3g/P4pzurfjdxBXcOW6BblQlIl+i4PahJo2SeeZb\n+Xz/7C68vfALbhz7MWtKdntdloj4hILbp8Ih48fnd+eRb/ZmyRe7GPn4dF6YtVZdJyKi4Pa7K/vn\nMvmes+if14z731rKLS8UsH6bZpQXacgU3AHQvnk6r9w6iHuHd2NG4RZGPDaN1wrWe12WiHhEwR0Q\noZDxg3O78v49Z9GlVQY/+ccirh0zm2JN0iDS4Ci4AyavRTrjvzeUH57XjdmfbeXsR6byv59s9Los\nEalHCu4ASkkKcdd5XXnl1kFkpCZx998XcOHj01m0YYfXpYlIPVBwB9iQLi2Z/tOzufPcrqwoKuWS\np2Yy+p+LKNf9TkQSmoI74FKTwtwzvBszfno23XIaM27uekb+YZpGnogksBqD28zam9kUM1tmZkvN\n7K76KExqp02TRvz7rjO578LuFBbv5oz/mcJ/vb2MPQfKvC5NRGIsmiPuMuBe51wPYDDwfTPrEd+y\npC5CIWPUmV145wfD6Nk2i7EzP6Pvg5N4bNJKDpSVe12eiMRIjcHtnNvknJsfWS4FlgPt4l2Y1N2p\n7ZrwrzvP4Onr+tEsPZnHJxfS/8H3mblKs8yLJIJa9XGbWUegLzAnHsVIbF3Yqw1z7juPh6/oxcGy\nCq57dg63PD9X/d8iAWfR3vvCzBoDHwL/7Zwbf4zHRwGjAPLy8vqvW7culnXKCSou3c994xfz/vJi\nAC7p3ZZfXtSD7MxUjysTEQAzm+ecy49q22iC28ySgXeAic65R2vaPj8/3xUUFETz96WeLd6wkwfe\nXsq8ddsB+NbgDvz0gu40Tk3yuDKRhi2mwW1mBrwAbHPO3R3Niyq4/W/W6i38bPxi1m2t7Da5aUhH\nfnheN5qkJ3tcmUjDFOvgHgZMBxYDFZHV9znnJlT3HAV3cLy7eBO/ensZmyP3PBnZszUPX3GaAlyk\nnsW8q6S2FNzB83rBekaPX3zkqstffP0Ubj2js8dViTQctQluXTkpAHwzvz2Fv76Ae4Z3A+DX/1rO\nub+fSpHuPijiOwpuOSIUMu48tysf/ewc2jVtxOqSPQx6aDKj/7mI/Yd0AY+IXyi45SvaNGnEzNHn\ncP/FlRfIjpu7nu6//LcmbxDxCQW3VOvmoZ1Y9d8X8M3+uQD85B+L6PXARN5csFFzX4p4SCcnJSob\nd+zjR68t5KM1WwFolBzmlxf14NqB7akcMSoiJ0KjSiRuVpfs5ldvL2PayhIAMlLCTPzhmeQ2S/e4\nMpFg06gSiZsu2Y158dsD+fjn59ItpzF7DpYz7OEpjHqxQLeQFaknCm6pk1aZabz3w7N4+IpeALy3\nrIie90/kicmFVGgGHpG4UnDLCbl6QB5rHrqQm4Z0BODRSSvpfN8Elmzc6W1hIglMwS0nLBQyHrik\nJwv+czj98poCcNGTM7j86ZmsKt7tcXUiiUfBLTHTND2F8bcP5U/X9SMzNYn5n+/gvEc/5Jt/nsX2\nPQe9Lk8kYSi4JeYu6NWGRQ+M4Mlr+5KVlsTctdvp++AkfvG/izlUXlHzC4jIcSm4JS7MjIt7t2Xh\n/SP4xddPAeDl2Z/T9efv8scpq3QJvcgJUHBLXJkZt57RmRW/HslFp7UB4HcTV9D9l//mmQ9XawSK\nSB3oAhypV7sPlPHQhOW8OufzI+sevao3l/fL9bAqEe/pyknxvR17D3LHq58wIzLzfFLIeOzqPlzc\nu63HlYl4Q1dOiu81TU/h5VsH8fHPz+WsbtmUVTh+8LdPOOeRqcwo3KIuFJHj0BG3+MLSL3Zy3/jF\nLNxQeeFO66w0fnVpT87v2drjykTqh7pKJLBmrdrCneMWsGX3AQB6ts3iRyNOZkCn5pqJXhKaglsC\nzTnHko27+P2kFUxdUXJk/T3Du3Fhrzac1Kqxh9WJxIeCWxLGzFVbmLSsiOdnrT2y7rtndebHI04m\nKaxTNJI4FNyScEpKDzB25mf8aepqANJTwtwyrBP3jjjZ48pEYkPBLQmrdP8h/vPNpbzxyUYAwiHj\nmev7M7hLC/WBS6ApuCXhrS7ZzZgP1/D3KhMY//N7Qzi1XRapSWEPKxOpGwW3NBhTVhQzdsZnTC+s\nvJBnUKfm3DO8G71ym5CeoiNwCQ4FtzQoh8ormLNmGzeMncPh63a+0act3zq9Ax1bZNCicaq3BYpE\nQcEtDVLRrv2sLtnNg+8sZ/mmXUDlOPAnr+1LZloy2ZkKcPEvBbc0aJt37mdlUSnj5n7OhMWbgcqT\nmLNGn0NOVprH1YkcW22CW52AknBaN0mjdZM0Tm3XhPN7tmZ18W6e+GAVgx6aDMD1g/P49Td6eVyl\nSN0puCVhNc9I4dI+7ThYVkFmWjK7D5Txr8WbGD9/I8u+2EVSOMT9F/egZ9smXpcqUisKbkl4KUkh\nvnNmZwBOaZPFK3PW4RzMWLWFR99bycBOzclqlMzV+e0JhczjakVqpuCWBmXkqa0ZeWrlHQfPfmQ
q\nkz8tZvKnxQB0y8mkZ9ssQmakJOlyevGvGk9OmtlY4CKg2Dl3ajQvqpOTEgRl5RUcLK9gycZdXPXM\nR0fWm8Ez1/dnhG4pK/Uo1icnnweeAl48kaJE/CYpHCIpHCK/QzMe/MaplO4/hHOVc2K+NHsdKzaX\nAtCxZYZm5hFfqTG4nXPTzKxj/EsR8UYoZHxrcIcj/3574RdML9xy5GpMMxjRM0eX0otvqI9b5Cjv\n3nUG5ZFLMF+Z8zn3v7WUy5+eRVLkxOVJrTL5/VW9vSxRGriYnYExs1FmVmBmBSUlJTU/QcSnzOxI\nN8rXTs5mRI8csjNTaZaRQumBMv45fwMHysq9LlMasKiunIx0lbyjk5PS0L300Vp++eZSmqUnE7Iv\nDx3snJ3Ba989HTMNKZTa05WTInEyomdr1mzZw6Hyii+tX76plLlrt7PvULnuSihxV+M7zMz+BnwN\naGlmG4D7nXPPxbswET/KyUrj/ot7fmX9y7PXMW/ddi55auaRvnCA1OQwj13Vm87ZmidTYieaUSXX\n1kchIkF2VrdsLu7dloNV+r73HixneuEWFm3YqeCWmNJ3OpEYaN88nSev7fuldcW79jPwocm8t2wz\nW3YfOObz8js2p0/7pvVRoiQQBbdInDRJT6Zl4xQmLN585PayR+ud24Q37xhWz5VJ0Cm4ReIkNSnM\nrNHnVjt08MevL2JlcWk9VyWJQMEtEkcpSaFqb1iVmZbExu37+Pbzc4/5+IgeOVwzMC+e5UlAKbhF\nPHLuKTmsKCqlpPSr/d9rt+6huHS/gluOScEt4pGqt5g92u2vzGNl0e56rkiCQsEt4kNpyWFKSg/w\n5w9XV7tNr3ZNGHpSy3qsSvxCwS3iQye1asz4+Rv57bufVrtNu6aNmDn6nHqsSvxCwS3iQ7d/7SRu\nHtKp2sf/651lTFx67CGGkvgU3CI+1Sil+vt/p6eEOVhWUe3jktgU3CIBlJIUYt+hch5/vzCq7VOT\nQ1w3KI/MtOQ4Vyb1QcEtEkDdchpT4RyPvb8y6ue0bdqISzQFW0JQcIsE0GV9c7m0d7uotl2/fS9n\n/W4qBw5p8odEoeAWCahQKLoJGw7PlXmovOZJUyQYYjZ1mYj4U3K4MuDLKnQyM1HoiFskwSVH7pVS\nsHY7jZJrP1N9vw7N6KL7ifuKglskwTVKDpOVlsRbC7/grYVf1Pr5Q7q04NXvDI5DZVJXCm6RBJcc\nDjFj9Dns3Huo1s+997WF7NNJTd9RcIs0AFlpyWTVYQx3RmqYrXsU3H6jk5MiUq1wKESZRqP4joJb\nRKqVFDLKKxTcfqPgFpFqhcOmYYQ+pD5uEalWUsjYd7CcwqLYz41pBp1aNiYc5YVE8v8U3CJSrfSU\nJL7YuZ/hj02Ly+t//+wu/Pj87nF57USm4BaRat07ohtDT2oRl9f+2fjFbNtT+yGKouAWkeNo2TiV\ni06Lzx0FH3xnGc7pxGdd6OSkiHgibBqxUlcKbhHxhJmh3K4bBbeIeCIUggp1ldSJgltEPBE2U3DX\nkYJbRDwRUldJnSm4RcQToZBRoeSuk6iC28xGmtkKM1tlZqPjXZSIJL6QqY+7rmoMbjMLA38ELgB6\nANeaWY94FyYiiS2k4YB1Fs0R90BglXNujXPuIDAOuDS+ZYlIolMfd91Fc+VkO2B9lX9vAAbFpxwR\naShCIfho9RaGP/qh16XETLP0FF677fS4/52YXfJuZqOAUQB5eXmxelkRSVA3DenEB58WeV1GTNVl\nlqG6iCa4NwLtq/w7N7LuS5xzY4AxAPn5+foCJCLHdWX/XK7sn+t1GYEUTR/3XKCrmXUysxTgGuCt\n+JYlIiLVqfGI2zlXZmZ3ABOBMDDWObc07pWJiMgxRdXH7ZybAEyIcy0iIhIFXTkpIhIwCm4RkYBR\ncIuIBIyCW0QkYBTcIiIBY/GYrNPMSoB1dXx6S2BLDMtJNGqf41P7HJ/ap3pet00H51x2NBvGJbhP\nhJkVOOfyva7Dr9Q+x6f2OT61T/WC1DbqKhERCRgFt4hIwPgxuMd4XYDPqX2OT+1zfGqf6gWmbXzX\nxy0iIsfnxyNuERE5Dt8Ed0OekNjM1prZYjNbYGYFkXXNzWySmRVGfjeLrDczeyLSTovMrF+V17kx\nsn2hmd3o1f6cKDMba2bFZrakyrqYtYeZ9Y+096rIc61+9/DEVNM+D5jZxsh7aIGZXVjlsZ9F9nWF\nmZ1fZf0xP3ORWzjPiaz/e+R2zoFhZu3NbIqZLTOzpWZ2V2R94ryHnHOe/1B5u9jVQGcgBVgI9PC6\nrnrc/7VAy6PW/Q8wOrI8Gng4snwh8C5gwGBgTmR9c2BN5HezyHIzr/etju1xJtAPWBKP9gA+jmxr\nkede4PU+x6B9HgB+dIxte0Q+T6lAp8jnLHy8zxzwGnBNZPnPwPe83udatk8boF9kORNYGWmHhHkP\n+eWIWxMSf9WlwAuR5ReAb1RZ/6KrNBtoamZtgPOBSc65bc657cAkYGR9Fx0LzrlpwLajVsekPSKP\nZTnnZrvKT+CLVV4rEKppn+pcCoxzzh1wzn0GrKLy83bMz1zkyPEc4B+R51dt60Bwzm1yzs2PLJcC\ny6mcOzdh3kN+Ce5jTUjczqNavOCA98xsXmTuToAc59ymyPJmICeyXF1bJXobxqo92kWWj16fCO6I\nfNUfe7gbgNq3Twtgh3Ou7Kj1gWRmHYG+wBwS6D3kl+Bu6IY55/oBFwDfN7Mzqz4Y+b+6hv9EqD2O\n6U9AF6APsAn4vbfleM/MGgP/BO52zu2q+ljQ30N+Ce6oJiROVM65jZHfxcAbVH6NLYp8JSPyuziy\neXVtlehtGKv22BhZPnp9oDnnipxz5c65CuAvVL6HoPbts5XKroKko9YHipklUxnarzjnxkdWJ8x7\nyC/B3WAnJDazDDPLPLwMjACWULn/h89i3wi8GVl+C7ghciZ8MLAz8vVvIjDCzJpFviaPiKxLFDFp\nj8hju8xscKQ/94YqrxVYhwMp4jIq30NQ2T7XmFmqmXUCulJ5Yu2Yn7nIkegU4MrI86u2dSBE/rs+\nByx3zj1a5aHEeQ95fQb48A+VZ3ZXUnmm++de11OP+92ZyjP6C4Glh/edyr7GyUAh8D7QPLLegD9G\n2mkxkF/ltb5N5cmnVcDNXu/bCbTJ36j8un+Iyv7DW2LZHkA+lcG2GniKyIVoQfmppn1eiuz/IiqD\nqE2V7X8e2dcVVBn9UN1nLvKe/DjSbq8DqV7vcy3bZxiV3SCLgAWRnwsT6T2kKydFRALGL10lIiIS\nJQW3iEjAKLhFRAJGwS0iEjAKbhGRgFFwi4gEjIJbRCRgFNwiIgHzfwvnQoSGc4mTAAAAAElFTkSu\nQmCC\n", 130 | "text/plain": [ 131 | "
" 132 | ] 133 | }, 134 | "metadata": {}, 135 | "output_type": "display_data" 136 | } 137 | ], 138 | "source": [ 139 | "plt.plot(np.log10(a.values))\n", 140 | "plt.show()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 2", 161 | "language": "python", 162 | "name": "python2" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 2 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython2", 174 | "version": "2.7.6" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | -------------------------------------------------------------------------------- /data_files/chembl_preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from __future__ import division\n", 12 | "\n", 13 | "import os\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "from scipy import stats\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "%matplotlib inline" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 30, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "from rdkit import Chem\n", 30 | "from rdkit.Chem import Draw\n", 31 | "from rdkit.Chem.Draw import IPythonConsole\n", 32 | "from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "txt_dir = \"chembl_source\"" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [ 53 | { 54 | "name": "stderr", 55 | "output_type": "stream", 56 | "text": [ 57 | "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (1,3,6,7,8,9,11,14,16,19,23,27,31,34,35,38,44,48,50,52,53,54,55,56,57,58) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", 58 | "  interactivity=interactivity, compiler=compiler, result=result)\n" 59 | ] 60 | }, 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "(1235867, 59)" 65 | ] 66 | }, 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "# read all chembl bioactivity records\n", 74 | "chembl = pd.read_csv(os.path.join(txt_dir, \"inhibitor_2017_06_08.csv\"), delimiter=\"\\t\")\n", 75 | "chembl.shape" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": { 82 | "collapsed": false, 83 | "scrolled": true 84 | }, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "(1230260, 59)" 90 | ] 91 | }, 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "# remove records that have no canonical smiles\n", 99 | "m = chembl[\"CANONICAL_SMILES\"].isnull()\n", 100 | "chembl = chembl[~m]\n", 101 | "chembl.shape" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 21, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "# save inhibitors' smiles and apfp\n", 113 | "smiles = chembl[[\"CMPD_CHEMBLID\", \"CANONICAL_SMILES\"]].copy()\n", 114 | "smiles.drop_duplicates(subset=\"CMPD_CHEMBLID\", inplace=True)\n", 115 | "smiles.set_index(keys=\"CMPD_CHEMBLID\", drop=True, inplace=True)\n", 116 | "smiles.to_csv(txt_dir + \"/inhibitor_smiles.csv\")" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 31, 122 | "metadata": { 123 | "collapsed": false, 124 | "scrolled": true 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "CHEMBL1161633\n", 132 | "CHEMBL2097021\n", 133 | "CHEMBL471869\n", 134 | "CHEMBL1161635\n", 135 | "CHEMBL181124\n", 136 | "CHEMBL1161637\n", 137 | "CHEMBL181880\n", 138 | "CHEMBL3593577\n", 139 | "CHEMBL450200\n", 140 | "CMPD_CHEMBLID\n", 141 | "CHEMBL450642\n", 142 | "CHEMBL2205792\n", 143 | "CHEMBL2205793\n", 144 | "CHEMBL490121\n", 145 | "CHEMBL523281\n", 146 | "CHEMBL463327\n", 147 | "CHEMBL522826\n", 148 | "CHEMBL2205790\n", 149 | "CHEMBL495469\n", 150 | "CHEMBL2205791\n", 151 | "CHEMBL492602\n", 152 | "CHEMBL2205788\n", 153 | "CHEMBL2205787\n", 154 | "CHEMBL452133\n", 155 | "CHEMBL2205785\n", 156 | "CHEMBL508580\n", 157 | "CHEMBL508803\n", 158 | "CHEMBL2205789\n", 159 | "CHEMBL2205786\n", 160 | "CHEMBL493431\n", 161 | "CHEMBL2087763\n", 162 | "CHEMBL2087764\n", 163 | "CHEMBL2179461\n", 164 | "CHEMBL2179458\n", 165 | "CHEMBL2179464\n", 166 | "CHEMBL2179462\n", 167 | "CHEMBL2179459\n", 168 | "CHEMBL2179463\n", 169 | "CHEMBL1083554\n", 170 | "CHEMBL2179460\n", 171 | "CHEMBL3327018\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "def dict_2_str(d):\n", 177 | "    keylist = d.keys()\n", 178 | "    keylist.sort()\n", 179 | "    kv_list = [\"{}: {}\".format(k, d[k]) for k in keylist] \n", 180 | "    return \", \".join(kv_list)\n", 181 | "\n", 182 | "apfp_file = open(txt_dir + \"/inhibitor_apfp.csv\", \"w\")\n", 183 | "for id_, row in smiles.iterrows():\n", 184 | "    m = Chem.MolFromSmiles(row.values[0])\n", 185 | "    if m is None:\n", 186 | "        print id_\n", 187 | "        continue\n", 188 | "    apfps = GetAtomPairFingerprint(Chem.RemoveHs(m)).GetNonzeroElements()\n", 189 | "    apfp_file.write(\"%s\\t{%s}\\n\" % (id_, dict_2_str(apfps)))\n", 190 | "apfp_file.close()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 6, 
196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "# calculate some molecules's weight\n", 202 | "def molwt(x):\n", 203 | " try:\n", 204 | " value = Chem.Descriptors.MolWt(Chem.MolFromSmiles(x))\n", 205 | " except:\n", 206 | " value = np.nan\n", 207 | " return value\n", 208 | "\n", 209 | "m = chembl[\"MOLWEIGHT\"].isnull()\n", 210 | "chembl.loc[m, \"MOLWEIGHT\"] = chembl.loc[m, \"CANONICAL_SMILES\"].apply(molwt)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 8, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "(1223639, 59)" 224 | ] 225 | }, 226 | "execution_count": 8, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "# remove molecules that has no \"MOLWEIGHT\"\n", 233 | "m = chembl[\"MOLWEIGHT\"].isnull()\n", 234 | "chembl = chembl[~m]\n", 235 | "chembl.shape" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 32, 241 | "metadata": { 242 | "collapsed": false, 243 | "scrolled": false 244 | }, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/plain": [ 249 | "(835299, 59)" 250 | ] 251 | }, 252 | "execution_count": 32, 253 | "metadata": {}, 254 | "output_type": "execute_result" 255 | } 256 | ], 257 | "source": [ 258 | "# pick out inhibitor records\n", 259 | "inhibitor = chembl[chembl[\"STANDARD_TYPE\"].isin([\"IC50\", \"Ki\", \"EC50\"])]\n", 260 | "\n", 261 | "# inhibitor records: all IC50, a part of Ki and EC50 with \"inhibit\" in \"DESCRIPTION\"\n", 262 | "m0 = inhibitor[\"STANDARD_TYPE\"].isin([\"IC50\"]) \n", 263 | "m1 = inhibitor[\"STANDARD_TYPE\"].isin([\"Ki\", \"EC50\"]) \n", 264 | "m2 = inhibitor[\"DESCRIPTION\"].apply(lambda x: \"inhibit\" in x.lower())\n", 265 | "m = m0 | (m1 & m2)\n", 266 | "\n", 267 | "inhibitor = inhibitor[m]\n", 268 | "inhibitor.shape" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 33, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "(716442, 59)" 282 | ] 283 | }, 284 | "execution_count": 33, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "# some records without \"STANDARD_VALUE\" should be cleared away\n", 291 | "m = inhibitor[\"STANDARD_VALUE\"].isnull()\n", 292 | "inhibitor = inhibitor[~m]\n", 293 | "inhibitor.shape" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 34, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "Outside typical range 26411\n", 307 | "Potential transcription error 378\n", 308 | "Non standard unit for type 370\n", 309 | "Manually validated 163\n", 310 | "Name: DATA_VALIDITY_COMMENT, dtype: int64" 311 | ] 312 | }, 313 | "execution_count": 34, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "inhibitor[\"DATA_VALIDITY_COMMENT\"].value_counts()" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 35, 325 | "metadata": { 326 | "collapsed": false 327 | }, 328 | "outputs": [ 329 | { 330 | "data": { 331 | "text/plain": [ 332 | "(690031, 59)" 333 | ] 334 | }, 335 | "execution_count": 35, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "# some records with abnormal data also should be cleared away\n", 342 | "#error_comment = 
[\"Outside typical range\", \"Non standard unit for type\", \"Potential transcription error\"]\n", 343 | "error_comment = [\"Outside typical range\"]\n", 344 | "m = inhibitor[\"DATA_VALIDITY_COMMENT\"].isin(error_comment)\n", 345 | "inhibitor = inhibitor[~m]\n", 346 | "inhibitor.shape" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 36, 352 | "metadata": { 353 | "collapsed": false 354 | }, 355 | "outputs": [ 356 | { 357 | "data": { 358 | "text/plain": [ 359 | "(689725, 59)" 360 | ] 361 | }, 362 | "execution_count": 36, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | } 366 | ], 367 | "source": [ 368 | "# correct some STANDARD_UNITS\n", 369 | "m = inhibitor[\"STANDARD_UNITS\"].isin([\"/uM\"])\n", 370 | "inhibitor.loc[m, \"STANDARD_VALUE\"] = inhibitor.loc[m, \"STANDARD_VALUE\"].astype(float).values * 1000\n", 371 | "inhibitor.loc[m, \"STANDARD_UNITS\"] = \"nM\"\n", 372 | "\n", 373 | "m = inhibitor[\"STANDARD_UNITS\"].isin([\"/nM\", \"ug nM-1\", \"Ke nM-1\"])\n", 374 | "inhibitor.loc[m, \"STANDARD_UNITS\"] = \"nM\"\n", 375 | "\n", 376 | "m = inhibitor[\"STANDARD_UNITS\"].isin([\"ug.mL-1\"])\n", 377 | "inhibitor.loc[m, \"STANDARD_VALUE\"] = inhibitor.loc[m, \"STANDARD_VALUE\"].astype(float) / inhibitor.loc[m, \"MOLWEIGHT\"].astype(float) * 10**6\n", 378 | "inhibitor.loc[m, \"STANDARD_UNITS\"] = \"nM\"\n", 379 | "\n", 380 | "m = inhibitor[\"STANDARD_UNITS\"].isin([\"nM\"])\n", 381 | "inhibitor = inhibitor[m]\n", 382 | "inhibitor.shape" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 37, 388 | "metadata": { 389 | "collapsed": false 390 | }, 391 | "outputs": [ 392 | { 393 | "data": { 394 | "text/plain": [ 395 | "(662788, 59)" 396 | ] 397 | }, 398 | "execution_count": 37, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "# remove duplicates\n", 405 | "m = inhibitor[\"POTENTIAL_DUPLICATE\"].fillna(0).astype(int) == 0\n", 406 | "inhibitor = inhibitor[m]\n", 407 | "inhibitor.shape" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 39, 413 | "metadata": { 414 | "collapsed": false 415 | }, 416 | "outputs": [], 417 | "source": [ 418 | "inhibitor.to_csv(txt_dir + \"/inhibitor_clean_2017_06_08.csv\", index=False)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": { 425 | "collapsed": true 426 | }, 427 | "outputs": [], 428 | "source": [] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": { 434 | "collapsed": true 435 | }, 436 | "outputs": [], 437 | "source": [ 438 | "# judge a record's clf label\n", 439 | "def is_pos(row):\n", 440 | " r = row[\"RELATION\"]\n", 441 | " v = np.float32(row[\"STANDARD_VALUE\"])\n", 442 | " if r == \"<\" or r == \"<=\":\n", 443 | " return 1 if v <= 10000 else np.nan\n", 444 | " elif r == \">\" or r == \">=\":\n", 445 | " return -1 if v >= 10000 else np.nan\n", 446 | " elif r == \"=\":\n", 447 | " return 1 if v <= 10000 else -1\n", 448 | " else:\n", 449 | " return np.nan" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 89, 455 | "metadata": { 456 | "collapsed": true 457 | }, 458 | "outputs": [], 459 | "source": [ 460 | "inhibitor[\"CLF_LABEL\"] = inhibitor.apply(is_pos, axis=1)\n", 461 | "inhibitor = inhibitor[~inhibitor[\"CLF_LABEL\"].isnull()]\n", 462 | "inhibitor.loc[:, \"YEAR\"] = inhibitor.loc[:, \"YEAR\"].astype(float)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 131, 
468 | "metadata": { 469 | "collapsed": false, 470 | "scrolled": true 471 | }, 472 | "outputs": [ 473 | { 474 | "data": { 475 | "text/html": [ 476 | "
\n", 477 | "\n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 
750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | "
TARGET_CHEMBLIDPREF_NAMECMPD_CHEMBLIDCLF_LABELYEAR
0CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL1092618-1.02010.0
1CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL1092619-1.02010.0
2CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL1093582-1.02010.0
3CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL1093848-1.02010.0
4CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL2398350-1.02013.0
5CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL2398352-1.02013.0
6CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL4646511.02010.0
7CHEMBL1075097Arginase-1CHEMBL10991691.02010.0
8CHEMBL1075101G-protein coupled receptor 81CHEMBL37148171.0NaN
9CHEMBL1075101G-protein coupled receptor 81CHEMBL37148791.0NaN
10CHEMBL1075101G-protein coupled receptor 81CHEMBL37148851.0NaN
11CHEMBL1075101G-protein coupled receptor 81CHEMBL37149091.0NaN
12CHEMBL1075101G-protein coupled receptor 81CHEMBL37149601.0NaN
13CHEMBL1075101G-protein coupled receptor 81CHEMBL37149701.0NaN
14CHEMBL1075101G-protein coupled receptor 81CHEMBL37150041.0NaN
15CHEMBL1075101G-protein coupled receptor 81CHEMBL37150171.0NaN
16CHEMBL1075101G-protein coupled receptor 81CHEMBL37150771.0NaN
17CHEMBL1075101G-protein coupled receptor 81CHEMBL37151551.0NaN
18CHEMBL1075101G-protein coupled receptor 81CHEMBL37151741.0NaN
19CHEMBL1075101G-protein coupled receptor 81CHEMBL37152181.0NaN
20CHEMBL1075101G-protein coupled receptor 81CHEMBL37153591.0NaN
21CHEMBL1075101G-protein coupled receptor 81CHEMBL37153611.0NaN
22CHEMBL1075101G-protein coupled receptor 81CHEMBL37153751.0NaN
23CHEMBL1075101G-protein coupled receptor 81CHEMBL37153941.0NaN
24CHEMBL1075101G-protein coupled receptor 81CHEMBL37154781.0NaN
25CHEMBL1075101G-protein coupled receptor 81CHEMBL37155361.0NaN
26CHEMBL1075101G-protein coupled receptor 81CHEMBL37155581.0NaN
27CHEMBL1075101G-protein coupled receptor 81CHEMBL37155721.0NaN
28CHEMBL1075101G-protein coupled receptor 81CHEMBL37155771.0NaN
29CHEMBL1075101G-protein coupled receptor 81CHEMBL37155991.0NaN
..................
542203CHEMBL6175Lysine-specific demethylase 4CCHEMBL37868621.02016.0
542204CHEMBL6175Lysine-specific demethylase 4CCHEMBL37869521.02016.0
542205CHEMBL6175Lysine-specific demethylase 4CCHEMBL37869631.02016.0
542206CHEMBL6175Lysine-specific demethylase 4CCHEMBL37870201.02016.0
542207CHEMBL6175Lysine-specific demethylase 4CCHEMBL37870441.02016.0
542208CHEMBL6175Lysine-specific demethylase 4CCHEMBL37871331.02016.0
542209CHEMBL6175Lysine-specific demethylase 4CCHEMBL37871931.02016.0
542210CHEMBL6175Lysine-specific demethylase 4CCHEMBL37874381.02016.0
542211CHEMBL6175Lysine-specific demethylase 4CCHEMBL37875161.02016.0
542212CHEMBL6175Lysine-specific demethylase 4CCHEMBL37875341.02016.0
542213CHEMBL6175Lysine-specific demethylase 4CCHEMBL37875481.02016.0
542214CHEMBL6175Lysine-specific demethylase 4CCHEMBL37875561.02016.0
542215CHEMBL6175Lysine-specific demethylase 4CCHEMBL37876641.02016.0
542216CHEMBL6175Lysine-specific demethylase 4CCHEMBL37876691.02016.0
542217CHEMBL6177NAD kinaseCHEMBL233434-1.02008.0
542218CHEMBL6177NAD kinaseCHEMBL538665-1.02009.0
542219CHEMBL6177NAD kinaseCHEMBL5603151.02009.0
542220CHEMBL6177NAD kinaseCHEMBL561654-1.02009.0
542221CHEMBL6177NAD kinaseCHEMBL5620561.02009.0
542222CHEMBL6186Serine/threonine-protein kinase Sgk3CHEMBL2333365-1.02013.0
542223CHEMBL6186Serine/threonine-protein kinase Sgk3CHEMBL3092460-1.02015.0
542224CHEMBL6186Serine/threonine-protein kinase Sgk3CHEMBL30924681.02015.0
542225CHEMBL6186Serine/threonine-protein kinase Sgk3CHEMBL3745885-1.02016.0
542226CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL1190585-1.02007.0
542227CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL12410281.02007.0
542228CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL1241672-1.02007.0
542229CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL1241673-1.02007.0
542230CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL1241765-1.02007.0
542231CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL1241766-1.02007.0
542232CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL590-1.02007.0
\n", 979 | "

542233 rows × 5 columns

\n", 980 | "
" 981 | ], 982 | "text/plain": [ 983 | " TARGET_CHEMBLID PREF_NAME \\\n", 984 | "0 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 985 | "1 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 986 | "2 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 987 | "3 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 988 | "4 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 989 | "5 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 990 | "6 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 991 | "7 CHEMBL1075097 Arginase-1 \n", 992 | "8 CHEMBL1075101 G-protein coupled receptor 81 \n", 993 | "9 CHEMBL1075101 G-protein coupled receptor 81 \n", 994 | "10 CHEMBL1075101 G-protein coupled receptor 81 \n", 995 | "11 CHEMBL1075101 G-protein coupled receptor 81 \n", 996 | "12 CHEMBL1075101 G-protein coupled receptor 81 \n", 997 | "13 CHEMBL1075101 G-protein coupled receptor 81 \n", 998 | "14 CHEMBL1075101 G-protein coupled receptor 81 \n", 999 | "15 CHEMBL1075101 G-protein coupled receptor 81 \n", 1000 | "16 CHEMBL1075101 G-protein coupled receptor 81 \n", 1001 | "17 CHEMBL1075101 G-protein coupled receptor 81 \n", 1002 | "18 CHEMBL1075101 G-protein coupled receptor 81 \n", 1003 | "19 CHEMBL1075101 G-protein coupled receptor 81 \n", 1004 | "20 CHEMBL1075101 G-protein coupled receptor 81 \n", 1005 | "21 CHEMBL1075101 G-protein coupled receptor 81 \n", 1006 | "22 CHEMBL1075101 G-protein coupled receptor 81 \n", 1007 | "23 CHEMBL1075101 G-protein coupled receptor 81 \n", 1008 | "24 CHEMBL1075101 G-protein coupled receptor 81 \n", 1009 | "25 CHEMBL1075101 G-protein coupled receptor 81 \n", 1010 | "26 CHEMBL1075101 G-protein coupled receptor 81 \n", 1011 | "27 CHEMBL1075101 G-protein coupled receptor 81 \n", 1012 | "28 CHEMBL1075101 G-protein coupled receptor 81 \n", 1013 | "29 CHEMBL1075101 G-protein coupled receptor 81 \n", 1014 | "... ... ... 
\n", 1015 | "542203 CHEMBL6175 Lysine-specific demethylase 4C \n", 1016 | "542204 CHEMBL6175 Lysine-specific demethylase 4C \n", 1017 | "542205 CHEMBL6175 Lysine-specific demethylase 4C \n", 1018 | "542206 CHEMBL6175 Lysine-specific demethylase 4C \n", 1019 | "542207 CHEMBL6175 Lysine-specific demethylase 4C \n", 1020 | "542208 CHEMBL6175 Lysine-specific demethylase 4C \n", 1021 | "542209 CHEMBL6175 Lysine-specific demethylase 4C \n", 1022 | "542210 CHEMBL6175 Lysine-specific demethylase 4C \n", 1023 | "542211 CHEMBL6175 Lysine-specific demethylase 4C \n", 1024 | "542212 CHEMBL6175 Lysine-specific demethylase 4C \n", 1025 | "542213 CHEMBL6175 Lysine-specific demethylase 4C \n", 1026 | "542214 CHEMBL6175 Lysine-specific demethylase 4C \n", 1027 | "542215 CHEMBL6175 Lysine-specific demethylase 4C \n", 1028 | "542216 CHEMBL6175 Lysine-specific demethylase 4C \n", 1029 | "542217 CHEMBL6177 NAD kinase \n", 1030 | "542218 CHEMBL6177 NAD kinase \n", 1031 | "542219 CHEMBL6177 NAD kinase \n", 1032 | "542220 CHEMBL6177 NAD kinase \n", 1033 | "542221 CHEMBL6177 NAD kinase \n", 1034 | "542222 CHEMBL6186 Serine/threonine-protein kinase Sgk3 \n", 1035 | "542223 CHEMBL6186 Serine/threonine-protein kinase Sgk3 \n", 1036 | "542224 CHEMBL6186 Serine/threonine-protein kinase Sgk3 \n", 1037 | "542225 CHEMBL6186 Serine/threonine-protein kinase Sgk3 \n", 1038 | "542226 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1039 | "542227 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1040 | "542228 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1041 | "542229 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1042 | "542230 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1043 | "542231 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1044 | "542232 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1045 | "\n", 1046 | " CMPD_CHEMBLID CLF_LABEL YEAR \n", 1047 | "0 CHEMBL1092618 -1.0 2010.0 \n", 1048 | "1 CHEMBL1092619 -1.0 2010.0 \n", 1049 | "2 CHEMBL1093582 -1.0 2010.0 \n", 1050 | "3 CHEMBL1093848 -1.0 2010.0 \n", 1051 | "4 CHEMBL2398350 -1.0 2013.0 \n", 1052 | "5 CHEMBL2398352 -1.0 2013.0 \n", 1053 | "6 CHEMBL464651 1.0 2010.0 \n", 1054 | "7 CHEMBL1099169 1.0 2010.0 \n", 1055 | "8 CHEMBL3714817 1.0 NaN \n", 1056 | "9 CHEMBL3714879 1.0 NaN \n", 1057 | "10 CHEMBL3714885 1.0 NaN \n", 1058 | "11 CHEMBL3714909 1.0 NaN \n", 1059 | "12 CHEMBL3714960 1.0 NaN \n", 1060 | "13 CHEMBL3714970 1.0 NaN \n", 1061 | "14 CHEMBL3715004 1.0 NaN \n", 1062 | "15 CHEMBL3715017 1.0 NaN \n", 1063 | "16 CHEMBL3715077 1.0 NaN \n", 1064 | "17 CHEMBL3715155 1.0 NaN \n", 1065 | "18 CHEMBL3715174 1.0 NaN \n", 1066 | "19 CHEMBL3715218 1.0 NaN \n", 1067 | "20 CHEMBL3715359 1.0 NaN \n", 1068 | "21 CHEMBL3715361 1.0 NaN \n", 1069 | "22 CHEMBL3715375 1.0 NaN \n", 1070 | "23 CHEMBL3715394 1.0 NaN \n", 1071 | "24 CHEMBL3715478 1.0 NaN \n", 1072 | "25 CHEMBL3715536 1.0 NaN \n", 1073 | "26 CHEMBL3715558 1.0 NaN \n", 1074 | "27 CHEMBL3715572 1.0 NaN \n", 1075 | "28 CHEMBL3715577 1.0 NaN \n", 1076 | "29 CHEMBL3715599 1.0 NaN \n", 1077 | "... ... ... ... 
\n", 1078 | "542203 CHEMBL3786862 1.0 2016.0 \n", 1079 | "542204 CHEMBL3786952 1.0 2016.0 \n", 1080 | "542205 CHEMBL3786963 1.0 2016.0 \n", 1081 | "542206 CHEMBL3787020 1.0 2016.0 \n", 1082 | "542207 CHEMBL3787044 1.0 2016.0 \n", 1083 | "542208 CHEMBL3787133 1.0 2016.0 \n", 1084 | "542209 CHEMBL3787193 1.0 2016.0 \n", 1085 | "542210 CHEMBL3787438 1.0 2016.0 \n", 1086 | "542211 CHEMBL3787516 1.0 2016.0 \n", 1087 | "542212 CHEMBL3787534 1.0 2016.0 \n", 1088 | "542213 CHEMBL3787548 1.0 2016.0 \n", 1089 | "542214 CHEMBL3787556 1.0 2016.0 \n", 1090 | "542215 CHEMBL3787664 1.0 2016.0 \n", 1091 | "542216 CHEMBL3787669 1.0 2016.0 \n", 1092 | "542217 CHEMBL233434 -1.0 2008.0 \n", 1093 | "542218 CHEMBL538665 -1.0 2009.0 \n", 1094 | "542219 CHEMBL560315 1.0 2009.0 \n", 1095 | "542220 CHEMBL561654 -1.0 2009.0 \n", 1096 | "542221 CHEMBL562056 1.0 2009.0 \n", 1097 | "542222 CHEMBL2333365 -1.0 2013.0 \n", 1098 | "542223 CHEMBL3092460 -1.0 2015.0 \n", 1099 | "542224 CHEMBL3092468 1.0 2015.0 \n", 1100 | "542225 CHEMBL3745885 -1.0 2016.0 \n", 1101 | "542226 CHEMBL1190585 -1.0 2007.0 \n", 1102 | "542227 CHEMBL1241028 1.0 2007.0 \n", 1103 | "542228 CHEMBL1241672 -1.0 2007.0 \n", 1104 | "542229 CHEMBL1241673 -1.0 2007.0 \n", 1105 | "542230 CHEMBL1241765 -1.0 2007.0 \n", 1106 | "542231 CHEMBL1241766 -1.0 2007.0 \n", 1107 | "542232 CHEMBL590 -1.0 2007.0 \n", 1108 | "\n", 1109 | "[542233 rows x 5 columns]" 1110 | ] 1111 | }, 1112 | "execution_count": 131, 1113 | "metadata": {}, 1114 | "output_type": "execute_result" 1115 | } 1116 | ], 1117 | "source": [ 1118 | "# group\n", 1119 | "grouped = inhibitor.groupby(by=[\"TARGET_CHEMBLID\", \"PREF_NAME\", \"CMPD_CHEMBLID\"], as_index=False)\n", 1120 | "# judge one molecule's label by the average label\n", 1121 | "clf_label = grouped[[\"CLF_LABEL\", \"YEAR\"]].mean()\n", 1122 | "clf_label" 1123 | ] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "execution_count": 105, 1128 | "metadata": { 1129 | "collapsed": true 1130 | }, 1131 | "outputs": [], 1132 | "source": [ 1133 | "clf_label.to_csv(txt_dir + \"/inhibitor_clf_label.csv\")" 1134 | ] 1135 | }, 1136 | { 1137 | "cell_type": "code", 1138 | "execution_count": null, 1139 | "metadata": { 1140 | "collapsed": true 1141 | }, 1142 | "outputs": [], 1143 | "source": [] 1144 | }, 1145 | { 1146 | "cell_type": "code", 1147 | "execution_count": 144, 1148 | "metadata": { 1149 | "collapsed": true 1150 | }, 1151 | "outputs": [], 1152 | "source": [ 1153 | "cancer_approved_target = [\"CHEMBL279\", \"CHEMBL203\", \"CHEMBL333\", \"CHEMBL325\", \"CHEMBL267\", \"CHEMBL2842\"]\n", 1154 | "cancer_clinical_target = [\"CHEMBL340\", \"CHEMBL4005\", \"CHEMBL332\"]" 1155 | ] 1156 | }, 1157 | { 1158 | "cell_type": "code", 1159 | "execution_count": 158, 1160 | "metadata": { 1161 | "collapsed": false 1162 | }, 1163 | "outputs": [], 1164 | "source": [ 1165 | "for target in cancer_approved_target + cancer_clinical_target:\n", 1166 | " df = clf_label[clf_label[\"TARGET_CHEMBLID\"] == target]\n", 1167 | " df.to_csv(txt_dir + \"/%s_clf_label.csv\" % target, index=False)" 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "code", 1172 | "execution_count": null, 1173 | "metadata": { 1174 | "collapsed": true 1175 | }, 1176 | "outputs": [], 1177 | "source": [] 1178 | } 1179 | ], 1180 | "metadata": { 1181 | "kernelspec": { 1182 | "display_name": "Python 2", 1183 | "language": "python", 1184 | "name": "python2" 1185 | }, 1186 | "language_info": { 1187 | "codemirror_mode": { 1188 | "name": "ipython", 1189 | "version": 2 1190 | }, 1191 | "file_extension": ".py", 
1192 | "mimetype": "text/x-python", 1193 | "name": "python", 1194 | "nbconvert_exporter": "python", 1195 | "pygments_lexer": "ipython2", 1196 | "version": "2.7.6" 1197 | } 1198 | }, 1199 | "nbformat": 4, 1200 | "nbformat_minor": 1 1201 | } 1202 | --------------------------------------------------------------------------------