├── README.md ├── .gitignore ├── dnn_model ├── pk_queue.py ├── PickOutTop1000.py ├── single_vs_chemdiv.py ├── dnn_model.py ├── pk_eval.py ├── single_train.py ├── pk_virtual_screen.py ├── single_eval.py ├── pk_train.py ├── single_vs.py └── pk_input.py ├── data_files ├── chembl_cal_fp.py ├── chembl_cal_mask.py ├── chembl_input.py ├── 3_chembl_analyse_fp.ipynb ├── 3_fingerprint_analyse_additional.ipynb └── chembl_preparation.ipynb ├── rf_model ├── chembl_rf_vs.py └── chembl_rf.py └── LICENSE /README.md: -------------------------------------------------------------------------------- 1 | # Abstract 2 | Author: xiaotaw@qq.com (Any bug report is welcome) 3 | 4 | Time Created: Aug 2016 5 | 6 | Time Updated: Dec 2016 7 | 8 | Addr: Shenzhen, China 9 | 10 | Description: We explore ChEMBL inhibitors using deep neural networks 11 | 12 | Website: https://xiaotaw.github.io/chembl/ 13 | 14 | 15 | # Background 16 | (add background for using DNN and RF to build this qsar model) 17 | 18 | # Problem 19 | (add one sentence abstract for current challenge) 20 | 21 | # Solution 22 | (how we solve the problem) 23 | 24 | # Method 25 | 26 | ## 1 get data 27 | 1.1 the positive dataset was downloaded from the ChEMBL database 28 | 1.2 the negative dataset was selected from the PubChem and ChEMBL databases (based on the reasonable assumption that almost no compound in PubChem is a substrate of a given protein kinase) 29 | 30 | ## 2 build the model 31 | 2.1 deep neural network (based on TensorFlow) 32 | 2.2 random forest (based on scikit-learn) 33 | 2.3 a 'Tree' comprises one 'Term' and several 'Branches', where the 'Term' extracts the features shared by all the protein kinases (see dnn_model/dnn_model.py: term() builds the shared trunk, and branch() adds a per-target softmax head). 34 | 35 | ## 3 train and evaluation 36 | 3.1 we train the models separately and jointly, and then apply them to the PubChem dataset for virtual screening. 37 | 38 | 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | .venv/ 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | 92 | 93 | # custom ignores 94 | # 95 | 96 | !.gitignore 97 | 98 | # log files 99 | log_files/ 100 | *.log 101 | 102 | # pred files 103 | pred_files/ 104 | *.pred 105 | 106 | # data files 107 | structure_files/ 108 | txt_files/ 109 | csv_files/ 110 | mask_files/ 111 | 112 | # ckpt files 113 | ckpt*/ 114 | *ckpt/ 115 | tmp*/ 116 | 117 | # backup files 118 | *.bk 119 | 120 | # edit temp files 121 | *~ 122 | 123 | # rf model files 124 | *.m 125 | 126 | # excel xlsx files 127 | *.xlsx 128 | 129 | # png files 130 | *.png 131 | -------------------------------------------------------------------------------- /dnn_model/pk_queue.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Oct 2016 3 | # Time Last Updated: Oct 2016 4 | # Addr: Shenzhen, China 5 | # Description: use multiple threads to load input data and generate batches. 6 | 7 | import time 8 | 9 | import Queue 10 | import threading 11 | 12 | import pk_input as pki 13 | 14 | 15 | 16 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 17 | target = target_list[0] 18 | d = pki.Datasets(target_list) 19 | 20 | 21 | # using queue 22 | 23 | # producer thread 24 | class Producer(threading.Thread): 25 | def __init__(self, t_name, d, queue): 26 | threading.Thread.__init__(self, name=t_name) 27 | self.queue = queue 28 | self.d = d 29 | def run(self): 30 | for i in range(10): 31 | t0 = time.time() 32 | batch = self.d.next_train_batch(target, 100, 200) 33 | t1 = time.time() 34 | #print("%s: %s generate batch with neg_begin=%d %5.3f" % (time.ctime(), self.getName(), self.d.neg.train_begin, t1-t0)) 35 | self.queue.put(batch, block=True, timeout=None) 36 | time.sleep(0.5) 37 | #print("%s: %s finished!" % (time.ctime(), self.getName())) 38 | 39 | # consumer thread 40 | class Consumer(threading.Thread): 41 | def __init__(self, t_name, queue): 42 | threading.Thread.__init__(self, name=t_name) 43 | self.queue = queue 44 | def run(self): 45 | while True: 46 | try: 47 | t0 = time.time() 48 | batch = self.queue.get(block=True, timeout=5) 49 | time.sleep(0.5) 50 | t1 = time.time() 51 | #print("%s: %s generate batch %5.3f" % (time.ctime(), self.getName(), t1-t0)) 52 | except: 53 | #print("%s: %s finished!" 
% (time.ctime(), self.getName())) 54 | break 55 | 56 | 57 | if __name__ == "__main__": 58 | queue = Queue.Queue(50) 59 | pro_list = [] 60 | for i in range(10): 61 | pro_list.append(Producer("Pro%d" % i, d, queue)) 62 | 63 | con = Consumer("Con", queue) 64 | 65 | for pro in pro_list: 66 | pro.start() 67 | 68 | con.start() 69 | 70 | for pro in pro_list: 71 | pro.join() 72 | 73 | con.join() 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /dnn_model/PickOutTop1000.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[16]: 5 | 6 | import pandas as pd 7 | from rdkit import Chem 8 | from rdkit.Chem import Draw 9 | 10 | 11 | # In[17]: 12 | 13 | target_list = ["CHEMBL203", "CHEMBL204", "CHEMBL235", "CHEMBL236", 14 | "CHEMBL244", "CHEMBL260", "CHEMBL4805", "CHEMBL4822"] 15 | 16 | g_step_list = [2161371, 2236500, 2086841, 2236500, 17 | 2161951, 2252100, 2168041, 1936221] 18 | 19 | # In[18]: 20 | 21 | ChemDiv_dir = "/raid/xiaotaw/ChemDiv/" 22 | fn_list = ["DC01_350000.sdf", "DC02_350000.sdf", 23 | "DC03_222773.sdf", "DC_saltdata_not-available_124145.sdf", 24 | "IC_non-excl_82693.sdf", "NC_340320.sdf"] 25 | #sup0 = Chem.SDMolSupplier(ChemDiv_dir + fn_list[0]) 26 | #ms0 = [x for x in sup0 if x is not None] 27 | 28 | 29 | # In[19]: 30 | 31 | #sup1 = Chem.SDMolSupplier(ChemDiv_dir + fn_list[1]) 32 | #ms1 = [x for x in sup1 if x is not None] 33 | #sup2 = Chem.SDMolSupplier(ChemDiv_dir + fn_list[2]) 34 | #ms2 = [x for x in sup2 if x is not None] 35 | 36 | 37 | # In[20]: 38 | 39 | i = 7 40 | target = target_list[i] 41 | g_step = g_step_list[i] 42 | pred_dir = "/home/scw4750/Documents/chembl/dnn_model/pred_files/%s/" % target 43 | pred_fn = pred_dir + "vs_chemdiv_%s_128_0.800_4.000e-03_%d.pred1000" % (target, g_step) 44 | chemdiv_pred = pd.read_csv(pred_fn, sep="\t", index_col=0, names=["id", "pred"]) 45 | #chemdiv_pred 46 | id_list = chemdiv_pred["id"].values 47 | #id_list 48 | 49 | 50 | # In[23]: 51 | 52 | m1000 = [] 53 | for fn in fn_list: 54 | print("start %s" % fn) 55 | sup = Chem.SDMolSupplier(ChemDiv_dir + fn) 56 | for m in sup: 57 | if (m is not None) and (m.GetProp("IDNUMBER") in id_list): 58 | m1000.append(m) 59 | #print(m.GetProp("IDNUMBER")) 60 | print("finished %s" % fn) 61 | 62 | 63 | # In[38]: 64 | 65 | def get_pred_value(id_): 66 | return chemdiv_pred["pred"][chemdiv_pred["id"] == id_].values[0] 67 | 68 | m1000.sort(key=lambda x: get_pred_value(x.GetProp("IDNUMBER")), reverse=True) 69 | 70 | 71 | # In[40]: 72 | 73 | writer = Chem.SDWriter(pred_fn.replace(".pred1000", "_top1000.sdf")) 74 | for m in m1000: 75 | writer.write(m) 76 | 77 | 78 | 79 | # In[ ]: 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /data_files/chembl_cal_fp.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Nov 2016 3 | # Time Last Updated: Dec 2016 4 | # Addr: Shenzhen, China 5 | # Description: 1. calculate atom pair fingerprint(apfp) for chembl molecules 6 | # 2. 
analyse apfp 7 | 8 | import os 9 | import gzip 10 | import numpy as np 11 | from collections import defaultdict 12 | 13 | from rdkit import Chem 14 | from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint 15 | 16 | def dict_2_str(d): 17 | keylist = d.keys() 18 | keylist.sort() 19 | kv_list = ["{}: {}".format(k, d[k]) for k in keylist] 20 | return ", ".join(kv_list) 21 | 22 | """ 23 | ## calculate chembl apfp 24 | # 25 | sup = Chem.SmilesMolSupplier("structure_files/chembl.smiles", delimiter=",", smilesColumn=1, nameColumn=0, titleLine=True) 26 | 27 | if not os.path.exists("fp_files"): 28 | os.mkdir("fp_files") 29 | 30 | apfp_file = open("fp_files/chembl.apfp", "w") 31 | 32 | for m in sup: 33 | if m is None: 34 | continue 35 | id_ = m.GetProp("_Name") 36 | apfps = GetAtomPairFingerprint(Chem.RemoveHs(m)).GetNonzeroElements() 37 | apfp_file.write("%s\t{%s}\n" % (id_, dict_2_str(apfps))) 38 | 39 | apfp_file.close() 40 | 41 | 42 | ## calculate pns apfp 43 | # 44 | sup = Chem.SDMolSupplier("structure_files/pubchem_neg_sample.sdf") 45 | 46 | apfp_file = open("fp_files/pubchem_neg_sample.apfp", "w") 47 | 48 | for m in sup: 49 | if m is None: 50 | continue 51 | id_ = m.GetProp("PUBCHEM_COMPOUND_CID") 52 | apfps = GetAtomPairFingerprint(Chem.RemoveHs(m)).GetNonzeroElements() 53 | apfp_file.write("%s\t{%s}\n" % (id_, dict_2_str(apfps))) 54 | 55 | apfp_file.close() 56 | """ 57 | 58 | ## calculate ChemDiv apfp 59 | ChemDiv_dir = "/raid/xiaotaw/ChemDiv" 60 | fn_list = ["DC01_350000.sdf", "DC02_350000.sdf", "DC03_222773.sdf", "DC_saltdata_not-available_124145.sdf", "IC_non-excl_82693.sdf", "NC_340320.sdf"] 61 | 62 | for fn in fn_list: 63 | gzsup = Chem.SDMolSupplier(ChemDiv_dir + "/" + fn) 64 | molecules = [x for x in gzsup if x is not None] 65 | apfp_file = open(ChemDiv_dir + "/" + fn.replace("sdf", "apfp"), "w") 66 | for mol in molecules: 67 | id_ = mol.GetProp("IDNUMBER") 68 | apfps = GetAtomPairFingerprint(Chem.RemoveHs(mol)).GetNonzeroElements() 69 | apfp_file.write("%s\t{%s}\n" % (id_, dict_2_str(apfps))) 70 | apfp_file.close() 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /rf_model/chembl_rf_vs.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Dec 2016 3 | # Time Last Updated: Dec 2016 4 | # Addr: Shenzhen, China 5 | # Description: 6 | 7 | import os 8 | import sys 9 | import time 10 | import getpass 11 | import numpy as np 12 | from scipy import sparse 13 | from collections import defaultdict 14 | from matplotlib import pyplot as plt 15 | from sklearn.externals import joblib 16 | from sklearn.ensemble import RandomForestClassifier 17 | 18 | sys.path.append("/home/%s/Documents/chembl/data_files/" % getpass.getuser()) 19 | import chembl_input as ci 20 | 21 | # the newly picked-out 15 targets (listed below): 9 targets from 5 big groups, and 6 targets from others. 
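# (added sketch, not part of the original repo) each line of the .apfp files
# written by data_files/chembl_cal_fp.py has the form
#   "<mol_id>\t{<apfp_int>: <count>, ...}"
# and chembl_input presumably parses it back with something equivalent to:
import ast

def parse_apfp_line(line):
  # the tab-separated second field is a valid Python dict literal
  mol_id, fp_str = line.strip().split("\t")
  return mol_id, ast.literal_eval(fp_str)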
22 | target_list = ["CHEMBL279", "CHEMBL203", # Protein Kinases 23 | "CHEMBL217", "CHEMBL253", # GPCRs (Family A) 24 | "CHEMBL235", "CHEMBL206", # Nuclear Hormone Receptors 25 | "CHEMBL240", "CHEMBL4296", # Voltage Gated Ion Channels 26 | "CHEMBL4805", # Ligand Gated Ion Channels 27 | "CHEMBL204", "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 28 | ] 29 | 30 | 31 | target_list = ["CHEMBL206", "CHEMBL217", "CHEMBL235", "CHEMBL240", 32 | "CHEMBL253", "CHEMBL4296", 33 | ] 34 | 35 | def virtual_screening(target): 36 | # input dataset 37 | d = ci.DatasetVS(target) 38 | # read saved rf clf model 39 | clf = joblib.load("model_files/rf_%s.m" % target) 40 | # pred file 41 | pred_dir = "pred_files/%s" % target 42 | if not os.path.exists(pred_dir): 43 | os.mkdir(pred_dir) 44 | for part_num in range(13): 45 | t0 = time.time() 46 | pred_path = os.path.join(pred_dir, "vs_pubchem_%d.pred" % part_num) 47 | predfile = open(pred_path, "w") 48 | fp_dir = "/raid/xiaotaw/pubchem/fp_files/%d" % part_num 49 | for i in range(part_num * 10000000 + 1, (part_num + 1) * 10000000, 25000): 50 | fp_fn = os.path.join(fp_dir, "Compound_{:0>9}_{:0>9}.apfp".format(i, i + 24999)) 51 | if os.path.exists(fp_fn): 52 | d.reset(fp_fn) 53 | features = d.features_dense 54 | pred = clf.predict_proba(features) 55 | for id_, pred_v in zip(d.pubchem_id, pred[:, 1]): 56 | predfile.write("%s\t%f\n" % (id_, pred_v)) 57 | #print("%s\t%d\n" % (fp_fn, pred.shape[0])) 58 | t1 = time.time() 59 | print("%s %d: %.3f" % (target, part_num, t1-t0)) 60 | 61 | 62 | def analyse(target): 63 | vs_pred_file = "pred_files/%s/vs_pubchem.pred" % (target) 64 | if not os.path.exists(vs_pred_file): 65 | os.system("cat pred_files/%s/vs_pubchem_*.pred > pred_files/%s/vs_pubchem.pred" % (target, target)) 66 | aa = np.genfromtxt(vs_pred_file, delimiter="\t") 67 | a = aa[:, 1] 68 | test_pred_file = "pred_files/test_%s.pred" % (target) 69 | bb = np.genfromtxt(test_pred_file, delimiter="\t", usecols=[1,2]) 70 | b = bb[:, 0][bb[:, 1].astype(bool)] 71 | x = [] 72 | y = [] 73 | for i in range(10): 74 | mark = (i + 1) / 20.0 75 | xi = 1.0 * (b > mark).sum() / b.shape[0] 76 | yi = (a > mark).sum() 77 | x.append(xi) 78 | y.append(yi) 79 | plt.plot(x, y, "*") 80 | plt.xlabel("pos yield rate") 81 | plt.ylabel("vs pubchem false pos") 82 | plt.savefig("pred_files/%s/analyse.png" % (target)) 83 | 84 | 85 | target = target_list[int(sys.argv[1])] 86 | virtual_screening(target) 87 | analyse(target) 88 | 89 | 90 | """ 91 | for target in target_list: 92 | virtual_screening(target) 93 | #analyse(target) 94 | """ 95 | -------------------------------------------------------------------------------- /dnn_model/single_vs_chemdiv.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import time 10 | import datetime 11 | import numpy as np 12 | import pandas as pd 13 | import tensorflow as tf 14 | from matplotlib import pyplot as plt 15 | 16 | 17 | import dnn_model 18 | sys.path.append("/home/scw4750/Documents/chembl/data_files/") 19 | import chembl_input as ci 20 | 21 | vs_batch_size = 1024 22 | 23 | def virtual_screening_chemdiv(target, g_step, gpu_num=0): 24 | t_0 = time.time() 25 | 26 | # dataset 27 | d = ci.DatasetChemDiv(target) 28 | # batch size 29 | batch_size = 128 30 | # input vec_len 31 | input_vec_len = d.num_features 32 | # keep prob 33 | keep_prob = 0.8 34 
| # weight decay 35 | wd = 0.004 36 | # g_step 37 | #g_step = 2236500 38 | 39 | # virtual screen pred file 40 | pred_dir = "pred_files/%s" % target 41 | if not os.path.exists(pred_dir): 42 | os.makedirs(pred_dir) 43 | pred_path = os.path.join(pred_dir, "vs_chemdiv_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step)) 44 | predfile = open(pred_path, 'w') 45 | print("virtual screen ChemDiv starts at: %s\n" % datetime.datetime.now()) 46 | 47 | # checkpoint file 48 | ckpt_dir = "ckpt_files/%s" % target 49 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd)) 50 | 51 | # screening 52 | with tf.Graph().as_default(), tf.device("/gpu: %d" % gpu_num): 53 | # the input 54 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 55 | # the term 56 | base = dnn_model.term(input_placeholder, in_units=input_vec_len, wd=wd, keep_prob=1.0) 57 | # the branches 58 | softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0) 59 | # create a saver. 60 | saver = tf.train.Saver(tf.trainable_variables()) 61 | # Start screen 62 | config=tf.ConfigProto(allow_soft_placement=True) 63 | config.gpu_options.per_process_gpu_memory_fraction = 0.35 64 | 65 | 66 | with tf.Session(config=config) as sess: 67 | # Restores variables from checkpoint 68 | saver.restore(sess, ckpt_path + "-%d" % g_step) 69 | 70 | for ids, features in d.batch_generator_chemdiv(vs_batch_size): 71 | sm = sess.run(softmax, feed_dict = {input_placeholder: features}) 72 | for id_, sm_v in zip(ids, sm[:, 1]): 73 | predfile.write("%s\t%f\n" % (id_, sm_v)) 74 | """ 75 | try: 76 | while True: 77 | ids, features = d.generate_batch(vs_batch_size) 78 | sm = sess.run(softmax, feed_dict = {input_placeholder: features.toarray()}) 79 | for id_, sm_v in zip(ids, sm[:, 1]): 80 | predfile.write("%s\t%f\n" % (id_, sm_v)) 81 | except StopIteration: 82 | pass 83 | """ 84 | predfile.close() 85 | print("duration: %.3f" % (time.time() - t_0)) 86 | 87 | 88 | def analyse_sort_chemdiv(target, g_step): 89 | pred_file = "pred_files/%s/vs_chemdiv_%s_128_0.800_4.000e-03_%d.pred" % (target, target, g_step) 90 | pred = pd.read_csv(pred_file, sep="\t", names=("id", "pred")) 91 | pred.sort_values(by="pred", ascending=False, inplace=True) 92 | pred1000 = pred.iloc[:1000] 93 | pred1000.to_csv(pred_file.replace(".pred", ".pred1000"), header=False, sep="\t") 94 | 95 | 96 | if __name__ == "__main__": 97 | target_list = ["CHEMBL203", "CHEMBL204", "CHEMBL205", 98 | "CHEMBL206", "CHEMBL217", "CHEMBL235", "CHEMBL240", 99 | "CHEMBL244", "CHEMBL253", "CHEMBL279", "CHEMBL340", 100 | "CHEMBL4005", "CHEMBL4296", "CHEMBL4805", "CHEMBL4822", 101 | ] 102 | 103 | g_list = [2161371, 2236500, 2235600, 104 | 2091321, 2161661, 2086841, 2020411, 105 | 2161951, 2012041, 2161661, 2246400, 106 | 2235900, 2238000, 2168041, 1936221, 107 | ] 108 | 109 | #i = int(sys.argv[1]) 110 | #target = target_list[i] 111 | #g_step = g_list[i] 112 | virtual_screening_chemdiv(target="CHEMBL4005", g_step=2235900, gpu_num=1) 113 | analyse_sort_chemdiv("CHEMBL4005", g_step=2235900) 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /rf_model/chembl_rf.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Nov 2016 3 | # Time Last Updated: Dec 2016 4 | # Addr: Shenzhen, China 5 | # Description: 6 | 7 | import os 8 | import sys 9 | import math 10 | import time 11 | import 
getpass 12 | import numpy as np 13 | import pandas as pd 14 | from scipy import sparse 15 | from collections import defaultdict 16 | from matplotlib import pyplot as plt 17 | from sklearn.externals import joblib 18 | from sklearn.metrics import roc_curve, auc 19 | from sklearn.ensemble import RandomForestClassifier 20 | 21 | sys.path.append("/home/%s/Documents/chembl/data_files/" % getpass.getuser()) 22 | import chembl_input as ci 23 | 24 | 25 | 26 | 27 | 28 | # the newly picked out 15 targets, include 9 targets from 5 big group, and 6 targets from others. 29 | target_list = ["CHEMBL279", "CHEMBL203", # Protein Kinases 30 | "CHEMBL217", "CHEMBL253", # GPCRs (Family A) 31 | "CHEMBL235", "CHEMBL206", # Nuclear Hormone Receptors 32 | "CHEMBL240", "CHEMBL4296", # Voltage Gated Ion Channels 33 | "CHEMBL4805", # Ligand Gated Ion Channels 34 | "CHEMBL204", "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 35 | ] 36 | 37 | # the target 38 | target = "CHEMBL203" 39 | 40 | 41 | # 42 | model_dir = "model_files" 43 | if not os.path.exists(model_dir): 44 | os.mkdir(model_dir) 45 | 46 | # 47 | pred_dir = "pred_files" 48 | if not os.path.exists(pred_dir): 49 | os.mkdir(pred_dir) 50 | 51 | 52 | def train_pred(target, train_pos_multiply=0): 53 | # 54 | d = ci.Dataset(target, train_pos_multiply=train_pos_multiply) 55 | # random forest clf 56 | clf = RandomForestClassifier(n_estimators=100, max_features=1.0/3, n_jobs=10, max_depth=None, min_samples_split=5, random_state=0) 57 | # fit model 58 | clf.fit(d.train_features, d.train_labels) 59 | # save model 60 | joblib.dump(clf, model_dir + "/rf_%s.m" % target) 61 | # predict class probabilities 62 | #train_pred_proba = clf.predict_proba(d.train_features)[:, 1] 63 | test_pred_proba = clf.predict_proba(d.test_features)[:, 1] 64 | # save pred 65 | test_pred_file = open(pred_dir + "/test_%s.pred" % target, "w") 66 | for id_, pred_v, l_v in zip(d.target_ids_test, test_pred_proba, d.test_labels): 67 | test_pred_file.write("%s\t%f\t%f\n" % (id_, pred_v, l_v)) 68 | test_pred_file.close() 69 | # draw roc fig 70 | fpr, tpr, _ = roc_curve(d.test_labels, test_pred_proba) 71 | roc_auc = auc(fpr, tpr) 72 | plt.figure() 73 | plt.plot(fpr, tpr, color="r", lw=2, label="ROC curve (area = %.2f)" % roc_auc) 74 | plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--") 75 | plt.xlim([0.0, 1.0]) 76 | plt.ylim([0.0, 1.05]) 77 | plt.xlabel("False Positive Rate") 78 | plt.ylabel("True Positive Rate") 79 | plt.title("Receiver operating characteristic of RF model on %s" % target) 80 | plt.legend(loc="lower right") 81 | plt.savefig("%s.png" % target) 82 | #plt.show() 83 | 84 | 85 | 86 | 87 | 88 | target_list = ["CHEMBL206", "CHEMBL217", "CHEMBL235", "CHEMBL240", 89 | "CHEMBL253", "CHEMBL4296", 90 | ] 91 | 92 | 93 | for target in target_list: 94 | t0 = time.time() 95 | train_pred(target, train_pos_multiply=0) 96 | t1 = time.time() 97 | print("%s duration: %.3f" % (target, t1-t0)) 98 | 99 | 100 | """ 101 | 102 | t0 = time.time() 103 | train_pred("CHEMBL4805", train_pos_multiply=0) 104 | t1 = time.time() 105 | print("%s duration: %.3f" % (target, t1-t0)) 106 | 107 | """ 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | """ 120 | pns_pred = clf.predict(d.target_pns_features) 121 | cns_pred = clf.predict(d.target_cns_features_train) 122 | train_pred = clf.predict(d.train_features) 123 | test_pred = clf.predict(d.test_features) 124 | 125 | pns_result = ci.compute_performance(d.target_pns_mask.values.astype(int), pns_pred) 126 | cns_result = 
ci.compute_performance(d.target_cns_mask_train.values.astype(int), cns_pred) 127 | train_result = ci.compute_performance(d.train_labels, train_pred) 128 | test_result = ci.compute_performance(d.test_labels, test_pred) 129 | 130 | print(train_result) 131 | 132 | print(test_result) 133 | """ 134 | 135 | # load model 136 | #clf = joblib.load(model_dir + "/rf_%s.m" % target) 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /dnn_model/dnn_model.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Aug 2016 3 | # Time Last Updated: Nov 2016 4 | # Addr: Shenzhen, China 5 | # Description: dnn model for pk 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | 15 | def fcnn_layer(input_tensor, input_dim, output_dim, layer_name, 16 | wd=False, wd_collection=False, 17 | keep_prob=0.8, variable_collection=False): 18 | with tf.name_scope(layer_name): 19 | weights = tf.Variable(tf.truncated_normal([input_dim, output_dim], stddev=1.0 / np.sqrt(float(input_dim))), name="weights") 20 | if wd is not None: 21 | weight_decay = tf.mul(tf.nn.l2_loss(weights), wd, name="weight_loss") 22 | tf.add_to_collection(wd_collection, weight_decay) 23 | biases = tf.Variable(tf.zeros([output_dim]), name="biases") 24 | if variable_collection: 25 | tf.add_to_collection(variable_collection, weights) 26 | tf.add_to_collection(variable_collection, biases) 27 | relu = tf.nn.relu(tf.matmul(input_tensor, weights) + biases, name="relu") 28 | if keep_prob: 29 | dropout = tf.nn.dropout(relu, keep_prob, name="dropout") 30 | return dropout 31 | else: 32 | return relu 33 | 34 | def term_reg(in_layer, in_units=4852, th1_units=4096, th2_units=3072, th3_units=2048, 35 | wd=0.004, keep_prob=0.8): 36 | th1 = fcnn_layer(in_layer, in_units, th1_units, "term_layer1", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 37 | th2 = fcnn_layer(th1, th1_units, th2_units, "term_layer2", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 38 | th3 = fcnn_layer(th2, th2_units, th3_units, "term_layer3", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 39 | return th3 40 | 41 | def branch_reg(branch_name, base_layer, wd=0.004, keep_prob=0.8, 42 | base_units=2048, bh1_units=2048, bh2_units=1024, out_units = 1): 43 | var_collection="branch_"+branch_name 44 | with tf.name_scope(branch_name): 45 | bh1 = fcnn_layer(base_layer, base_units, bh1_units, "branch_layer1", wd=wd, wd_collection=branch_name+"_wd_loss", keep_prob=keep_prob, variable_collection=var_collection) 46 | bh2 = fcnn_layer(bh1, bh1_units, bh2_units, "branch_layer2", wd=wd, wd_collection=branch_name+"_wd_loss", keep_prob=keep_prob, variable_collection=var_collection) 47 | with tf.name_scope("out_relu"): 48 | weights = tf.Variable(tf.truncated_normal([bh2_units, out_units], stddev=1.0 / np.sqrt(float(bh2_units))), name="weights") 49 | biases = tf.Variable(tf.zeros([out_units]), name="biases") 50 | tf.add_to_collection(var_collection, weights) 51 | tf.add_to_collection(var_collection, biases) 52 | out_relu = tf.nn.relu(tf.matmul(bh2, weights) + biases, name="out_relu") 53 | return out_relu 54 | 55 | 56 | def term(in_layer, in_units = 9561, th1_units = 8192, th2_units = 6144, th3_units = 4096, 
57 | wd=0.004, keep_prob=0.8): 58 | th1 = fcnn_layer(in_layer, in_units, th1_units, "term_layer1", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 59 | th2 = fcnn_layer(th1, th1_units, th2_units, "term_layer2", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 60 | th3 = fcnn_layer(th2, th2_units, th3_units, "term_layer3", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 61 | #th4 = fcnn_layer(th3, th3_units, th4_units, "term_layer4", wd=wd, wd_collection="term_wd_loss", keep_prob=keep_prob, variable_collection="term") 62 | return th3 63 | 64 | def branch(branch_name, base_layer, wd=0.004, keep_prob=0.8, 65 | base_units = 4096, bh1_units = 4096, bh2_units = 2048, bh3_units = 1024, out_units = 2): 66 | var_collection="branch_"+branch_name 67 | with tf.name_scope(branch_name): 68 | bh1 = fcnn_layer(base_layer, base_units, bh1_units, "branch_layer1", wd=wd, wd_collection=branch_name+"_wd_loss", keep_prob=keep_prob, variable_collection=var_collection) 69 | bh2 = fcnn_layer(bh1, bh1_units, bh2_units, "branch_layer2", wd=wd, wd_collection=branch_name+"_wd_loss", keep_prob=keep_prob, variable_collection=var_collection) 70 | bh3 = fcnn_layer(bh2, bh2_units, bh3_units, "branch_layer3", wd=wd, wd_collection=branch_name+"_wd_loss", keep_prob=keep_prob, variable_collection=var_collection) 71 | with tf.name_scope("softmax_linear"): 72 | weights = tf.Variable(tf.truncated_normal([bh3_units, out_units], stddev=1.0 / np.sqrt(float(bh3_units))), name="weights") 73 | biases = tf.Variable(tf.zeros([out_units]), name="biases") 74 | tf.add_to_collection(var_collection, weights) 75 | tf.add_to_collection(var_collection, biases) 76 | softmax = tf.nn.softmax(tf.matmul(bh3, weights) + biases, name="softmax") 77 | return softmax 78 | 79 | def x_entropy(softmax, labels, loss_name, neg_weight=1): 80 | with tf.name_scope(loss_name): 81 | weight = np.array([neg_weight, 1]).astype(np.float32) 82 | cross_entropy = -tf.reduce_sum(tf.reduce_mean(labels * tf.log(softmax) * weight, reduction_indices=[0]), name="x_entropy") 83 | return cross_entropy 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /dnn_model/pk_eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Author: xiaotaw@qq.com (Any bug report is welcome) 3 | # Time: Aug 2016 4 | # Addr: Shenzhen 5 | # Description: evaluate pk model 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import time 13 | import numpy 14 | import datetime 15 | import tensorflow as tf 16 | import dnn_model 17 | import pk_input 18 | 19 | def evaluate(target_list): 20 | """ evaluate the model 21 | """ 22 | # virtual screen log file 23 | log_dir = "log_files" 24 | logpath = os.path.join(log_dir, "pk_eval.log") 25 | logfile = open(logpath, "w") 26 | logfile.write("pk_eval starts at: %s\n" % datetime.datetime.now()) 27 | 28 | # get input dataset 29 | train_dataset_dict = dict() 30 | test_dataset_dict = dict() 31 | for target in target_list: 32 | train_dataset_dict[target] = pk_input.get_inputs_by_cpickle("data_files/pkl_files/" + target + "_train.pkl") 33 | test_dataset_dict[target] = pk_input.get_inputs_by_cpickle("data_files/pkl_files/" + target + "_test.pkl") 34 | 35 | neg_dataset = 
pk_input.get_inputs_by_cpickle("data_files/pkl_files/pubchem_neg_sample.pkl") 36 | 37 | 38 | 39 | with tf.Graph().as_default(), tf.device("/gpu:0"): 40 | 41 | # build the model 42 | input_placeholder = tf.placeholder(tf.float32, shape = (None, 8192)) 43 | label_placeholder = tf.placeholder(tf.float32, shape = (None, 2)) 44 | # build the "Tree" with a mutual "Term" and several "Branches" 45 | base = dnn_model.term(input_placeholder, keep_prob=1.0) 46 | softmax_dict = dict() 47 | wd_loss_dict = dict() 48 | x_entropy_dict = dict() 49 | loss_dict = dict() 50 | accuracy_dict = dict() 51 | for target in target_list: 52 | # compute softmax 53 | softmax_dict[target] = dnn_model.branch(target, base, keep_prob=1.0) 54 | # compute loss. 55 | wd_loss_dict[target] = tf.add_n(tf.get_collection("term_wd_loss") + tf.get_collection(target+"_wd_loss")) 56 | x_entropy_dict[target] = dnn_model.x_entropy(softmax_dict[target], label_placeholder, target) 57 | loss_dict[target] = tf.add(wd_loss_dict[target], x_entropy_dict[target]) 58 | # compute accuracy 59 | accuracy_dict[target] = dnn_model.accuracy(softmax_dict[target], label_placeholder, target) 60 | 61 | # create a saver. 62 | saver = tf.train.Saver(tf.trainable_variables()) 63 | 64 | # create session. 65 | config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 66 | config.gpu_options.per_process_gpu_memory_fraction = 0.5 67 | sess = tf.Session(config=config) 68 | 69 | # Restores variables from checkpoint 70 | saver.restore(sess, "ckpt_files/model.ckpt-40000") 71 | 72 | 73 | 74 | # eval train dataset 75 | for target in target_list: 76 | t0 = float(time.time()) 77 | compds = numpy.vstack([train_dataset_dict[target].compds, neg_dataset.compds]) 78 | labels = numpy.vstack([train_dataset_dict[target].labels, neg_dataset.labels]) 79 | t1 = float(time.time()) 80 | LV, XLV, ACC, prediction, label_dense = sess.run( 81 | [wd_loss_dict[target], 82 | x_entropy_dict[target], 83 | accuracy_dict[target], 84 | tf.argmax(softmax_dict[target], 1), 85 | tf.argmax(labels, 1)], 86 | feed_dict = { 87 | input_placeholder: compds, 88 | label_placeholder: labels, 89 | } 90 | ) 91 | t2 = time.time() 92 | TP, TN, FP, FN, SEN, SPE, MCC = dnn_model.compute_performance(label_dense, prediction) 93 | format_str = "%6d %6d %6.3f %6.3f %10.3f %5d %5d %5d %5d %6.3f %6.3f %6.3f %6.3f %5.3f %5.3f %s" 94 | logfile.write(format_str % (5000, 40000, LV, XLV, 0, TP, FN, TN, FP, SEN, SPE, ACC, MCC, t1-t0, t2-t1, target)) 95 | logfile.write('\n') 96 | print(format_str % (5000, 40000, LV, XLV, 0, TP, FN, TN, FP, SEN, SPE, ACC, MCC, t1-t0, t2-t1, target)) 97 | 98 | # eval test dataset 99 | for target in target_list: 100 | t0 = float(time.time()) 101 | compds = test_dataset_dict[target].compds 102 | labels = test_dataset_dict[target].labels 103 | t1 = float(time.time()) 104 | LV, XLV, ACC, prediction, label_dense = sess.run( 105 | [wd_loss_dict[target], 106 | x_entropy_dict[target], 107 | accuracy_dict[target], 108 | tf.argmax(softmax_dict[target], 1), 109 | tf.argmax(labels, 1)], 110 | feed_dict = { 111 | input_placeholder: compds, 112 | label_placeholder: labels, 113 | } 114 | ) 115 | t2 = time.time() 116 | TP, TN, FP, FN, SEN, SPE, MCC = dnn_model.compute_performance(label_dense, prediction) 117 | format_str = "%6d %6d %6.3f %6.3f %10.3f %5d %5d %5d %5d %6.3f %6.3f %6.3f %6.3f %5.3f %5.3f %s" 118 | logfile.write(format_str % (5000, 40000, LV, XLV, 0, TP, FN, TN, FP, SEN, SPE, ACC, MCC, t1-t0, t2-t1, target)) 119 | logfile.write('\n') 120 | print(format_str % (5000, 40000, LV, 
XLV, 0, TP, FN, TN, FP, SEN, SPE, ACC, MCC, t1-t0, t2-t1, target)) 121 | 122 | logfile.close() 123 | 124 | 125 | if __name__ == "__main__": 126 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", 127 | "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 128 | 129 | evaluate(target_list) 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /dnn_model/single_train.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Aug 2016 3 | # Time Last Updated: Dec 2016 4 | # Addr: Shenzhen, China 5 | # Description: train chembl model for a single target 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import sys 13 | import time 14 | import getpass 15 | import datetime 16 | import tensorflow as tf 17 | 18 | sys.path.append("/home/%s/Documents/chembl/data_files/" % getpass.getuser()) 19 | import dnn_model 20 | import chembl_input as ci 21 | 22 | 23 | def train(target, gpu_num=0, tpm=0, 24 | train_from=0, keep_prob=0.8, wd=0.004, batch_size=128): 25 | """""" 26 | # dataset 27 | d = ci.Dataset(target, train_pos_multiply=tpm) 28 | d.test_features_dense = d.test_features.toarray() 29 | # learning rate 30 | step_per_epoch = int(d.train_size / batch_size) # approximately equal to 7456 31 | start_learning_rate = 0.05 32 | decay_step = step_per_epoch * 10 33 | decay_rate = 0.9 34 | # max train steps 35 | max_step = 300 * step_per_epoch 36 | # input vec_len 37 | input_vec_len = d.num_features 38 | # checkpoint file 39 | ckpt_dir = "ckpt_files/%s" % target 40 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd)) 41 | if not os.path.exists(ckpt_dir): 42 | os.makedirs(ckpt_dir) 43 | # train log file 44 | log_dir = "log_files" 45 | if not os.path.exists(log_dir): 46 | os.mkdir(log_dir) 47 | log_path = os.path.join(log_dir, "train_%s_%d_%4.3f_%4.3e.log" % (target, batch_size, keep_prob, wd)) 48 | logfile = open(log_path, 'w') 49 | logfile.write("train starts at: %s\n" % datetime.datetime.now()) 50 | 51 | 52 | # build dnn model and train 53 | with tf.Graph().as_default(), tf.device('/gpu: %d' % gpu_num): 54 | # placeholders 55 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 56 | label_placeholder = tf.placeholder(tf.float32, shape = (None, 2)) 57 | # global step and learning rate 58 | global_step = tf.Variable(train_from, trainable=False) 59 | learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, decay_step, decay_rate) 60 | # build a Graph that computes the softmax predictions from the 61 | # inference model. 62 | base = dnn_model.term(input_placeholder, in_units=input_vec_len, wd=wd, keep_prob=keep_prob) 63 | # compute softmax 64 | softmax = dnn_model.branch(target, base, wd=wd, keep_prob=keep_prob) 65 | # compute loss. 66 | wd_loss = tf.add_n(tf.get_collection("term_wd_loss") + tf.get_collection(target+"_wd_loss")) 67 | x_entropy = dnn_model.x_entropy(softmax, label_placeholder, target, neg_weight=1) 68 | loss = tf.add(wd_loss, x_entropy) 69 | # train op 70 | train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step) 71 | # create a saver. 72 | saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=None) 73 | # start running operations on the Graph. 
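# (added note) with batch_size=128 and step_per_epoch ~= 7456, the schedule
# defined earlier in train() decays continuously (staircase defaults to False):
#   lr(g_step) = 0.05 * 0.9 ** (g_step / 74560)
# i.e. roughly a 10% drop every 10 epochs.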
74 | config=tf.ConfigProto(allow_soft_placement=True) 75 | config.gpu_options.per_process_gpu_memory_fraction = 0.3 76 | sess = tf.Session(config=config) 77 | # initialize all variables at first. 78 | sess.run(tf.initialize_all_variables()) 79 | if train_from != 0: 80 | saver.restore(sess, ckpt_path + "-%d" % train_from) 81 | # print title to screen and log file 82 | title_str = " step g_step wdloss xloss learn_rate TP FN TN FP SEN SPE ACC MCC t1-t0 t2-t1 t3-t2 target" 83 | print(title_str) 84 | logfile.write(title_str + "\n") 85 | 86 | # format str 87 | format_str = "%6d %6d %6.4f %7.5f %10.8f %5d %5d %5d %5d %6.4f %6.4f %6.4f %6.4f %5.3f %5.3f %5.3f %10s " 88 | 89 | # train the model 90 | for step in xrange(max_step): 91 | t0 = time.time() 92 | 93 | # get a batch sample 94 | perm = d.generate_perm_for_train_batch(batch_size) 95 | compds_batch = d.train_features[perm].toarray() 96 | labels_batch_one_hot = d.train_labels_one_hot[perm] 97 | t1 = time.time() 98 | # train once 99 | _ = sess.run([train_op],feed_dict = {input_placeholder: compds_batch, label_placeholder: labels_batch_one_hot}) 100 | t2 = time.time() 101 | 102 | # compute performance for the train batch 103 | if step % step_per_epoch == 0 or (step + 1) == max_step: 104 | g_step, wd_ls, x_ls, lr, pred = sess.run([global_step, wd_loss, x_entropy, learning_rate, tf.argmax(softmax, 1)], 105 | feed_dict = {input_placeholder: compds_batch, label_placeholder: labels_batch_one_hot}) 106 | tp, tn, fp, fn, sen, spe, acc, mcc = ci.compute_performance(d.train_labels[perm], pred) 107 | t3 = float(time.time()) 108 | logfile.write(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target) + "\n") 109 | print(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 110 | 111 | # save the model checkpoint periodically. 112 | if step % (10 * step_per_epoch) == 0 or (step + 1) == max_step: 113 | saver.save(sess, ckpt_path, global_step=global_step, write_meta_graph=False) 114 | 115 | # compute performance for the test data 116 | if step % (10 * step_per_epoch) == 0 or (step + 1) == max_step: 117 | x_ls, pred = sess.run([x_entropy, tf.argmax(softmax, 1)], 118 | feed_dict = {input_placeholder: d.test_features_dense, label_placeholder: d.test_labels_one_hot}) 119 | tp, tn, fp, fn, sen, spe, acc, mcc = ci.compute_performance(d.test_labels, pred) 120 | logfile.write(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, 0, 0, 0, target) + "\n") 121 | print(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, 0, 0, 0, target)) 122 | 123 | logfile.write("train ends at: %s\n" % datetime.datetime.now()) 124 | logfile.close() 125 | 126 | if __name__ == "__main__": 127 | 128 | # the newly picked out 15 targets, include 9 targets from 5 big group, and 6 targets from others. 
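  # (added note, a hedged usage sketch mirroring the commented-out pattern in
  # single_eval.py and single_vs_chemdiv.py; not part of the original script)
  # the target index could be taken from the command line and jobs spread
  # over the four GPUs, e.g.:
  #   i = int(sys.argv[1])
  #   train(target=target_list[i], gpu_num=i % 4, tpm=0)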
129 | target_list = ["CHEMBL279", 130 | "CHEMBL4805", # Ligand Gated Ion Channels 131 | "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 132 | ] 133 | 134 | 135 | #for target in target_list: 136 | train(target="CHEMBL4722", gpu_num=0, tpm=1) 137 | 138 | -------------------------------------------------------------------------------- /dnn_model/pk_virtual_screen.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Aug 2016 3 | # Time Last Updated: Oct 2016 4 | # Addr: Shenzhen, China 5 | # Description: apply pk model to pubchem dataset, to screen potential active substrate(drugs) 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import sys 13 | import glob 14 | import time 15 | import numpy 16 | import cPickle 17 | import datetime 18 | import tensorflow as tf 19 | from scipy import sparse 20 | 21 | import dnn_model 22 | 23 | 24 | def virtual_screening(target_list, part_num): 25 | 26 | # virtual screen log file 27 | log_dir = "log_files" 28 | logpath = os.path.join(log_dir, "virtual_screen_pubchem_%d.log" % part_num) 29 | logfile = open(logpath, "w") 30 | logfile.write("virtual screen %d starts at: %s\n" % (part_num, datetime.datetime.now())) 31 | 32 | # input and output dir 33 | pkl_dir = "/raid/xiaotaw/pubchem/pkl_files" 34 | prediction_dir = "/raid/xiaotaw/pubchem/prediction_files" 35 | if not os.path.exists(prediction_dir): 36 | os.mkdir(prediction_dir) 37 | 38 | # screening 39 | with tf.Graph().as_default(), tf.device("/gpu:%d" % (part_num // 3)): 40 | # the input 41 | input_placeholder = tf.placeholder(tf.float32, shape = (None, 8192)) 42 | 43 | # the term 44 | base = dnn_model.term(input_placeholder, keep_prob=1.0) 45 | 46 | # the branches 47 | softmax_dict = dict() 48 | for target in target_list: 49 | softmax_dict[target] = dnn_model.branch(target, base, keep_prob=1.0) 50 | 51 | # create a saver. 
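# (added note) tf.trainable_variables() below covers the shared "Term" weights
# plus one "Branch" head per target, so the single checkpoint
# ckpt_files/model.ckpt-40000 restored here serves all eight kinase targets.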
52 | saver = tf.train.Saver(tf.trainable_variables()) 53 | 54 | # Start screen 55 | prediction_dict = dict() 56 | config=tf.ConfigProto(allow_soft_placement=True) 57 | config.gpu_options.per_process_gpu_memory_fraction = 0.2 58 | with tf.Session(config=config) as sess: 59 | # Restores variables from checkpoint 60 | saver.restore(sess, "ckpt_files/model.ckpt-40000") 61 | 62 | 63 | #for i in xrange(1, 121225001, 25000): 64 | begin_num = part_num * 10000000 + 1 65 | if part_num == 11: 66 | end_num = 121225001 67 | else: 68 | end_num = (part_num + 1) * 10000000 + 1 69 | 70 | for i in xrange(begin_num, end_num, 25000): 71 | start_time = float(time.time()) 72 | # get input compounds 73 | in_file = "Compound_" + "{:0>9}".format(i) + "_" + "{:0>9}".format(i + 24999) + ".pkl" 74 | if not os.path.exists(os.path.join(pkl_dir, in_file)): 75 | logfile.write("%s\t0\tnot exists" % in_file) 76 | continue 77 | infile = open(os.path.join(pkl_dir, in_file), "rb") 78 | data = cPickle.load(infile) 79 | numpy.clip(data, 0, 1, out=data) 80 | compds = data.astype(numpy.float32) 81 | infile.close() 82 | for target in target_list: 83 | prediction_dict[target] = sess.run(tf.argmax(softmax_dict[target], 1), feed_dict = {input_placeholder: compds}) 84 | 85 | # stack prediction result into a matrix with shape = (num_compds, num_targets) 86 | prediction = numpy.vstack([prediction_dict[k] for k in target_list]).T 87 | logfile.write("%s\t%s\t%d\n" % (in_file, prediction.sum(axis=0), compds.shape[0])) 88 | # convert into sparse matrix 89 | if not prediction.sum()==0: 90 | sparse_prediction = sparse.csr_matrix(prediction) 91 | # save result into file 92 | out_file = in_file.replace("pkl", "prediction") 93 | outfile = open(os.path.join(prediction_dir, out_file), "wb") 94 | cPickle.dump(sparse_prediction, outfile, protocol=2) 95 | outfile.close() 96 | #logfile.write(str(sparse_prediction)+"\n") 97 | print("%s\t%s\t%d\t%.3f" % (in_file, prediction.sum(axis=0), compds.shape[0], time.time()-start_time)) 98 | logfile.write("virtual screen %d ends at: %s\n" % (part_num, datetime.datetime.now())) 99 | logfile.close() 100 | 101 | 102 | # analyse vs result 103 | def analyse_vs_result(): 104 | prediction_dir = "/raid/xiaotaw/pubchem/prediction_files" 105 | mgfp_dir = "/raid/xiaotaw/pubchem/morgan_fp" 106 | 107 | cid_list = [] 108 | result_list = [] 109 | 110 | for i in xrange(1, 121225001, 25000): 111 | 112 | #for i in xrange(1, 125001, 25000): 113 | 114 | # load data from prediction file 115 | pre_file = "Compound_" + "{:0>9}".format(i) + "_" + "{:0>9}".format(i + 24999) + ".prediction" 116 | pre_filepath = os.path.join(prediction_dir, pre_file) 117 | if not os.path.exists(pre_filepath): 118 | continue 119 | prefile = open(pre_filepath, "rb") 120 | sp = cPickle.load(prefile) 121 | prefile.close() 122 | 123 | # get potential hit compounds' index 124 | index, _ = sp.nonzero() 125 | index = sorted(list(set(index))) 126 | # get potential hit compounds' prediction result 127 | result = sp.toarray()[index] 128 | 129 | # get potential hit compounds' cids from mgfp file 130 | mgfp_file = pre_file.replace("prediction", "mgfp") 131 | mgfp_filepath = os.path.join(mgfp_dir, mgfp_file) 132 | mgfpfile = open(mgfp_filepath, "r") 133 | lines = mgfpfile.readlines() 134 | mgfpfile.close() 135 | cid = [lines[x].split("\t")[0] for x in index] 136 | 137 | # append each file to 138 | cid_list.extend(cid) 139 | result_list.append(result) 140 | 141 | print("%s\t%d" % (pre_file, len(index))) 142 | 143 | results_pre = numpy.vstack(result_list) 144 | 
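# (added note) results_pre has shape (num_hits, len(target_list)): one 0/1
# prediction column per target, in target_list order (cdk2 ... vegfr2); the
# compound CIDs are prepended below to give rows of
# [cid, pred_cdk2, ..., pred_vegfr2].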
results_cid = numpy.array(cid_list, dtype=numpy.int) 145 | results = numpy.hstack([results_cid.reshape(len(cid_list), 1), results_pre]) 146 | 147 | outfile = open("vs_pubchem.result", "wb") 148 | cPickle.dump(results, outfile, protocol=2) 149 | outfile.close() 150 | 151 | return results 152 | 153 | 154 | 155 | def get_chembl_pos(target_list): 156 | mgfp_dir = "data_files/mgfp_files/" 157 | cid_dir = "data_files/id_files/" 158 | 159 | def get_cids(target): 160 | tmp_list = list() 161 | infile = open(mgfp_dir + target + ".mgfp6", "r") 162 | lines = infile.readlines() 163 | infile.close() 164 | lines = [x.split("\t") for x in lines] 165 | infile = open(cid_dir + target + ".cids", "r") 166 | cids = [x.split("\t")[1] for x in infile.readlines()] 167 | 168 | for i in range(len(lines)): 169 | line = lines[i] 170 | if line[1] == "1": 171 | tmp_list.append(cids[i]) 172 | return tmp_list 173 | 174 | 175 | pos_cid_dict = dict() 176 | for target in target_list: 177 | pos_cid_dict[target] = set(get_cids(target)) 178 | 179 | return pos_cid_dict 180 | 181 | 182 | 183 | 184 | if __name__ == "__main__": 185 | 186 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", 187 | "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 188 | 189 | #virtual_screening(target_list, int(sys.argv[1])) 190 | 191 | 192 | 193 | 194 | 195 | 196 | """ 197 | import virtual_screen_pubchem as vsp 198 | import cPickle 199 | 200 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", 201 | "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 202 | 203 | f = open("vs_pubchem.result", "r") 204 | results = cPickle.load(f) 205 | f.close() 206 | 207 | pos_cid_dict = vsp.get_chembl_pos(target_list) 208 | 209 | # test cdk2 210 | cdk2_vs = [results[i, 0] for i in range(results.shape[0]) if results[i, 1]==1] 211 | vs = set(cdk2_vs) 212 | cdk2_re = [int(x) for x in pos_cid_dict["cdk2"]] 213 | re = set(cdk2_re) 214 | len(list(vs | re)) 215 | 216 | 217 | 218 | 219 | 220 | 221 | """ 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /dnn_model/single_eval.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Aug 2016 3 | # Time Last Updated: Oct 2016 4 | # Addr: Shenzhen, China 5 | # Description: evaluate pk model for a single target 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import sys 13 | import time 14 | import datetime 15 | import numpy as np 16 | import tensorflow as tf 17 | 18 | from matplotlib import pyplot as plt 19 | from sklearn.metrics import roc_curve, auc 20 | 21 | import dnn_model 22 | sys.path.append("/home/scw4750/Documents/chembl/data_files/") 23 | import chembl_input as ci 24 | 25 | 26 | eval_batch_size = 1024 27 | 28 | 29 | def evaluate(target, g_step_list=None, gpu_num=0, 30 | keep_prob=0.8, wd=0.004, batch_size=128): 31 | """ evaluate the model 32 | """ 33 | # dataset 34 | d = ci.Dataset(target) 35 | # learning rate 36 | step_per_epoch = int(d.train_size / batch_size) 37 | # input vec_len 38 | input_vec_len = d.num_features 39 | # checkpoint file 40 | ckpt_dir = "ckpt_files/%s" % target 41 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd)) 42 | 43 | # pred file 44 | pred_dir = 
"pred_files/%s" % target 45 | if not os.path.exists(pred_dir): 46 | os.mkdir(pred_dir) 47 | 48 | print("%s eval starts at: %s\n" % (target, datetime.datetime.now())) 49 | 50 | # g_step_list 51 | #g_step_list = range(1, 2235900, 10 * step_per_epoch) 52 | #g_step_list.append(2235900) 53 | 54 | with tf.Graph().as_default(), tf.device("/gpu: %d" % gpu_num): 55 | # build the model 56 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 57 | label_placeholder = tf.placeholder(tf.float32, shape = (None, 2)) 58 | # build the "Tree" with a mutual "Term" and several "Branches" 59 | base = dnn_model.term(input_placeholder, in_units=input_vec_len, wd=wd, keep_prob=1.0) 60 | # compute softmax 61 | softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0) 62 | 63 | # create a saver. 64 | saver = tf.train.Saver(tf.trainable_variables()) 65 | # create session. 66 | config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 67 | config.gpu_options.per_process_gpu_memory_fraction = 0.2 68 | sess = tf.Session(config=config) 69 | 70 | 71 | for g_step in g_step_list: 72 | # Restores variables from checkpoint 73 | saver.restore(sess, ckpt_path + "-%d" % g_step) 74 | 75 | # the whole pns 76 | pns_pred_file = open(pred_dir + "/pns_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step), "w") 77 | for ids, features, mask in d.batch_generator_pns(eval_batch_size): 78 | sm = sess.run(softmax, feed_dict={input_placeholder: features}) 79 | for i, s, m in zip(ids, sm[:, 1], mask): 80 | pns_pred_file.write("%s\t%f\t%d\n" % (i, s, m)) 81 | pns_pred_file.close() 82 | 83 | # the whole cns 84 | cns_pred_file = open(pred_dir + "/cns_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step), "w") 85 | for ids, features, mask in d.batch_generator_cns(eval_batch_size): 86 | sm = sess.run(softmax, feed_dict={input_placeholder: features}) 87 | for i, s, m in zip(ids, sm[:, 1], mask): 88 | cns_pred_file.write("%s\t%f\t%d\n" % (i, s, m)) 89 | cns_pred_file.close() 90 | 91 | # the target's train 92 | train_pred_file = open(pred_dir + "/train_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step), "w") 93 | sm = sess.run(softmax, feed_dict={input_placeholder: d.target_features_train.toarray()}) 94 | for i, s, m in zip(d.target_ids_train, sm[:, 1], d.target_labels_train): 95 | train_pred_file.write("%s\t%f\t%d\n" % (i, s, m)) 96 | train_pred_file.close() 97 | 98 | # the target's test 99 | test_pred_file = open(pred_dir + "/test_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step), "w") 100 | sm = sess.run(softmax, feed_dict={input_placeholder: d.target_features_test.toarray()}) 101 | for i, s, m in zip(d.target_ids_test, sm[:, 1], d.target_labels_test): 102 | test_pred_file.write("%s\t%f\t%d\n" % (i, s, m)) 103 | test_pred_file.close() 104 | 105 | print("eval ends at: %s\n" % datetime.datetime.now()) 106 | 107 | 108 | def test(target, g_step): 109 | # dataset 110 | d = ci.DatasetTarget(target) 111 | # batch size 112 | batch_size = 128 113 | # keep prob 114 | keep_prob = 0.8 115 | # weight decay 116 | wd = 0.004 117 | # checkpoint file 118 | ckpt_dir = "ckpt_files/%s" % target 119 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd)) 120 | # input vec_len 121 | input_vec_len = d.num_features 122 | 123 | with tf.Graph().as_default(), tf.device("/gpu:3"): 124 | # build the model 125 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 126 | 
label_placeholder = tf.placeholder(tf.float32, shape = (None, 2)) 127 | # build the "Tree" with a mutual "Term" and several "Branches" 128 | base = dnn_model.term(input_placeholder, in_units=input_vec_len, wd=wd, keep_prob=1.0) 129 | # compute softmax 130 | softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0) 131 | # compute loss. 132 | wd_loss = tf.add_n(tf.get_collection("term_wd_loss") + tf.get_collection(target+"_wd_loss")) 133 | x_entropy = dnn_model.x_entropy(softmax, label_placeholder, target) 134 | loss = tf.add(wd_loss, x_entropy) 135 | # create a saver. 136 | saver = tf.train.Saver(tf.trainable_variables()) 137 | # create session. 138 | config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 139 | config.gpu_options.per_process_gpu_memory_fraction = 0.2 140 | sess = tf.Session(config=config) 141 | 142 | saver.restore(sess, ckpt_path + "-%d" % g_step) 143 | sm = sess.run(softmax, feed_dict = {input_placeholder: d.target_features_test.toarray()}) 144 | 145 | fpr, tpr, _ = roc_curve(d.target_labels_test, sm[:, 1]) 146 | roc_auc = auc(fpr, tpr) 147 | plt.figure() 148 | plt.plot(fpr, tpr, color="r", lw=2, label="ROC curve (area = %.2f)" % roc_auc) 149 | plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--") 150 | plt.xlim([0.0, 1.0]) 151 | plt.ylim([0.0, 1.05]) 152 | plt.xlabel("False Positive Rate") 153 | plt.ylabel("True Positive Rate") 154 | plt.title("Receiver operating characteristic of DNN model on %s" % target) 155 | plt.legend(loc="lower right") 156 | plt.savefig("%s.png" % target) 157 | #plt.show() 158 | 159 | 160 | 161 | if __name__ == "__main__": 162 | # the newly picked out 15 targets, include 9 targets from 5 big group, and 6 targets from others. 163 | target_list = ["CHEMBL279", "CHEMBL203", # Protein Kinases 164 | "CHEMBL217", "CHEMBL253", # GPCRs (Family A) 165 | "CHEMBL235", "CHEMBL206", # Nuclear Hormone Receptors 166 | "CHEMBL240", "CHEMBL4296", # Voltage Gated Ion Channels 167 | "CHEMBL4805", # Ligand Gated Ion Channels 168 | "CHEMBL204", "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 169 | ] 170 | 171 | target_list = ["CHEMBL203", "CHEMBL204", "CHEMBL205", 172 | "CHEMBL206", "CHEMBL217", "CHEMBL235", "CHEMBL240", 173 | "CHEMBL244", "CHEMBL253", "CHEMBL279", "CHEMBL340", 174 | "CHEMBL4005", "CHEMBL4296", "CHEMBL4805", "CHEMBL4822", 175 | ] 176 | 177 | g_list = [2161371, 2236500, 2235600, 178 | 2091321, 2161661, 2086841, 2020411, 179 | 2161951, 2012041, 2161661, 2246400, 180 | 2235900, 2238000, 2168041, 1936221 181 | ] 182 | 183 | #i = int(sys.argv[1]) 184 | #target = target_list[i] 185 | #g_step = g_list[i] 186 | #evaluate(target=target, g_step_list=[g_step], gpu_num=i % 4) 187 | evaluate(target="CHEMBL4722", g_step_list=[2242500], gpu_num=0) 188 | #test(target, g_step, ) 189 | 190 | 191 | 192 | -------------------------------------------------------------------------------- /dnn_model/pk_train.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Aug 2016 3 | # Time Last Updated: Nov 2016 4 | # Addr: Shenzhen, China 5 | # Description: train pk model 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import time 13 | import datetime 14 | import math 15 | import numpy 16 | import random 17 | import tensorflow as tf 18 | 19 | import pk_input as pki 20 | import dnn_model 21 | 22 | 23 | def 
train(target_list, train_from = 0): 24 | 25 | # dataset 26 | d = pki.Datasets(target_list) 27 | 28 | # batch size. 29 | # note: the mean number of neg sample is 25.23 times as many as pos's. 30 | neg_batch_size = 512 31 | pos_batch_size_dict = {} 32 | pos_sum = 0 33 | for target in target_list: 34 | pos_sum += d.pos[target].size 35 | pos_batch_size = int(neg_batch_size * pos_sum / d.neg.size) 36 | for target in target_list: 37 | pos_batch_size_dict[target] = int(neg_batch_size * d.pos[target].size / d.neg.size) 38 | #pos_batch_size_dict[target] = pos_batch_size 39 | # learning rate 40 | step_per_epoch = int(d.neg.size / neg_batch_size) 41 | start_learning_rate = 0.05 42 | decay_step = step_per_epoch * 10 * 8 43 | decay_rate = 0.9 44 | # max train steps 45 | max_step = 50 * step_per_epoch 46 | # input vec_len 47 | input_vec_len = d.neg.features.shape[1] 48 | # keep prob 49 | keep_prob = 0.8 50 | # weight decay 51 | wd = 0.001 52 | # checkpoint file 53 | ckpt_dir = "ckpt_files_big_tree/pk" 54 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (neg_batch_size, keep_prob, wd)) 55 | if not os.path.exists(ckpt_dir): 56 | os.makedirs(ckpt_dir) 57 | # train log file 58 | log_dir = "log_files_big_tree" 59 | if not os.path.exists(log_dir): 60 | os.mkdir(log_dir) 61 | log_path = os.path.join(log_dir, "train_pk_%d_%4.3f_%4.3e.log" % (neg_batch_size, keep_prob, wd)) 62 | logfile = open(log_path, 'w') 63 | logfile.write("train starts at: %s\n" % datetime.datetime.now()) 64 | 65 | 66 | # train the model 67 | with tf.Graph().as_default(), tf.device("/gpu:0"): 68 | 69 | # exponential decay learning rate 70 | global_step = tf.Variable(train_from, trainable=False) 71 | learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, decay_step, decay_rate) 72 | 73 | # build the model 74 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 75 | label_placeholder = tf.placeholder(tf.float32, shape = (None, 2)) 76 | # build the "Tree" with a mutual "Term" and several "Branches" 77 | base = dnn_model.term(input_placeholder, wd=wd, keep_prob=keep_prob) 78 | softmax_dict = dict() 79 | wd_loss_dict = dict() 80 | x_entropy_dict = dict() 81 | loss_dict = dict() 82 | accuracy_dict = dict() 83 | train_op_dict = dict() 84 | for target in target_list: 85 | # compute softmax 86 | softmax_dict[target] = dnn_model.branch(target, base, wd=wd, keep_prob=keep_prob) 87 | # compute loss. 88 | wd_loss_dict[target] = tf.add_n(tf.get_collection("term_wd_loss") + tf.get_collection(target+"_wd_loss")) 89 | x_entropy_dict[target] = dnn_model.x_entropy(softmax_dict[target], label_placeholder, target) 90 | loss_dict[target] = tf.add(wd_loss_dict[target], x_entropy_dict[target]) 91 | # compute accuracy 92 | accuracy_dict[target] = dnn_model.accuracy(softmax_dict[target], label_placeholder, target) 93 | # train op 94 | train_op_dict[target] = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss_dict[target], global_step=global_step) 95 | # create a saver. 96 | saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=None) 97 | # start running operations on the Graph. 98 | config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 99 | config.gpu_options.per_process_gpu_memory_fraction = 0.8 100 | sess = tf.Session(config=config) 101 | # initialize all variables at first. 
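# (added note) tf.initialize_all_variables() here and tf.mul() in dnn_model.py
# are the pre-1.0 TensorFlow API; on TF >= 1.0 they were renamed
# tf.global_variables_initializer() and tf.multiply().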
102 | sess.run(tf.initialize_all_variables()) 103 | if train_from != 0: 104 | saver.restore(sess, ckpt_path + "-%d" % train_from) 105 | # print title to screen and log file 106 | title_str = " step g_step wdloss xloss learn_rate TP FN TN FP SEN SPE ACC MCC t1-t0 t2-t1 t3-t2 target" 107 | print(title_str) 108 | logfile.write(title_str + "\n") 109 | 110 | # format str 111 | format_str = "%6d %6d %6.4f %7.5f %10.8f %5d %5d %5d %5d %6.4f %6.4f %6.4f %6.4f %5.3f %5.3f %5.3f %10s " 112 | 113 | # train with max step 114 | for step in xrange(max_step): 115 | for target in target_list: 116 | t0 = time.time() 117 | 118 | # get a batch sample 119 | compds_batch, labels_batch = d.next_train_batch(target, pos_batch_size_dict[target], neg_batch_size) 120 | t1 = float(time.time()) 121 | 122 | _ = sess.run(train_op_dict[target], feed_dict={input_placeholder: compds_batch, label_placeholder: labels_batch}) 123 | t2 = float(time.time()) 124 | 125 | # compute performance 126 | # (once per epoch, and at the final step) 127 | if step % step_per_epoch == 0 or (step + 1) == max_step: 128 | g_step, wd_ls, x_ls, lr, acc, pred, label_dense = sess.run([global_step, wd_loss_dict[target], x_entropy_dict[target], learning_rate, accuracy_dict[target], tf.argmax(softmax_dict[target], 1), tf.argmax(labels_batch, 1)], 129 | feed_dict = {input_placeholder: compds_batch, label_placeholder: labels_batch}) 130 | tp, tn, fp, fn, sen, spe, mcc = dnn_model.compute_performance(label_dense, pred) 131 | t3 = float(time.time()) 132 | # print to file and screen 133 | 134 | logfile.write(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 135 | logfile.write('\n') 136 | print(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 137 | 138 | 139 | # save the model checkpoint periodically.
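# note: given step_per_epoch = d.neg.size // neg_batch_size, the condition
# below saves a checkpoint every 10 epochs and once at the final step; since
# the saver was created with max_to_keep=None, all of these checkpoints are
# kept on disk.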
140 | if step % (10 * step_per_epoch) == 0 or (step + 1) == max_step: 141 | saver.save(sess, ckpt_path, global_step=global_step, write_meta_graph=False) 142 | 143 | if (step > 3 * 10 * step_per_epoch) and (step % (10 * step_per_epoch) == 0 or (step + 1) == max_step): 144 | for target in target_list: 145 | # evaluate on the whole train set 146 | t0 = time.time() 147 | compds_batch = numpy.vstack([d.pos[target].features[d.pos[target].train_perm], d.neg.features[d.neg.train_perm]]) 148 | labels_batch = numpy.vstack([d.pos[target].labels[d.pos[target].train_perm], d.neg.mask_dict[target][d.neg.train_perm]]) 149 | t1 = time.time() 150 | t2 = time.time() 151 | g_step, wd_ls, x_ls, lr, acc, pred, label_dense = sess.run([global_step, wd_loss_dict[target], x_entropy_dict[target], learning_rate, accuracy_dict[target], tf.argmax(softmax_dict[target], 1), tf.argmax(labels_batch, 1)], 152 | feed_dict = {input_placeholder: compds_batch, label_placeholder: labels_batch}) 153 | t3 = float(time.time()) 154 | tp, tn, fp, fn, sen, spe, mcc = dnn_model.compute_performance(label_dense, pred) 155 | # print to file and screen 156 | logfile.write(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 157 | logfile.write('\n') 158 | print(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 159 | 160 | # evaluate on the whole test set 161 | t0 = time.time() 162 | compds_batch = numpy.vstack([d.pos[target].features[d.pos[target].test_perm], d.neg.features[d.neg.test_perm]]) 163 | labels_batch = numpy.vstack([d.pos[target].labels[d.pos[target].test_perm], d.neg.mask_dict[target][d.neg.test_perm]]) 164 | t1 = time.time() 165 | t2 = time.time() 166 | g_step, wd_ls, x_ls, lr, acc, pred, label_dense = sess.run([global_step, wd_loss_dict[target], x_entropy_dict[target], learning_rate, accuracy_dict[target], tf.argmax(softmax_dict[target], 1), tf.argmax(labels_batch, 1)], 167 | feed_dict = {input_placeholder: compds_batch, label_placeholder: labels_batch}) 168 | t3 = float(time.time()) 169 | tp, tn, fp, fn, sen, spe, mcc = dnn_model.compute_performance(label_dense, pred) 170 | # print to file and screen 171 | logfile.write(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 172 | logfile.write('\n') 173 | print(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe, acc, mcc, t1-t0, t2-t1, t3-t2, target)) 174 | 175 | 176 | logfile.write("train ends at: %s\n" % datetime.datetime.now()) 177 | logfile.close() 178 | 179 | 180 | 181 | if __name__ == "__main__": 182 | 183 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", 184 | "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 185 | 186 | train(target_list, train_from=0) 187 | 188 | 189 | 190 | -------------------------------------------------------------------------------- /dnn_model/single_vs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Author: xiaotaw@qq.com (Any bug report is welcome) 3 | # Time: Aug 2016 4 | # Addr: Shenzhen 5 | # Description: apply the pk model to the pubchem dataset, to screen for potentially active substrates (drugs) 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import sys 13 | import time 14 | import numpy as np 15 | import datetime 16 | import tensorflow as tf 17 | from matplotlib import pyplot as plt 18 | 19 | import dnn_model 20 |
sys.path.append("/home/scw4750/Documents/chembl/data_files/") 21 | import chembl_input as ci 22 | 23 | 24 | 25 | 26 | 27 | def virtual_screening_single(target, g_step, part_num, gpu_num): 28 | t_0 = time.time() 29 | 30 | # dataset 31 | d = ci.DatasetVS(target) 32 | # batch size 33 | batch_size = 128 34 | # input vec_len 35 | input_vec_len = d.num_features 36 | # keep prob 37 | keep_prob = 0.8 38 | # weight decay 39 | wd = 0.004 40 | # g_step 41 | #g_step = 2236500 42 | 43 | # virtual screen pred file 44 | pred_dir = "pred_files/%s" % target 45 | if not os.path.exists(pred_dir): 46 | os.makedirs(pred_dir) 47 | pred_path = os.path.join(pred_dir, "vs_pubchem_%s_%d_%4.3f_%4.3e_%d_%d.pred" % (target, batch_size, keep_prob, wd, g_step, part_num)) 48 | predfile = open(pred_path, 'w') 49 | print("virtual screen %d starts at: %s\n" % (part_num, datetime.datetime.now())) 50 | 51 | # checkpoint file 52 | ckpt_dir = "ckpt_files/%s" % target 53 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd)) 54 | 55 | # input fp dir 56 | fp_dir = "/raid/xiaotaw/pubchem/fp_files/%d" % part_num 57 | 58 | # screening 59 | with tf.Graph().as_default(), tf.device("/gpu:%d" % gpu_num): 60 | #with tf.Graph().as_default(), tf.device("/gpu:%d" % (part_num % 4)): 61 | # the input 62 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 63 | # the term 64 | base = dnn_model.term(input_placeholder, in_units=input_vec_len, wd=wd, keep_prob=1.0) 65 | # the branches 66 | softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0) 67 | # create a saver. 68 | saver = tf.train.Saver(tf.trainable_variables()) 69 | # start screening 70 | config=tf.ConfigProto(allow_soft_placement=True) 71 | config.gpu_options.per_process_gpu_memory_fraction = 0.35 72 | with tf.Session(config=config) as sess: 73 | # Restores variables from checkpoint 74 | saver.restore(sess, ckpt_path + "-%d" % g_step) 75 | for i in xrange(part_num * 10000000 + 1, (part_num + 1) * 10000000, 25000): 76 | in_file = "Compound_" + "{:0>9}".format(i) + "_" + "{:0>9}".format(i + 24999) + ".apfp" 77 | fp_fn = os.path.join(fp_dir, in_file) 78 | if not os.path.exists(fp_fn): 79 | print("%s does not exist" % fp_fn) 80 | continue 81 | d.reset(fp_fn) 82 | compds = d.features_dense 83 | sm = sess.run(softmax, feed_dict = {input_placeholder: compds}) 84 | for id_, sm_v in zip(d.pubchem_id, sm[:, 1]): 85 | predfile.writelines("%s\t%f\n" % (id_, sm_v)) 86 | print("%s\t%d\n" % (fp_fn, len(d.pubchem_id))) 87 | 88 | print("duration: %.3f" % (time.time() - t_0)) 89 | 90 | 91 | """ 92 | def predict(target, g_step_list=None): 93 | # dataset 94 | d = ci.Dataset(target) 95 | # batch size 96 | batch_size = 128 97 | # learning rate 98 | step_per_epoch = int(d.train_size / batch_size) 99 | # input vec_len 100 | input_vec_len = d.train_features.shape[1] 101 | # keep prob 102 | keep_prob = 0.8 103 | # weight decay 104 | wd = 0.004 105 | # checkpoint file 106 | ckpt_dir = "ckpt_files/%s" % target 107 | ckpt_path = os.path.join(ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd)) 108 | # pred file 109 | pred_dir = "pred_files/%s" % target 110 | if not os.path.exists(pred_dir): 111 | os.makedirs(pred_dir) 112 | 113 | # g_step_list 114 | #g_step_list = range(1, 2235900, 10 * step_per_epoch) 115 | #g_step_list = [2161371] 116 | 117 | with tf.Graph().as_default(), tf.device("/gpu:3"): 118 | 119 | # build the model 120 | input_placeholder = tf.placeholder(tf.float32, shape = (None, input_vec_len)) 121 | label_placeholder =
tf.placeholder(tf.float32, shape = (None, 2)) 122 | # build the "Tree" with a mutual "Term" and several "Branches" 123 | base = dnn_model.term(input_placeholder, in_units=input_vec_len, wd=wd, keep_prob=1.0) 124 | # compute softmax 125 | softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0) 126 | # compute loss. 127 | wd_loss = tf.add_n(tf.get_collection("term_wd_loss") + tf.get_collection(target+"_wd_loss")) 128 | x_entropy = dnn_model.x_entropy(softmax, label_placeholder, target) 129 | loss = tf.add(wd_loss, x_entropy) 130 | # create a saver. 131 | saver = tf.train.Saver(tf.trainable_variables()) 132 | # create session. 133 | config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 134 | config.gpu_options.per_process_gpu_memory_fraction = 0.9 135 | sess = tf.Session(config=config) 136 | 137 | # target test 138 | test_chemblid = d.time_split_test["CMPD_CHEMBLID"] 139 | test_compds = d.test_features.toarray() 140 | test_labels_dense = d.test_labels 141 | 142 | # target train 143 | time_split_train = d.target_clf_label[d.target_clf_label["YEAR"] <= 2014] 144 | target_train_chemblid = time_split_train["CMPD_CHEMBLID"] 145 | m = d.target_cns_mask.index.isin(time_split_train["CMPD_CHEMBLID"]) 146 | target_train_features = d.target_cns_features[m].toarray() 147 | target_train_labels_dense = d.target_cns_mask[m].values.astype(int) 148 | 149 | for g_step in g_step_list: 150 | # Restores variables from checkpoint 151 | saver.restore(sess, ckpt_path + "-%d" % g_step) 152 | 153 | # the target's test 154 | sm = sess.run(softmax, feed_dict = {input_placeholder: test_compds}) 155 | 156 | test_pred_path = os.path.join(pred_dir, "test_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step)) 157 | test_pred_file = open(test_pred_path, 'w') 158 | 159 | for id_, sm_v, l_v in zip(test_chemblid, sm[:, 1], test_labels_dense): 160 | test_pred_file.writelines("%s\t%f\t%f\n" % (id_, sm_v, l_v)) 161 | 162 | test_pred_file.close() 163 | 164 | # the target's train 165 | sm = sess.run(softmax, feed_dict = {input_placeholder: target_train_features}) 166 | 167 | train_pred_path = os.path.join(pred_dir, "train_%s_%d_%4.3f_%4.3e_%d.pred" % (target, batch_size, keep_prob, wd, g_step)) 168 | train_pred_file = open(train_pred_path, 'w') 169 | 170 | for id_, sm_v, l_v in zip(target_train_chemblid, sm[:, 1], target_train_labels_dense): 171 | train_pred_file.writelines("%s\t%f\t%f\n" % (id_, sm_v, l_v)) 172 | 173 | train_pred_file.close() 174 | """ 175 | 176 | def analyse(target, g_step): 177 | vs_pred_file = "pred_files/%s/vs_pubchem_%s_128_0.800_4.000e-03_%d.pred" % (target, target, g_step) 178 | aa = np.genfromtxt(vs_pred_file, delimiter="\t") 179 | a = aa[:, 1] 180 | 181 | test_pred_file = "pred_files/%s/test_%s_128_0.800_4.000e-03_%d.pred" % (target, target, g_step) 182 | bb = np.genfromtxt(test_pred_file, delimiter="\t", usecols=[1,2]) 183 | b = bb[:, 0][bb[:, 1].astype(bool)] 184 | 185 | """ 186 | train_pred_file = "pred_files/%s/train_%s_128_0.800_4.000e-03_%d.pred" % (target, target, g_step) 187 | cc = np.genfromtxt(train_pred_file, delimiter="\t", usecols=[1,2]) 188 | c = cc[:, 0][cc[:, 1].astype(bool)] 189 | 190 | bhist = plt.hist(b, bins=100, range=(0, 1), cumulative=False, histtype="stepfilled", ) 191 | plt.hist(b, bins=100, range=(0, 1), cumulative=True, histtype="step", ) 192 | plt.show() 193 | 194 | 195 | chist = plt.hist(c, bins=100, cumulative=False, histtype="stepfilled", ) 196 | plt.hist(c, bins=100, cumulative=True, histtype="step", ) 197 | plt.show() 198 | 
valid histtype values: "bar", "barstacked", "step", "stepfilled" 200 | 201 | 202 | """ 203 | 204 | x = [] 205 | y = [] 206 | for i in range(10): 207 | mark = (i + 1) / 20.0 208 | xi = 1.0 * (b > mark).sum() / b.shape[0] 209 | yi = (a > mark).sum() 210 | x.append(xi) 211 | y.append(yi) 212 | 213 | plt.plot(x, y, "*") 214 | plt.xlabel("pos yield rate") 215 | plt.ylabel("vs pubchem false pos") 216 | 217 | plt.savefig("pred_files/%s/%d.png" % (target, g_step)) 218 | 219 | 220 | 221 | 222 | 223 | if __name__ == "__main__": 224 | # the newly picked out 15 targets, including 9 targets from 5 big groups and 6 targets from others. 225 | target_list = ["CHEMBL279", "CHEMBL203", # Protein Kinases 226 | "CHEMBL217", "CHEMBL253", # GPCRs (Family A) 227 | "CHEMBL235", "CHEMBL206", # Nuclear Hormone Receptors 228 | "CHEMBL240", "CHEMBL4296", # Voltage Gated Ion Channels 229 | "CHEMBL4805", # Ligand Gated Ion Channels 230 | "CHEMBL204", "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 231 | ] 232 | 233 | # the target 234 | target = "CHEMBL4722" 235 | 236 | # part_num ranges from 0 to 12 (inclusive) 237 | #for i in range(9, 13): 238 | # virtual_screening_single(target, 2260800, i, 3) 239 | 240 | #predict(target, g_step_list=[2252100]) 241 | 242 | 243 | analyse(target, g_step=2242500) 244 | 245 | 246 | 247 | -------------------------------------------------------------------------------- /data_files/chembl_cal_mask.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Dec 2016 3 | # Time Last Updated: Dec 2016 4 | # Addr: Shenzhen, China 5 | # Description: calculate the mask (label) of chembl molecules for specific targets 6 | 7 | import os 8 | import sys 9 | import math 10 | import time 11 | import datetime 12 | import multiprocessing 13 | import numpy as np 14 | from scipy import sparse 15 | from collections import defaultdict 16 | 17 | # folders 18 | fp_dir = "fp_files" 19 | structure_dir = "structure_files" 20 | mask_dir = "mask_files" 21 | if not os.path.exists(mask_dir): 22 | os.mkdir(mask_dir) 23 | log_dir = "log_files" 24 | if not os.path.exists(log_dir): 25 | os.mkdir(log_dir) 26 | 27 | 28 | # the newly picked out 15 targets, including 9 targets from 5 big groups and 6 targets from others.
29 | target_list = ["CHEMBL279", "CHEMBL203", # Protein Kinases 30 | "CHEMBL217", "CHEMBL253", # GPCRs (Family A) 31 | "CHEMBL235", "CHEMBL206", # Nuclear Hormone Receptors 32 | "CHEMBL240", "CHEMBL4296", # Voltage Gated Ion Channels 33 | "CHEMBL4805", # Ligand Gated Ion Channels 34 | "CHEMBL204", "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 35 | ] 36 | 37 | # the target 38 | #target = target_list[int(sys.argv[1])] 39 | 40 | # read chembl id and apfp 41 | chembl_id = [] 42 | chembl_apfp = {} 43 | f = open(os.path.join(fp_dir, "chembl.apfp"), "r") 44 | for line in f: 45 | id_, fps_str = line.split("\t") 46 | id_ = id_.strip() 47 | fps_str = fps_str.strip() 48 | chembl_id.append(id_) 49 | chembl_apfp[id_] = fps_str 50 | 51 | f.close() 52 | 53 | # read pns (pubchem negative sample) apfp, and count the fps that appear in pns compounds 54 | pns_id = [] 55 | pns_apfp = {} 56 | pns_count = defaultdict(lambda : 0) 57 | f = open(os.path.join(fp_dir, "pubchem_neg_sample.apfp"), "r") 58 | for line in f: 59 | id_, fps_str = line.split("\t") 60 | id_ = id_.strip() 61 | fps_str = fps_str.strip() 62 | pns_id.append(id_) 63 | pns_apfp[id_] = fps_str 64 | for fp in fps_str[1:-1].split(","): 65 | if ":" in fp: 66 | k, _ = fp.split(":") 67 | pns_count[int(k)] += 1 68 | 69 | f.close() 70 | 71 | 72 | # read top 79 targets' label 73 | clf_label_79 = np.genfromtxt(os.path.join(structure_dir, "chembl_top79.label"), usecols=[0, 2, 3], delimiter="\t", skip_header=1, dtype=str) 74 | 75 | def cal_mask(target): 76 | ################################################################################ 77 | # generate sparse matrix for target features 78 | 79 | # target compounds' chembl_id and clf label. 80 | target_clf_label = clf_label_79[clf_label_79[:, 0] == target] 81 | 82 | # remove compounds whose apfp cannot be calculated 83 | m = [] 84 | for cmpd_id in target_clf_label[:, 1]: 85 | if cmpd_id in chembl_id: 86 | m.append(True) 87 | else: 88 | m.append(False) 89 | target_clf_label = target_clf_label[np.array(m)] 90 | 91 | # target fps 92 | target_fps = [chembl_apfp[x] for x in target_clf_label[:, 1]] 93 | 94 | # count the fps that appeared in the compounds of the target 95 | target_count = defaultdict(lambda : 0) 96 | for fps_str in target_fps: 97 | for fp in fps_str[1:-1].split(","): 98 | if ":" in fp: 99 | k, _ = fp.split(":") 100 | target_count[int(k)] += 1 101 | 102 | target_count.update(pns_count) 103 | 104 | # save target apfp count 105 | count_file = open(os.path.join(mask_dir, "%s_apfp.count" % target), "w") 106 | for k in target_count.keys(): 107 | count_file.write("%d\t%d\n" % (k, target_count[k])) 108 | 109 | count_file.close() 110 | 111 | # pick out the fps that appeared more than 10 times. 112 | # Here we assume that the more frequently a fp appears, the more important it is. 113 | v = np.array([[k, target_count[k]] for k in target_count.keys()]) 114 | m = v[:, 1] > 10 115 | target_apfp_picked = v[m][:, 0] 116 | 117 | # according to the apfps that were picked out, define the columns of the feature sparse matrix 118 | # Note: a defaultdict is used. 119 | # Its purpose is to assign a default value (the length of target_apfp_picked) to any apfp 120 | # that is not included in target_apfp_picked. This column (the last column) is ultimately 121 | # not used at all.
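# a tiny worked example of the mapping built below, assuming
# target_apfp_picked = [3, 17, 42]: columns_dict[17] -> 1, while an unseen key
# such as columns_dict[999] -> 3, i.e. the extra overflow column that the
# [:, :-1] slices drop later.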
122 | columns_dict = defaultdict(lambda : len(target_apfp_picked)) 123 | for i, apfp in enumerate(target_apfp_picked): 124 | columns_dict[apfp] = i 125 | 126 | # define the function that constructs a feature sparse matrix according to the columns_dict 127 | def sparse_features(fps_list): 128 | data = [] 129 | indices = [] 130 | indptr = [0] 131 | for fps_str in fps_list: 132 | n = indptr[-1] 133 | for fp in fps_str[1:-1].split(","): 134 | if ":" in fp: 135 | k, v = fp.split(":") 136 | indices.append(columns_dict[int(k)]) 137 | data.append(int(v)) 138 | n += 1 139 | indptr.append(n) 140 | a = sparse.csr_matrix((np.array(data), indices, indptr), shape=(len(fps_list), len(target_apfp_picked) + 1)) 141 | return a 142 | 143 | # pick out target compounds with pos labels 144 | # normally, abs(clf_label) > 0.5 (refer to chembl_preparation.py), 145 | # so it also works when using the following line: 146 | # target_pos_id = target_clf_label[target_clf_label[:, 2].astype(float) > 0.5][:, 1] 147 | target_pos_id = target_clf_label[target_clf_label[:, 2].astype(float) > 0][:, 1] 148 | target_pos_fps = [chembl_apfp[x] for x in target_pos_id] 149 | 150 | # generate feature sparse matrix for target's pos compounds 151 | target_pos_features = sparse_features(target_pos_fps)[:, :-1].toarray() 152 | 153 | # generate feature sparse matrix for pns compounds 154 | target_pns_features = sparse_features([pns_apfp[k] for k in pns_id])[:, :-1] 155 | 156 | # generate feature sparse matrix for cns (chembl negative sample) compounds 157 | target_cns_features = sparse_features([chembl_apfp[k] for k in chembl_id])[:, :-1] 158 | 159 | 160 | ################################################################################ 161 | # generate mask for pns and cns 162 | 163 | # define a task function for a sub-process: 164 | # it compares a part of the negative samples (cns or pns) with the pos samples, 165 | # and returns the mask of those samples back to the main process.
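# note: sub_compare below marks a negative sample as True exactly when its
# picked-apfp feature vector is identical to that of some positive compound,
# i.e. (neg_f != pos_f).sum() == 0, so the mask flags negatives that are
# indistinguishable from actives; each sub-process sends its partial mask and
# log lines back to the parent through a multiprocessing.Pipe.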
166 | def sub_compare(sub_neg_id, sub_neg_features, conn): 167 | mask = {} 168 | log_str = [] 169 | for neg_k, neg_f in zip(sub_neg_id, sub_neg_features): 170 | for pos_k, pos_f in zip(target_pos_id, target_pos_features): 171 | if (neg_f != pos_f).sum() == 0: 172 | mask[neg_k] = True 173 | log_str.append("%s\t%s\n" % (neg_k, pos_k)) 174 | conn.send((mask, log_str)) 175 | conn.close() 176 | 177 | # the number of sub-processes for the computation 178 | n_jobs = 6 179 | 180 | 181 | # compute the mask for pns using multiprocessing 182 | t1 = time.time() 183 | date1 = datetime.datetime.now() 184 | 185 | num_per_job = int(math.ceil(target_pns_features.shape[0] / float(n_jobs))) 186 | thread_list = [] 187 | conn_list = [] 188 | for i in range(0, n_jobs): 189 | begin = i * num_per_job 190 | end = (i + 1) * num_per_job 191 | if end > target_pns_features.shape[0]: 192 | end = target_pns_features.shape[0] 193 | p_conn, c_conn = multiprocessing.Pipe() 194 | conn_list.append((p_conn, c_conn)) 195 | t = multiprocessing.Process(target=sub_compare, args=(pns_id[begin: end], target_pns_features[begin: end], c_conn)) 196 | thread_list.append(t) 197 | 198 | for i in range(n_jobs): 199 | thread_list[i].start() 200 | 201 | for i in range(n_jobs): 202 | thread_list[i].join() 203 | 204 | t2 = time.time() 205 | 206 | target_pns_mask = defaultdict(lambda : False) 207 | 208 | log = open(log_dir + "/" + target + "_gen_pns_mask.log", "w") 209 | log.write("%s generate mask for pubchem neg sample, begins at %s\n" % (target, str(date1))) 210 | 211 | for i in range(n_jobs): 212 | p_conn = conn_list[i][0] 213 | mask, log_str = p_conn.recv() 214 | target_pns_mask.update(mask) 215 | log.writelines(log_str) 216 | 217 | log.write("generate mask for pns, duration: %.3f\n" % (t2 - t1)) 218 | log.close() 219 | 220 | mask_file = open(os.path.join(mask_dir, "%s_pns.mask" % target), "w") 221 | mask_file.writelines(["%s\t%s\n" % (x, target_pns_mask[x]) for x in pns_id]) 222 | mask_file.close() 223 | 224 | print("generate mask for pns, duration: %.3f" % (t2 - t1)) 225 | 226 | 227 | # compute the mask for cns using multiprocessing 228 | t2 = time.time() 229 | date2 = datetime.datetime.now() 230 | 231 | num_per_job = int(math.ceil(target_cns_features.shape[0] / float(n_jobs))) 232 | thread_list = [] 233 | conn_list = [] 234 | for i in range(0, n_jobs): 235 | begin = i * num_per_job 236 | end = (i + 1) * num_per_job 237 | if end > target_cns_features.shape[0]: 238 | end = target_cns_features.shape[0] 239 | p_conn, c_conn = multiprocessing.Pipe() 240 | conn_list.append((p_conn, c_conn)) 241 | t = multiprocessing.Process(target=sub_compare, args=(chembl_id[begin: end], target_cns_features[begin: end], c_conn)) 242 | thread_list.append(t) 243 | 244 | for i in range(n_jobs): 245 | thread_list[i].start() 246 | 247 | for i in range(n_jobs): 248 | thread_list[i].join() 249 | 250 | t3 = time.time() 251 | 252 | target_cns_mask = defaultdict(lambda : False) 253 | 254 | log = open(log_dir + "/" + target + "_gen_cns_mask.log", "w") 255 | log.write("%s generate mask for chembl neg sample, begins at %s\n" % (target, str(date2))) 256 | 257 | for i in range(n_jobs): 258 | p_conn = conn_list[i][0] 259 | mask, log_str = p_conn.recv() 260 | target_cns_mask.update(mask) 261 | log.writelines(log_str) 262 | 263 | log.write("generate mask for cns, duration: %.3f\n" % (t3 - t2)) 264 | log.close() 265 | 266 | mask_file = open(os.path.join(mask_dir, "%s_cns.mask" % target), "w") 267 | mask_file.writelines(["%s\t%s\n" % (x, target_cns_mask[x]) for x in chembl_id]) 268 |
mask_file.close() 269 | 270 | print("generate mask for cns, duration: %.3f" % (t3 - t2)) 271 | 272 | 273 | # the remaining targets to compute masks for (a subset of the newly picked out 15 targets). 274 | target_list = [ 275 | "CHEMBL4805", # Ligand Gated Ion Channels 276 | "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 277 | ] 278 | 279 | 280 | #for target in target_list: 281 | # cal_mask(target) 282 | cal_mask(sys.argv[1]) 283 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner.
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 203 | -------------------------------------------------------------------------------- /dnn_model/pk_input.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Aug 2016 3 | # Time Last Updated: Nov 2016 4 | # Addr: Shenzhen, China 5 | # Description: define functions and parameters related to input data 6 | 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import os 13 | import h5py 14 | import time 15 | import random 16 | 17 | import numpy as np 18 | import pandas as pd 19 | 20 | from scipy import sparse 21 | 22 | vec_len = 9561 23 | data_dir = "../data_files" 24 | h5_dir = os.path.join(data_dir, "h5_files") 25 | 26 | 27 | def dense_to_one_hot(labels_dense, num_classes=2, dtype=np.int): 28 | """Convert class labels from scalars to one-hot vectors. 29 | Args: 30 | labels_dense: dense label 31 | num_classes: the number of classes in one hot label 32 | dtype: data type 33 | Return: 34 | labels_one_hot: one hot label 35 | """ 36 | num_labels = labels_dense.shape[0] 37 | index_offset = np.arange(num_labels) * num_classes 38 | labels_one_hot = np.zeros((num_labels, num_classes)) 39 | labels_one_hot.flat[index_offset + labels_dense.ravel().astype(dtype)] = 1 40 | return labels_one_hot 41 | 42 | 43 | class Dataset(object): 44 | """Base dataset class 45 | """ 46 | def __init__(self, size, is_shuffle=False, fold=10): 47 | """Constructor, create a dataset container. 48 | Args: 49 | size: the number of samples 50 | is_shuffle: whether to shuffle samples when the dataset is created 51 | fold: how many folds to split samples into 52 | Return: 53 | None 54 | """ 55 | self.size = size 56 | self.perm = np.array(range(self.size)) 57 | if is_shuffle: 58 | random.shuffle(self.perm) 59 | 60 | self.train_size = int(self.size * (1.0 - 1.0 / fold)) 61 | self.train_perm = self.perm[range(self.train_size)] 62 | self.train_begin = 0 63 | self.train_end = 0 64 | 65 | self.test_perm = self.perm[range(self.train_size, self.size)] 66 | 67 | def generate_perm_for_train_batch(self, batch_size): 68 | """Create the permutation for a batch of train samples 69 | Args: 70 | batch_size: the number of samples in the batch 71 | Return: 72 | perm: the permutation of samples which form a batch 73 | """ 74 | self.train_begin = self.train_end 75 | self.train_end += batch_size 76 | if self.train_end > self.train_size: 77 | random.shuffle(self.train_perm) 78 | self.train_begin = 0 79 | self.train_end = batch_size 80 | perm = self.train_perm[self.train_begin: self.train_end] 81 | return perm 82 | 83 | 84 | class PosDataset(Dataset): 85 | """Positive dataset class 86 | """ 87 | def __init__(self, target, one_hot=True, dtype=np.float32): 88 | """Create a positive dataset for a protein kinase target. 89 | The data is read from hdf5 files.
90 | Args: 91 | target: the protein kinase target name, also the name of the hdf5 file 92 | one_hot: whether to convert labels from dense to one_hot 93 | dtype: data type of features 94 | Return: 95 | None 96 | """ 97 | # open h5 file 98 | self.h5_fn = os.path.join(h5_dir, target + ".h5") 99 | self.h5 = h5py.File(self.h5_fn, "r") 100 | # read ids 101 | self.ids = self.h5["chembl_id"].value 102 | # read fingerprints as the feature (only ap is used; mg and tt are commented out) 103 | ap = sparse.csr_matrix((self.h5["ap"]["data"], self.h5["ap"]["indices"], self.h5["ap"]["indptr"]), shape=[len(self.h5["ap"]["indptr"]) - 1, vec_len]) 104 | #mg = sparse.csr_matrix((self.h5["mg"]["data"], self.h5["mg"]["indices"], self.h5["mg"]["indptr"]), shape=[len(self.h5["mg"]["indptr"]) - 1, vec_len]) 105 | #tt = sparse.csr_matrix((self.h5["tt"]["data"], self.h5["tt"]["indices"], self.h5["tt"]["indptr"]), shape=[len(self.h5["tt"]["indptr"]) - 1, vec_len]) 106 | #self.features = sparse.hstack([ap, mg, tt]).toarray() 107 | self.features = ap.toarray() 108 | # label 109 | self.labels = self.h5["label"].value 110 | if one_hot == True: 111 | self.labels = dense_to_one_hot(self.labels) 112 | # year 113 | if "year" in self.h5.keys(): 114 | self.years = self.h5["year"].value 115 | else: 116 | self.years = None 117 | # close h5 file 118 | self.h5.close() 119 | # dtype 120 | self.dtype = dtype 121 | # pre_process 122 | #self.features = np.log10(1.0 + self.features).astype(self.dtype) 123 | self.features = np.clip(self.features, 0, 1).astype(self.dtype) 124 | # 125 | Dataset.__init__(self, self.features.shape[0]) 126 | 127 | 128 | def next_train_batch(self, batch_size): 129 | """Generate the next batch of samples 130 | Args: 131 | batch_size: the number of samples in the batch 132 | Return: 133 | A tuple of features and labels of the samples in the batch 134 | """ 135 | perm = self.generate_perm_for_train_batch(batch_size) 136 | return self.features[perm], self.labels[perm] 137 | 138 | 139 | class NegDataset(Dataset): 140 | """Negative dataset class 141 | """ 142 | def __init__(self, target_list, one_hot=True, dtype=np.float32): 143 | """Create a negative dataset shared by all protein kinase targets. 144 | The data is read from a hdf5 file, pubchem_neg_sample.h5. 145 | Note that for each target, these samples have the corresponding labels, 146 | and I use a mask_dict to store these labels, i.e.
mask_dict[target] = labels for target 147 | Args: 148 | target_list: the list of protein kinase targets 149 | one_hot: whether to convert labels from dense to one_hot 150 | dtype: data type of features 151 | Return: 152 | None 153 | """ 154 | # open h5 file 155 | self.h5_fn = os.path.join(h5_dir, "pubchem_neg_sample.h5") 156 | self.h5 = h5py.File(self.h5_fn, "r") 157 | # read ids 158 | self.ids = self.h5["chembl_id"].value 159 | # read fingerprints as the feature (only ap is used; mg and tt are commented out) 160 | ap = sparse.csr_matrix((self.h5["ap"]["data"], self.h5["ap"]["indices"], self.h5["ap"]["indptr"]), shape=[len(self.h5["ap"]["indptr"]) - 1, vec_len]) 161 | #mg = sparse.csr_matrix((self.h5["mg"]["data"], self.h5["mg"]["indices"], self.h5["mg"]["indptr"]), shape=[len(self.h5["mg"]["indptr"]) - 1, vec_len]) 162 | #tt = sparse.csr_matrix((self.h5["tt"]["data"], self.h5["tt"]["indices"], self.h5["tt"]["indptr"]), shape=[len(self.h5["tt"]["indptr"]) - 1, vec_len]) 163 | #self.features = sparse.hstack([ap, mg, tt]).toarray() 164 | self.features = ap.toarray() 165 | # label (mask) 166 | self.mask_dict = {} 167 | for target in target_list: 168 | #mask = self.h5["mask"][target].value 169 | mask = self.h5["cliped_mask"][target].value 170 | if one_hot == True: 171 | self.mask_dict[target] = dense_to_one_hot(mask) 172 | else: 173 | self.mask_dict[target] = mask 174 | # close h5 file 175 | self.h5.close() 176 | # dtype 177 | self.dtype = dtype 178 | # pre_process 179 | #self.features = np.log10(1.0 + self.features).astype(self.dtype) 180 | self.features = np.clip(self.features, 0, 1).astype(self.dtype) 181 | # 182 | Dataset.__init__(self, self.features.shape[0]) 183 | 184 | def next_train_batch(self, target, batch_size): 185 | """Generate the next batch of samples 186 | Args: 187 | target: the target whose mask supplies the labels; batch_size: the number of samples in the batch 188 | Return: 189 | A tuple of features and labels of the samples in the batch 190 | """ 191 | perm = self.generate_perm_for_train_batch(batch_size) 192 | return self.features[perm], self.mask_dict[target][perm] 193 | 194 | 195 | class Datasets(object): 196 | """dataset class, contains several positive datasets and one negative dataset.
197 | """ 198 | def __init__(self, target_list, one_hot=True): 199 | """ 200 | Args: 201 | target_list: the protein kinase targets' list 202 | one_hot: whether to convert labels from dense to one_hot 203 | return: 204 | None 205 | """ 206 | # read neg dataset 207 | self.neg = NegDataset(target_list, one_hot=one_hot) 208 | # read pos datasets 209 | self.pos = {} 210 | for target in target_list: 211 | self.pos[target] = PosDataset(target, one_hot=one_hot) 212 | 213 | def next_train_batch(self, target, pos_batch_size, neg_batch_size): 214 | """Generate the next batch of samples 215 | Args: 216 | target: the positive target name 217 | pos_batch_size: the number of samples in the batch from positive target dataset 218 | neg_batch_size: the number of samples in the batch from negative target dataset 219 | Return: 220 | A tuple of features and labels of the samples in the batch 221 | """ 222 | pos_feature_batch, pos_label_batch = self.pos[target].next_train_batch(pos_batch_size) 223 | neg_feature_batch, neg_label_batch = self.neg.next_train_batch(target, neg_batch_size) 224 | return np.vstack([pos_feature_batch, neg_feature_batch]), np.vstack([pos_label_batch, neg_label_batch]) 225 | 226 | 227 | def test_dataset(): 228 | """A simple test 229 | """ 230 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 231 | d = Datasets(target_list) 232 | print("test for batching") 233 | print("batch_num target feature_min feature_max label_min label_max") 234 | for step in range(2 * 500): 235 | for target in target_list: 236 | compds_batch, labels_batch = d.next_train_batch(target, 128, 128) 237 | if np.isnan(compds_batch).sum() > 0: 238 | print("warning: nan in feature"), 239 | print("%9d %10s %11.2f %11.2f %9.2f %9.2f" % (step, target, compds_batch.min(), compds_batch.max(), labels_batch.min(), labels_batch.max())) 240 | if (step % 500) == 0: 241 | print("%9d %10s %11.2f %11.2f %9.2f %9.2f" % (step, target, compds_batch.min(), compds_batch.max(), labels_batch.min(), labels_batch.max())) 242 | 243 | # from data_files/fp_2_code.py 244 | def read_fp(filename, dtype=int): 245 | """ read fingerprint from file 246 | Args: 247 | filename: 248 | Return: 249 | chembl_id_list: , a list of str 250 | fps_list: , a list of dict. 251 | """ 252 | chembl_id_list = [] 253 | fps_list = [] 254 | infile = open(filename, "r") 255 | line_num = 0 256 | for line in infile: 257 | line_num += 1 258 | chembl_id = line.split("\t")[0].strip() 259 | fps_str = line.split("\t")[1].strip() 260 | fps = {} 261 | fps_str = fps_str[1:-1].split(",") 262 | for fp in fps_str: 263 | if ":" in fp: 264 | k, v = fp.split(":") 265 | k = dtype(k.strip()) 266 | v = dtype(v.strip()) 267 | assert k not in fps.keys(), ("error in fp_file %s at line %d: dict's keys duplicated" % (filename, line_num)) 268 | fps[k] = v 269 | chembl_id_list.append(chembl_id) 270 | fps_list.append(fps) 271 | infile.close() 272 | return chembl_id_list, fps_list 273 | 274 | 275 | class Dataset_reg(object): 276 | def __init__(self, target, train_year_up_limit = 2013): 277 | """ 278 | """ 279 | fp_dir = "../data_files/fp_files" 280 | # all apfps that were picked out. 
281 | apfp_picked_fn = os.path.join(fp_dir, target + "_apfp.picked_all") 282 | self.apfp_picked_all = list(np.genfromtxt(apfp_picked_fn, dtype=str)) 283 | self.apfp_picked_all.sort() 284 | # read response 285 | response_df = pd.read_csv(os.path.join(fp_dir, target + ".response"), delimiter="\t", names=["CHEMBL_ID", "YEAR", "LABEL", "TYPE", "RELATION", "VALUE"], index_col=0) 286 | # read apfp as features 287 | apfp_fn = os.path.join(fp_dir, target + ".apfp") 288 | id_list, apfps_list = read_fp(apfp_fn, dtype=str) 289 | features_df = pd.DataFrame(index=id_list, data=apfps_list, columns=self.apfp_picked_all, dtype=float) 290 | # merge response and features 291 | df = pd.concat([response_df, features_df], axis=1) 292 | # pick out records with explicit values 293 | df = df[df["RELATION"] == "="] 294 | df = df[["YEAR", "VALUE"] + self.apfp_picked_all] 295 | # remove duplicates, keep the mean "VALUE" and mean "YEAR". 296 | df.reset_index(drop=False, inplace=True) 297 | df = df.fillna(0).groupby(by=["CHEMBL_ID"]).mean() 298 | # apply log to "VALUE" and the features 299 | df["LOG_VALUE"] = np.log(df["VALUE"]) 300 | df[self.apfp_picked_all] = np.log(1 + df[self.apfp_picked_all]) 301 | 302 | self.df = df 303 | 304 | # batch related 305 | mask = self.df["YEAR"] <= train_year_up_limit 306 | self.tr_ids = self.df.index[mask].values 307 | self.te_ids = self.df.index[~mask].values 308 | 309 | self.tr_size = self.tr_ids.shape[0] 310 | self.tr_begin = 0 311 | self.tr_end = 0 312 | 313 | 314 | def next_batch(self, batch_size): 315 | """ 316 | """ 317 | self.tr_begin = self.tr_end 318 | self.tr_end += batch_size 319 | if self.tr_end > self.tr_size: 320 | random.shuffle(self.tr_ids) 321 | self.tr_begin = 0 322 | self.tr_end = batch_size 323 | batch = self.df.ix[self.tr_ids[self.tr_begin: self.tr_end]] 324 | return batch[self.apfp_picked_all].values, batch["LOG_VALUE"].values 325 | 326 | def test_batch(self): 327 | batch = self.df.ix[self.te_ids] 328 | return batch[self.apfp_picked_all].values, batch["LOG_VALUE"].values 329 | 330 | def train_batch(self): 331 | batch = self.df.ix[self.tr_ids] 332 | return batch[self.apfp_picked_all].values, batch["LOG_VALUE"].values 333 | 334 | 335 | 336 | if __name__ == "__main__": 337 | target_list = ["cdk2", "egfr_erbB1", "gsk3b", "hgfr", "map_k_p38a", "tpk_lck", "tpk_src", "vegfr2"] 338 | test_dataset() 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | -------------------------------------------------------------------------------- /data_files/chembl_input.py: -------------------------------------------------------------------------------- 1 | # Author: xiaotaw@qq.com (Any bug report is welcome) 2 | # Time Created: Nov 2016 3 | # Time Last Updated: Dec 2016 4 | # Addr: Shenzhen, China 5 | # Description: dataset classes and input utilities for the chembl data 6 | 7 | import os 8 | import getpass 9 | import numpy as np 10 | import pandas as pd 11 | from scipy import sparse 12 | from collections import defaultdict 13 | 14 | data_dir = "/home/%s/Documents/chembl/data_files/" % getpass.getuser() 15 | fp_dir = os.path.join(data_dir, "fp_files") 16 | mask_dir = os.path.join(data_dir, "mask_files") 17 | structure_dir = os.path.join(data_dir, "structure_files") 18 | 19 | # the newly picked out 15 targets, including 9 targets from 5 big groups and 6 targets from others.
20 | target_list = ["CHEMBL279", "CHEMBL203", # Protein Kinases 21 | "CHEMBL217", "CHEMBL253", # GPCRs (Family A) 22 | "CHEMBL235", "CHEMBL206", # Nuclear Hormone Receptors 23 | "CHEMBL240", "CHEMBL4296", # Voltage Gated Ion Channels 24 | "CHEMBL4805", # Ligand Gated Ion Channels 25 | "CHEMBL204", "CHEMBL244", "CHEMBL4822", "CHEMBL340", "CHEMBL205", "CHEMBL4005" # Others 26 | ] 27 | 28 | 29 | def dense_to_one_hot(labels_dense, num_classes=2, dtype=np.int): 30 | """Convert class labels from scalars to one-hot vectors. 31 | Args: 32 | labels_dense: dense label 33 | num_classes: the number of classes in one hot label 34 | dtype: data type 35 | Return: 36 | labels_one_hot: one hot label 37 | """ 38 | num_labels = labels_dense.shape[0] 39 | index_offset = np.arange(num_labels) * num_classes 40 | labels_one_hot = np.zeros((num_labels, num_classes)) 41 | labels_one_hot.flat[index_offset + labels_dense.ravel().astype(dtype)] = 1 42 | return labels_one_hot 43 | 44 | 45 | def sparse_features(fps_list, target_columns_dict, num_features, is_log=True): 46 | """construct a sparse matrix (csr_matrix) for features according to target_columns_dict. 47 | Args: 48 | fps_list: a list of apfps for the molecules 49 | target_columns_dict, num_features: define the feature columns; is_log: flag whether to apply np.log to the data, default is True 50 | Return: 51 | features: the sparse matrix of features 52 | """ 53 | data = [] 54 | indices = [] 55 | indptr = [0] 56 | for fps_str in fps_list: 57 | n = indptr[-1] 58 | for fp in fps_str[1:-1].split(","): 59 | if ":" in fp: 60 | k, v = fp.split(":") 61 | indices.append(target_columns_dict[int(k)]) 62 | data.append(int(v)) 63 | n += 1 64 | indptr.append(n) 65 | data = np.array(data) 66 | if is_log: 67 | data = np.log(data).astype(np.float32) 68 | # here we add one to num_features, because any apfp not found in target_apfp_picked will be mapped 69 | # to the last column of the features matrix, though the last column will not be used ultimately. 70 | features = sparse.csr_matrix((data, indices, indptr), shape=(len(fps_list), num_features + 1)) 71 | return features 72 | 73 | 74 | class DatasetBase(object): 75 | def __init__(self, target): 76 | # read count and the apfps that were picked out 77 | counts = np.genfromtxt(mask_dir + "/%s_apfp.count" % target, delimiter="\t", dtype=int) 78 | self.target_apfp_picked = counts[counts[:, 1] > 10][:, 0] 79 | self.target_apfp_picked.sort() 80 | self.num_features = len(self.target_apfp_picked) 81 | # columns and sparse features 82 | # here we use a defaultdict, where any apfp not found in target_apfp_picked will be mapped 83 | # to the last column of the features matrix, though the last column will not be used ultimately.
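# note: this mirrors the columns_dict trick in chembl_cal_mask.py: every picked
# apfp gets a column index below, any unknown apfp falls through to column
# num_features, and every caller discards that overflow column with a [:, :-1]
# slice.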
84 | self.target_columns_dict = defaultdict(lambda : self.num_features) 85 | for i, apfp in enumerate(self.target_apfp_picked): 86 | self.target_columns_dict[apfp] = i 87 | 88 | def batch_generator_base(self, size, batch_size): 89 | begin = 0 90 | end = 0 91 | while True: 92 | begin = end 93 | if begin >= size: 94 | raise StopIteration() 95 | end += batch_size 96 | if end > size: 97 | end = size 98 | yield begin, end 99 | 100 | 101 | class DatasetTarget(DatasetBase): 102 | def __init__(self, target, year_split=2014): 103 | DatasetBase.__init__(self, target) 104 | # read chembl id and apfp 105 | self.chembl_id = [] 106 | self.chembl_apfp = {} 107 | f = open(fp_dir + "/chembl.apfp", "r") 108 | for line in f: 109 | id_, fps_str = line.split("\t") 110 | id_ = id_.strip() 111 | fps_str = fps_str.strip() 112 | self.chembl_id.append(id_) 113 | self.chembl_apfp[id_] = fps_str 114 | f.close() 115 | # read top 79 targets' label data, and get the specific target's label data 116 | clf_label_79 = pd.read_csv(structure_dir + "/chembl_top79.label", usecols=[0, 2, 3, 4], delimiter="\t") 117 | self.target_clf_label = clf_label_79[clf_label_79["TARGET_CHEMBLID"] == target] 118 | # remove compounds whose apfp cannot be calculated 119 | m = self.target_clf_label["CMPD_CHEMBLID"].isin(self.chembl_id) 120 | self.target_clf_label = self.target_clf_label[m.values] 121 | # time split 122 | time_mask = self.target_clf_label["YEAR"] > year_split 123 | time_split_train = self.target_clf_label[~time_mask] 124 | time_split_test = self.target_clf_label[time_mask] 125 | # ids 126 | self.target_ids_train = time_split_train["CMPD_CHEMBLID"].values 127 | self.target_ids_test = time_split_test["CMPD_CHEMBLID"].values 128 | # features 129 | self.target_features_train = sparse_features([self.chembl_apfp[k] for k in self.target_ids_train], self.target_columns_dict, self.num_features)[:, :-1] 130 | self.target_features_test = sparse_features([self.chembl_apfp[k] for k in self.target_ids_test], self.target_columns_dict, self.num_features)[:, :-1] 131 | # labels 132 | self.target_labels_train = (time_split_train["CLF_LABEL"] > 0).astype(int).values 133 | self.target_labels_test = (time_split_test["CLF_LABEL"] > 0).astype(int).values 134 | 135 | 136 | class DatasetCNS(DatasetTarget): 137 | def __init__(self, target, year_split=2014): 138 | DatasetTarget.__init__(self, target, year_split=year_split) 139 | # read mask 140 | self.cns_mask = pd.Series.from_csv(mask_dir + "/%s_cns.mask" % target, header=None, sep="\t") 141 | # features 142 | self.cns_features = sparse_features([self.chembl_apfp[k] for k in self.chembl_id], self.target_columns_dict, self.num_features)[:, :-1] 143 | # exclude the test-set compounds from the cns train split 144 | m = self.cns_mask.index.isin(self.target_ids_test) 145 | self.cns_features_train = self.cns_features[~m] 146 | self.cns_mask_train = self.cns_mask[~m] 147 | 148 | def batch_generator_cns(self, batch_size): 149 | for begin, end in self.batch_generator_base(self.cns_features.shape[0], batch_size): 150 | ids = self.chembl_id[begin: end] 151 | features = self.cns_features[begin: end].toarray() 152 | mask = self.cns_mask[begin: end].values 153 | yield ids, features, mask 154 | 155 | 156 | class DatasetPNS(DatasetBase): 157 | def __init__(self, target): 158 | DatasetBase.__init__(self, target) 159 | # read pns apfp 160 | self.pns_id = [] 161 | self.pns_apfp = {} 162 | f = open(fp_dir + "/pubchem_neg_sample.apfp", "r") 163 | for line in f: 164 | id_, fps_str = line.split("\t") 165 | id_ = id_.strip() 166 | fps_str = fps_str.strip() 167 |
self.pns_id.append(id_) 168 | self.pns_apfp[id_] = fps_str 169 | f.close() 170 | # read mask 171 | self.pns_mask = pd.Series.from_csv(mask_dir + "/%s_pns.mask" % target, header=None, sep="\t") 172 | # features 173 | self.pns_features = sparse_features([self.pns_apfp[k] for k in self.pns_id], self.target_columns_dict, self.num_features)[:, :-1] 174 | 175 | def batch_generator_pns(self, batch_size): 176 | for begin, end in self.batch_generator_base(self.pns_features.shape[0], batch_size): 177 | ids = self.pns_id[begin: end] 178 | features = self.pns_features[begin: end].toarray() 179 | mask = self.pns_mask[begin: end].values 180 | yield ids, features, mask 181 | 182 | 183 | class Dataset(DatasetCNS, DatasetPNS): 184 | """Base dataset class for chembl inhibitors 185 | """ 186 | def __init__(self, target, one_hot=True, is_shuffle_train=True, train_pos_multiply=0): 187 | """Constructor, create a dataset container. 188 | Args: 189 | target: the chemblid of the target, e.g. "CHEMBL203". 190 | one_hot: flag whether to create one_hot labels, default is True. 191 | is_shuffle_train: flag whether to shuffle the train samples when the dataset is created. 192 | train_pos_multiply: how many extra times the cns train positives are appended to the train set. 193 | (note: the time split year is fixed at 2014 here; a molecule with year > 2014 goes into the test data, 194 | otherwise it goes into the train data.) 195 | Return: 196 | None 197 | """ 198 | DatasetCNS.__init__(self, target, year_split=2014) 199 | DatasetPNS.__init__(self, target) 200 | # cns train pos 201 | self.cns_features_train_pos = self.cns_features_train[self.cns_mask_train.values] 202 | self.cns_mask_train_pos = self.cns_mask_train[self.cns_mask_train.values] 203 | # train set: if train_pos_multiply > 0, cns_train_pos is appended an extra train_pos_multiply times. 204 | tf_list = [self.cns_features_train, self.pns_features] 205 | tl_list = [self.cns_mask_train, self.pns_mask] 206 | for _ in range(train_pos_multiply): 207 | tf_list.append(self.cns_features_train_pos) 208 | tl_list.append(self.cns_mask_train_pos) 209 | self.train_features = sparse.vstack(tf_list) 210 | self.train_labels = np.hstack(tl_list).astype(int) 211 | # test 212 | self.test_features = self.target_features_test 213 | self.test_labels = self.target_labels_test 214 | # one_hot 215 | if one_hot: 216 | self.train_labels_one_hot = dense_to_one_hot(self.train_labels) 217 | self.test_labels_one_hot = dense_to_one_hot(self.test_labels) 218 | # batch related 219 | self.train_size = self.train_features.shape[0] # (954049, 9412) 220 | self.train_perm = np.arange(self.train_size) 221 | if is_shuffle_train: 222 | np.random.shuffle(self.train_perm) 223 | self.train_begin = 0 224 | self.train_end = 0 225 | 226 | 227 | def generate_perm_for_train_batch(self, batch_size): 228 | """Create the permutation for a batch of train samples 229 | Args: 230 | batch_size: the number of samples in the batch 231 | Return: 232 | perm: the permutation of samples which form a batch 233 | """ 234 | self.train_begin = self.train_end 235 | self.train_end += batch_size 236 | if self.train_end > self.train_size: 237 | np.random.shuffle(self.train_perm) 238 | self.train_begin = 0 239 | self.train_end = batch_size 240 | perm = self.train_perm[self.train_begin: self.train_end] 241 | return perm 242 | 243 | def generate_train_batch(self, batch_size): 244 | perm = self.generate_perm_for_train_batch(batch_size) 245 | return self.train_features[perm].toarray().astype(np.float32), self.train_labels_one_hot[perm] 246 | 247 | def reset_begin_end(self): 248 | self.train_begin = 0 249 | self.train_end
= 0 250 | 251 | def generate_train_batch_once(self, batch_size): 252 | self.train_begin = self.train_end 253 | self.train_end += batch_size 254 | if self.train_end > self.train_size: 255 | self.train_end = self.train_size 256 | perm = self.train_perm[self.train_begin: self.train_end] 257 | return self.train_features[perm].toarray().astype(np.float32), self.train_labels_one_hot[perm] 258 | 259 | 260 | 261 | # dataset for virtual screening (vs) 262 | class DatasetVS(DatasetBase): 263 | def __init__(self, target): 264 | DatasetBase.__init__(self, target) 265 | 266 | def reset(self, fp_fn): 267 | # read pubchem ids and apfps 268 | self.pubchem_id = [] 269 | self.pubchem_apfp = {} 270 | f = open(fp_fn, "r") 271 | for line in f: 272 | id_, fps_str = line.split("\t") 273 | id_ = id_.strip() 274 | fps_str = fps_str.strip() 275 | self.pubchem_id.append(id_) 276 | self.pubchem_apfp[id_] = fps_str 277 | f.close() 278 | # generate features 279 | self.features = sparse_features([self.pubchem_apfp[k] for k in self.pubchem_id], self.target_columns_dict, self.num_features)[:, :-1] 280 | self.features_dense = self.features.toarray() 281 | 282 | 283 | class DatasetChemDiv(DatasetBase): 284 | def __init__(self, target): 285 | DatasetBase.__init__(self, target) 286 | # read ids and apfps 287 | ChemDiv_dir = "/raid/xiaotaw/ChemDiv" 288 | fn_list = ["DC01_350000.apfp", "DC02_350000.apfp", "DC03_222773.apfp", "DC_saltdata_not-available_124145.apfp", "IC_non-excl_82693.apfp", "NC_340320.apfp"] 289 | self.chemdiv_ids = [] 290 | self.chemdiv_apfps = {} 291 | for fn in fn_list: 292 | f = open(ChemDiv_dir + "/" + fn, "r") 293 | for line in f: 294 | id_, fps_str = line.split("\t") 295 | id_ = id_.strip() 296 | fps_str = fps_str.strip() 297 | self.chemdiv_ids.append(id_) 298 | self.chemdiv_apfps[id_] = fps_str 299 | f.close() 300 | # batch related 301 | self.begin = 0 302 | self.end = 0 303 | self.size = len(self.chemdiv_ids) 304 | 305 | def generate_batch(self, batch_size): 306 | self.begin = self.end 307 | if self.begin >= self.size: 308 | raise StopIteration() # all samples have been consumed 309 | self.end += batch_size 310 | if self.end > self.size: 311 | self.end = self.size 312 | ids = self.chemdiv_ids[self.begin: self.end] 313 | apfp_list = [self.chemdiv_apfps[k] for k in ids] 314 | features = sparse_features(apfp_list, self.target_columns_dict, self.num_features)[:, :-1] 315 | return ids, features 316 | 317 | def batch_generator_chemdiv(self, batch_size): 318 | for begin, end in self.batch_generator_base(self.size, batch_size): 319 | ids = self.chemdiv_ids[begin: end] 320 | apfp_list = [self.chemdiv_apfps[k] for k in ids] 321 | features = sparse_features(apfp_list, self.target_columns_dict, self.num_features)[:, :-1].toarray() 322 | yield ids, features 323 | 324 | def compute_performance(label, prediction): 325 | """Compute sensitivity(SEN), specificity(SPE), accuracy(ACC) and matthews correlation coefficient(MCC) from dense 0/1 label and prediction vectors. 326 | """ 327 | assert label.shape[0] == prediction.shape[0], "label number should be equal to prediction number" 328 | N = label.shape[0] 329 | APP = sum(prediction) 330 | ATP = sum(label) 331 | TP = sum(prediction * label) 332 | FP = APP - TP 333 | FN = ATP - TP 334 | TN = N - TP - FP - FN 335 | SEN = float(TP) / (ATP) if ATP != 0 else np.nan 336 | SPE = float(TN) / (N - ATP) if N != ATP else np.nan # guard against a label set without negatives 337 | ACC = float(TP + TN) / N 338 | MCC = (TP * TN - FP * FN) / (np.sqrt(long(N - APP) * long(N - ATP) * APP * ATP)) if not (N - APP) * (N - ATP) * APP * ATP == 0 else 0.0 # long() avoids integer overflow (Python 2) 339 | return TP, TN, FP, FN, SEN, SPE, ACC, MCC 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 
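A minimal usage sketch of the classes above, for orientation (not part of the original chembl_input.py): it assumes a scikit-learn-style classifier as a stand-in for the repo's models, and that labels and predictions are the dense 0/1 vectors compute_performance expects; the model choice and variable names are illustrative.

# usage sketch (illustrative only; Python 2, matching the repo)
import numpy as np
from sklearn.ensemble import RandomForestClassifier

import chembl_input as ci

# assemble train/test data for one target, appending the positives two extra times
d = ci.Dataset("CHEMBL203", one_hot=False, is_shuffle_train=True, train_pos_multiply=2)

clf = RandomForestClassifier(n_estimators=100)  # hypothetical model choice
clf.fit(d.train_features, d.train_labels)       # sparse csr features are accepted here

pred = clf.predict(d.test_features)
TP, TN, FP, FN, SEN, SPE, ACC, MCC = ci.compute_performance(d.test_labels, pred)
print "SEN=%.3f SPE=%.3f ACC=%.3f MCC=%.3f" % (SEN, SPE, ACC, MCC)

# for the DNN path, dense float32 minibatches with one-hot labels:
# d = ci.Dataset("CHEMBL203", one_hot=True)
# batch_features, batch_labels = d.generate_train_batch(128)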
-------------------------------------------------------------------------------- /data_files/3_chembl_analyse_fp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from scipy import sparse\n", 13 | "from collections import Counter\n", 14 | "from collections import defaultdict\n", 15 | "from matplotlib import pyplot as plt\n", 16 | "%matplotlib inline" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## calculate counts for apfp" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "pns_apfp = pd.Series.from_csv(\"fp_files/pns_apfp.csv\", sep=\"\\t\", header=0) \n", 33 | "\n", 34 | "pns_apfp_counter = Counter()\n", 35 | "for apfp_str in pns_apfp:\n", 36 | "    apfp = json.loads(apfp_str)\n", 37 | "    pns_apfp_counter.update(apfp.keys())\n", 38 | "    \n", 39 | "pns_apfp_count = pd.Series(pns_apfp_counter)\n", 40 | "pns_apfp_count.index.name = \"APFP\"\n", 41 | "pns_apfp_count.name = \"COUNT\"\n", 42 | "pns_apfp_count.to_csv(\"fp_files/pns_apfp_count.csv\", header=True)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "cancer_approved_target = [\"CHEMBL279\", \"CHEMBL203\", \"CHEMBL333\", \"CHEMBL325\", \"CHEMBL267\", \"CHEMBL2842\"]\n", 63 | "cancer_clinical_target = [\"CHEMBL340\", \"CHEMBL4005\", \"CHEMBL332\"]\n", 64 | "target_list = cancer_approved_target + cancer_clinical_target" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "inh_apfp = pd.Series.from_csv(\"fp_files/inhibitor_apfp.csv\", sep=\"\\t\", header=0)\n", 76 | "\n", 77 | "for target in target_list:\n", 78 | "    clf_label = pd.read_csv(\"chembl_source/%s_clf_label.csv\" % target)\n", 79 | "    target_apfp = inh_apfp.loc[clf_label[\"CMPD_CHEMBLID\"].values]\n", 80 | "    target_apfp_counter = Counter()\n", 81 | "    for apfp_str in target_apfp:\n", 82 | "        apfp = json.loads(apfp_str)\n", 83 | "        target_apfp_counter.update(apfp.keys())\n", 84 | "    target_apfp_count = pd.Series(target_apfp_counter)\n", 85 | "    target_apfp_count.index.name = \"APFP\"\n", 86 | "    target_apfp_count.name = \"COUNT\"\n", 87 | "    target_apfp_count.to_csv(\"fp_files/%s_apfp_count.csv\" % target, header=True)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "source": [ 96 | "## pick a threshold for the minimum count of apfp" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "cancer_approved_target = [\"CHEMBL279\", \"CHEMBL203\", \"CHEMBL333\", \"CHEMBL325\", \"CHEMBL267\", \"CHEMBL2842\"]\n", 108 | "cancer_clinical_target = [\"CHEMBL340\", \"CHEMBL4005\", \"CHEMBL332\"]\n", 109 | "target_list = cancer_approved_target + cancer_clinical_target" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | 
"metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "inh_apfp = pd.Series.from_csv(\"fp_files/inhibitor_apfp.csv\", sep=\"\\t\", header=0)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 7, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "pns_count = pd.Series.from_csv(\"fp_files/pns_apfp_count.csv\", header=0)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 8, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "def sparse_features(fps_series, target_apfp_picked):\n", 143 | " columns_dict = defaultdict(lambda : len(target_apfp_picked))\n", 144 | " for i, apfp in enumerate(target_apfp_picked):\n", 145 | " columns_dict[apfp] = i\n", 146 | " data = []\n", 147 | " indices = []\n", 148 | " indptr = [0]\n", 149 | " for _, fps in fps_series.iteritems():\n", 150 | " n = indptr[-1]\n", 151 | " for k, v in fps.items():\n", 152 | " indices.append(columns_dict[k])\n", 153 | " data.append(v)\n", 154 | " n += 1\n", 155 | " indptr.append(n)\n", 156 | " a = sparse.csr_matrix((np.array(data), indices, indptr), shape=(len(fps_series), len(target_apfp_picked) + 1))\n", 157 | " return a" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 9, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "target = \"CHEMBL279\"" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 10, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "target_clf_label = pd.read_csv(\"chembl_source/%s_clf_label.csv\" % target)\n", 180 | "target_apfp_str = inh_apfp.loc[target_clf_label[\"CMPD_CHEMBLID\"].values]\n", 181 | "target_apfp = target_apfp_str.apply(json.loads)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 20, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "target_count = pd.Series.from_csv(\"fp_files/%s_apfp_count.csv\" % target, header=0)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": true 200 | }, 201 | "outputs": [], 202 | "source": [] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 22, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "count_threshold = 50" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 24, 218 | "metadata": { 219 | "scrolled": true 220 | }, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "10\n", 227 | "(21160,) 11504\n", 228 | "CHEMBL279 168\n", 229 | "0.9 52006.0\n", 230 | "0.95 11212.0\n", 231 | "0.99 149.0\n", 232 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 233 | "\n", 234 | "\n", 235 | "30\n", 236 | "(21160,) 8803\n", 237 | "CHEMBL279 161\n", 238 | "0.9 51948.0\n", 239 | "0.95 11216.0\n", 240 | "0.99 147.0\n", 241 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 242 | "\n", 243 | "\n", 244 | "50\n", 245 | "(21160,) 7661\n", 246 | "CHEMBL279 178\n", 247 | "0.9 52027.0\n", 248 | "0.95 11238.0\n", 249 | "0.99 147.0\n", 250 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 251 | "\n", 252 | "\n", 253 | "70\n", 254 | "(21160,) 6916\n", 255 | "CHEMBL279 158\n", 256 | "0.9 52269.0\n", 257 | "0.95 11286.0\n", 258 | "0.99 148.0\n", 259 | "('CHEMBL511563', 
1.0) ('CHEMBL502351', -1.0)\n", 260 | "\n", 261 | "\n", 262 | "90\n", 263 | "(21160,) 6363\n", 264 | "CHEMBL279 152\n", 265 | "0.9 52449.0\n", 266 | "0.95 11347.0\n", 267 | "0.99 148.0\n", 268 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 269 | "\n", 270 | "\n", 271 | "110\n", 272 | "(21160,) 5927\n", 273 | "CHEMBL279 148\n", 274 | "0.9 52492.0\n", 275 | "0.95 11352.0\n", 276 | "0.99 149.0\n", 277 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 278 | "\n", 279 | "\n", 280 | "130\n", 281 | "(21160,) 5583\n", 282 | "CHEMBL279 145\n", 283 | "0.9 52548.0\n", 284 | "0.95 11373.0\n", 285 | "0.99 152.0\n", 286 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 287 | "\n", 288 | "\n", 289 | "150\n", 290 | "(21160,) 5310\n", 291 | "CHEMBL279 141\n", 292 | "0.9 52617.0\n", 293 | "0.95 11384.0\n", 294 | "0.99 152.0\n", 295 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 296 | "\n", 297 | "\n", 298 | "170\n", 299 | "(21160,) 5093\n", 300 | "CHEMBL279 126\n", 301 | "0.9 52722.0\n", 302 | "0.95 11433.0\n", 303 | "0.99 152.0\n", 304 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 305 | "\n", 306 | "\n", 307 | "190\n", 308 | "(21160,) 4893\n", 309 | "CHEMBL279 117\n", 310 | "0.9 52875.0\n", 311 | "0.95 11479.0\n", 312 | "0.99 157.0\n", 313 | "('CHEMBL511563', 1.0) ('CHEMBL502351', -1.0)\n", 314 | "\n", 315 | "\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "for count_threshold in range(10, 200, 20):\n", 321 | " print count_threshold\n", 322 | " pns_m = pns_count > count_threshold\n", 323 | " print pns_m.shape, pns_m.sum()\n", 324 | "\n", 325 | " count = target_count.add(pns_count, fill_value=0)\n", 326 | " m = count > count_threshold\n", 327 | " picked = count.loc[m]\n", 328 | " print target, picked.shape[0] - pns_m.sum()\n", 329 | " target_apfp_picked = picked.index.astype(str)\n", 330 | "\n", 331 | " a = sparse_features(target_apfp, target_apfp_picked)\n", 332 | "\n", 333 | " aa = a.toarray()[:, :-1]\n", 334 | "\n", 335 | " b = np.corrcoef(aa)\n", 336 | "\n", 337 | " c = (abs(b) > 0.9).astype(int) - np.eye(a.shape[0], dtype=int)\n", 338 | " print 0.9, c.sum() / 2.0\n", 339 | " c = (abs(b) > 0.95).astype(int) - np.eye(a.shape[0], dtype=int)\n", 340 | " print 0.95, c.sum() / 2.0\n", 341 | " c = (abs(b) > 0.99).astype(int) - np.eye(a.shape[0], dtype=int)\n", 342 | " print 0.99, c.sum() / 2.0\n", 343 | " c = (abs(b) > 0.999999).astype(int) - np.eye(a.shape[0], dtype=int)\n", 344 | " \n", 345 | " id_list = []\n", 346 | " for i, j in zip(*c.nonzero()):\n", 347 | " if i <= j:\n", 348 | " continue\n", 349 | " li = target_clf_label.iloc[i][\"CLF_LABEL\"]\n", 350 | " lj = target_clf_label.iloc[j][\"CLF_LABEL\"]\n", 351 | " if (li>0) != (lj>0):\n", 352 | " idi = target_clf_label.iloc[i][\"CMPD_CHEMBLID\"]\n", 353 | " idj = target_clf_label.iloc[j][\"CMPD_CHEMBLID\"]\n", 354 | " id_list.append(idi)\n", 355 | " id_list.append(idj)\n", 356 | " print (idi, li), (idj, lj)\n", 357 | " print \"\\n\"" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": { 363 | "collapsed": true 364 | }, 365 | "source": [ 366 | "## check molecules' collision " 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 25, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "from rdkit import Chem\n", 378 | "from rdkit.Chem import Draw\n", 379 | "from rdkit.Chem.Draw import IPythonConsole" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 28, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 
| "inh_smi = pd.Series.from_csv(\"structure_files/inhibitor_smiles.csv\", header=0)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 29, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "ms = [Chem.MolFromSmiles(inh_smi.loc[id_]) for id_ in id_list]" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 30, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "data": { 407 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZAAAADICAIAAABJdyC1AAAYwElEQVR4nO3de1xT5/0H8CcIJKIh\nAVQoKAhtABHBC10VfTlUhuKsuilqh1asFa1TbLUW3fwNh/NnbF330q0iirWorDW1rU6nRdRCRbyL\nSjvAeb9bLoIhCITk+/vjuMgPkUtyAjzs8/4rkHP5ql8/OTnnOc+REBEDAOCBTVsXAADQXAgsAOAG\nAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACAGwgsAOAG\nAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACAGwgsAOAG\nAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACAGwgsAOAG\nAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACAGwgsAOAG\nAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACAGwgsAOAG\nAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCS2TFxUwiYUlJT38MDGRlZW1Z\nD3QM6CsBAkt8vXuzTz5hVVVtXQd0LOgrhsCyBoWCTZ367MMQQBToK4bAspJ332UpKUyna+s6oGNB\nXyGwrEIuZzNnsr/+lTHGDIa2rgY6CvQVAstaFixgO3YwrZaVlLCQELZ+PSspaeuagH//5X2FwBJT\nTc2z1w4ObO5cdusWc3ZmI0cytZq98cauOXPm5OTktF2BwCX01TME4omLe+Fbej3t25c+YcIEOzs7\nf3//tWvX3rt3rxVLIyLKzs7esmXL6dOnW3m/YCH0lQkCSzRJSeTr28Dvy8rou+/IaHz644MHD9at\nWxcQEGBrazt16tRWKOzhw4cfffRRnz597Ozsfv7zn0ul0i1btrTCfkEU6Ku6EFiiCQujAwca+P2j\nRySVUmTkmsTExBs3bph+f+LECZVKlZaWZqV6qqurNRpNeHi4jY3NwIEDk5OTy8rKiGj//v0KhSI6\nOrqystJKuwYRoa/qQmCJZtOmF75VXEwbNvx14MCBNjY2I0eOTE1NraioICJvb+/du3eLXsnFixdj\nY2OdnJxcXFzi4+N/+OEH01sGg4GICgsL+/TpM3DgwJs3b4q+dxAX+qouBJY5ioqIMdq48emPffvS\no0dkMDS94qVLlxYvXuzq6iocPHfr1i09PV2sqrRabXJy8qBBgyQSSXh4uEajefLkSd0F9u3b9+qr\nr96+fZuIysvLJ0yY0L1798zMTLEKAAuhr5qEwDJHURH17k19+5LwDyc0VvPp9frq6moikkqlOTk5\nopR07Ngx4aNv0aJFeXl5DS6j1WonT57s7Ox86NAhIjIajQkJCVKpdPPmzaLUABZCXzUJgWWOoiIK\nDqbERPr4Y6KWN5agurqaMXbp0iVRSgoICHjnnXeqqqoaX8xoNKrVant7e7VaLfxm165dXbp0iY2N\nFXod2hD6qkkIrIbp9frS0tIbN278+OOPZ86cycjI2L//p9RU2rCB/vQnOn+egoPp8WMKCKCKCjMb\nq6SkhDF2/fp1Uaq1t7c/fvw4EV28eLG0tLTx5Q8cOKBUKt944w2dTies4u3tPXTo0Pv371teDDQC\nfWUhBFZ9+fn5kZGR9UarOTk5jR37VUAADRlCo0fTyZMUHExEtHYtrVljZmPduHGDMVZcXGx5zYWF\nhYyxkpISInrllVd27NjR5CqXL18OCAjo37+/cIGpuLg4PDzcw8Pj1KlTltcDz0NfWV4PEdlad1iq\nqCoqKioqKrRa7ePHj8vKyoTX9vb248ePl8lkYu1l+fLl9vb2ubm5jo6Ocrm8S5cuDg4O9ZYpLn76\nYsEC9uqrrKLCnB1ptVrGmFwut6hcxhhjBQUFPXr0cHZ2rq6uvn79ur+/f5OrqFSqkydPvvnmmyEh\nIRqNZsSIEf/85z/nzZu3bNmyb7/91t7e3vKqeIG+epH22FeixJ7oCgsLx48fP3jw4L59+3p5eTk5\nOUkkkrplKxQKDw8Pf3//4ODgn/3sZ7W1taLs9/Tp07a2toWFhY0vJpxrEKxfT4yZ80mYk5Njb2/f\n4tUasnbt2uHDhxNRXl6eRCLRarXNXLG2tnbJkiUymezvf/87ER05cqRr165G02DEDgd91SJi9dXR\no0fF6qv2eIRVU1MTERExePDg6OhouVzetWtXhUKhUCiE13K5XKFQmBbW6XRBQUFqtfr3v/+95bte\ntWrVtGnTfH19i4uLZ82a9dlnn7m4uDy/WLdu7MKFp6/j4lhcnDn70mq1onwMMsaE8S+MsYKCgp49\ne3bt2rWZK3bq1GndunUDBgzw8PAQVvf19a33f7jDQF+1lIh9pVKpROkrcQJrzpw5r7/++vjx40XZ\nWnJystFoTE1NlUqljS9ZVlamVCq3bNkyduzYiRMn9u3b15L9njp16uDBg/n5+YyxNWvWFBUVNdhV\nL6LVsub0SX5+fl5e3pQpU0RsrIKCgsmTJwsvmnPcXk90dLTworCw0IzVrQd9xdBX9Vh+kEZEarXa\nzc1NODlnIZ1O5+bmtnHjRiLKysratWvXi5bcvn27n5+fcB9ATEzMa6+9ZuEBfGRk5MyZM4nozp07\nMpns22+/beaKRiMVFRk2bfrnixZ48OBBWlpaTExMz549GWMBAQF6vX7btm39+vWzpGATZ2fnAwcO\nENH06dMXLlxo9nYiIiL++Mc/ilKSKNBXHaOvRo8evXLlSlFKEiew9Hp9SEiI8K9ioT//+c/e3t41\nNTVGo3HAgAHLli1rZKcDBgx47733iKisrMzDw+NjYfiKWU6cOGFnZ3flyhUiWrhwofDVvflOnz5t\nZ2dXd3RvaWmpRqOJjY318fFhjHl6esbGxmo0mp9++klYYMOGD
aGhoWYXbPLw4UPG2LVr14goJCTk\nk08+MXtTnp6ejfxPbn3oq47RV15eXl988YXlJZGIwxouXrxob29/8OBBSzby+PHjbt26JScnE9HX\nX3/t6OjY+NVZ4Z/z2LFjRLR7924HB4d///vf5u06IiLirbfeIqJbt25JpdIjR460dAsrVqzw8vIS\nTkzOnDlTKpXKZLJRo0ap1eqzZ88a6txhUV5evmfPnrCwsNDQ0CaH5DUpKyvLwcHBYDAYjUa5XG5G\n5QKtViuRSC5evGhhPeJCX/HeVzqdzsbG5sKFCxbWIxDzKuGKFSs8PT0fP35s9hbUarWPj09NTY3B\nYAgMDFyxYkWTq7z//vt+fn7CzU2TJk0KCwsz42JEVlaWvb29MHJk3rx5YWFhZhRfXV0dGBi4aNEi\nItq9e3d6enrd29b1ev3x48dXrlw5dOhQW1tbNze3qKgoNze3IUOGWDiBUXJycv/+/Yno9u3bjDGz\nt3bu3LlOnTrVu02sPUBfcd1Xly6VjxjxvxUVOkuKMREzsCorK319ff/ngw/MW728vNzFxWXr1q1E\npNFolErlo2Zc1K2srFSpVMuXLyei+/fvOzs7b2rk9vYXGDVqVGxsLBFdu3bNzs4uOzu7xdUTEdGp\nU6dMn8xEZDQaz549q1arw8PDO3fu7OTkFBUVlZycfPXqVWEBYVhdt27djh49at4eiWjx4sXTpk0j\nooyMDKVSafZ20tLSfHx8zF7detBXXPfVgwdk7p+7ASKPw/ohO9vw0kvNusH8OatWrfL19a2trTUY\nDAEBAQkJCc1cMTMz087O7syZM0S0bds2R0fHW7duNX+/mZmZUqlUmBBj9uzZv/jFL1pe+zNLly5V\nqVQ7d+58++23vby8bGxsBg0atGzZsiNHjjR4lF5bWxsfH29ra7t+/Xrz9jh27Fjh7+rw4cPz5883\nu/I//OEPY8eONXt1q0Jf8dtX4rLCwNFFi6jlXytKS0uVSuVnn31GRJ9//rmzs7MwK1gzzZs3Lygo\nSLjNcsyYMZGRkc1fd9iwYe+88w4RFRYW2traWjjTa1FR0csvvxwYGBgfH5+RkSHMT9SktLS0zp07\nt/RO0fPnzy9YsMDX13fy5Mk1NTXmlvzUlClTFi9ebOFGrAh9xVVfNThVjuWsEFg6HT15Qj/+SJGR\nFBFBw4dTamqTKyUkJPj5+dXW1ur1el9f38TExBbts7y8vFevXsIl+Zs3b8rl8u3btxORwWAoLS29\nfv16Xl5eTk5Oenq6RqNJSUn5y1/+kpiYuHTp0qlTp8pkMuEsw5w5cyz8GCSidevW9erVy4xTnufP\nn/fy8ho6dOiDBw8aX7KkpGTDhg39+/eXSqVTpkxJSkrq3bv38OHDHz582KI96nS61NTU2bNnCz8G\nBQUJp6XbKfQVN31Fqal08KBFU+W8iHVuzdFq6ZVXSJjgQqej0FA6fLiRxUtKShwdHYVbK3fu3Nm9\ne/fm3wRg8s0338hksvz8fCJatmyZSqXq0qVL3RFndnZ2zs7OPj4+wcHBw4YNi4yMjIqKio6Otre3\nz8jIIKL79++bfTFIoNVqu3XrlpSUZN7qP/30U1hYWM+ePYWvIfXU1NQIU9N26tQpPDw8NTW1vLxc\neKu8vHz8+PHNnzXt9OnTc+fOVSgUrq6uS5cura6urq2tlclkWVlZ5lXeStBXZmnFvqK5c0mhIFdX\nOnJEhKlynmedwPr6a3rjjWc/7tlD/5kVX/hounbtWm5ublZW1j/+8Y+dO3dOmzZNpVLp9Xq9Xq9S\nqVavXm3ebrdu3Sp05MSJE3/5y19mZmaeO3fu6tWrRUVFjVz8SkhIcHd3b86J2CZ9+OGHwtUos7eg\n1+vj4uJkMpnwSS64cOGCMDVt9+7d4+Pj//Wvfz2/omnWtEYeBHDz5s2EhASVStW5c+cZM2ZkZGSY\nLoqfO3eOMdbSz9LWhr4yl1X7qqSkZP369VOmbO/UicaNo6+/ppqap/dFWjhVzvOsE1jr11PdgXm5\nuTRkCL3yCsnlJU5Opo8mBweHl156yc/PLygoSBjqVllZmZiYaAp48wg3ml6+fLmZy1dXV/fr12/e\nvHmW7JT+M9gnJSXFwu0QUXJyslQqXbx4cWpqalhYmEQiGTp06LZt25o8c9HgrGl1P0KDg4OTk5NN\nExsZDIb09PSpU6dKpdIFCxZYXrl1oa8sI25fGQyGjIyMadOmSaXSl19+ec2aj+/cefpWbS3l5Ykw\nVc7zrBNYX31F0dHPfty7l6ZMoX376OhR47lzV69eLS4u1uv1pveLiop69Ogh1gmUcePGTZ8+vUWr\nCL0oHMCbTa1WC1ejLNmIyeHDhz09Pd3d3dVqtTBbdjM9P2vaRx99JJPJoqOjjx49ahpMlJeXFxcX\n5+bm5uLiEhcXd/bsWVHKti70lcWs0VdHjhwx9VVuLsXGkpMTbd36NLB0OgoIIE/P9hxYWi35+JDw\nUA2djoYNo6bun9q5c2dLLxs36OTJk3Z2dmacMli6dKm3t7cZ5zgElZW0ZMmnGs1X5q3eoO7du+/Z\ns8eMFevNmlZaWmo6pKqoqGj8gQLtGvpKDNboqydPnuzYsWPy5A8kEhoyhLZsoXv3RJgq53lWmw/r\n0iUaM+bp1ZxPP23OGhMmTLB8HJDpRtOWqqqq6tOnj4XX9UWcSKq4uJgxJnwBWbFixYkTJ1q0ujAM\nRyaTbdu2TfjNsWPHYmNjHR0dvb29W/rp2o6grywjel9dvHhx4cKFTk5Orq6u77//fn6+dZ8M0I4m\n8Lt7965SqWzONKwvcuzYMdONpmbIycmxtbU1eziyuLKzs6VSqfBFQKFQNP8W/7qSkpKkUumMGTMG\nDx5sa2s7fvz4vXv3Wj5iiy/oq7rE7avXXnutlfuqHQUWEW3atMnFxaXJASMvEhERYRpVZJ7f/va3\n/v7+wrekysrK5ORktVq9fPny+fPn/+Y3v5k/P2/ECBo0iFQqUqutNTROkJKS0rdvXyK6d+8eY6zu\no31b5PPPP/f09FyzZs1/8wMm0FcmvPdV+5pxNDY2dvfu3QsXLtRoNC1d9/vvv8/MzLx8+bIlBaxd\nuzYoKGj16tWrVq0yGo2bN29WKpXCvJSurq5eXrW9ezOFgsnlrH9/xhjr3Zt98gmbNYuJN/f3U6Y5\nzwoKCrp06eLp6WnedmxtbRljy5YtE7M43qCvTETsK4lE0vp91b4CSyKRbN68uV+/fl999dWkSZMY\nYw8fPrxx44ZWq3306JFWqxWeFKDVasvLy8vLy7VabURExKJFixhjiYmJMTExXl5elhTQpUuXLVu2\njBkzZuLEiYMGDTp79mwjCxcXM4WCTZrEkpLYe+9ZstsGFBQUBAUFMYunLTZvrsgOBn1lIlZf5efn\nt0lfta/AYox5e3uvXLlywYIFI0aMcHZ2/vLLL999911HR0elUimXy+VyufDQEaVS2atXL7lcrlKp\nGGOZmZnZ2dmffvqp
5QWMHDly+vTps2bN2r17d0VFRWVlpU6nE7qZseD79weVl7PycjZiBAsPZ4yx\nd99lgwez2FjL9/z/FBQUREVFMYunl0VgCdBXArH6qs1m027N75/NZDAYhgwZ8uabbzZ/laFDhwo3\nmoqiuLh43Lhxwt+PTCZzcXHx8fEJCgqaPv2z0aNpyhSKjaWDB58940TcoXFEVFVV1alTJ+F2WQun\nlx00aJAlc0V2JOgrcftqo+k0Wytqd0dYjDEbG5vk5OSQkJBJkyaZHkAgHKhrtdqKigrTcbvwCLkr\nV66cOXMmLS1NrAJcXFy++eYbrVarUChsbGxetJgoT5Fr0JUrV4xGo+lcw6xZs8zbDhHhCMsEfdUB\n+qo9BhZjrF+/fkuWLFm5cuX8+fOF8wumtyQSiVKpFA7ghaczKZXK77//3sKzDPXY2to61bnbo3EO\nDmzuXLZokWh7LygocHd3l8vlOp3u1q1bZnfG3bt3dTqdr6+vaJVxDn0lSl/duXNHp9O1SWBJiKj1\n99ocRqPx/Pnzd+/eFR4YJ3SS8Mjcti7N6lavXv3dd98dPnw4Nzc3JCSkoqKic+fOZmwnO5t98EHp\n8eP1Hxf63wx9ZXlfHTp0aOrUqY8ePRK9wia10yMsxpiNjU1ISEhISEhbF9IG6l579vLyMq+rGGPD\nhrGcHGdRS+Me+opZ3FcFBQV+fn6iltZcL/wiDW0oPz9feNJcaGjoxo0b27oc6CDE6qs2fOAuAqs9\nioyM/Nvf/ibMFTlmzJiWrn7vHpNIWFLS0x8DA1lZmbgFApcs7Kvi/1wOyM/PxxEWPJOQkDB9+vTQ\n0NCUlJTmr6XVspQUFhrKDh9+OlS6qsp6NQJ/zO0rbUpKSmhoaEBAQG1tLWvbwX2tP5ICmumLL754\nfta0Bp08WfrWW9S1K3l40O9+RzduWGV2WugYmtlXRqMxIyMjKipKmJ9PrVbfvXuXiIRz7Q1OT9oK\nEFjt2oULF+rNmlbXw4cP161bFxAQEBg4c+JE2rePhFnerDQ7LXQYjffV9evX4+Pje/bs6eDgEBsb\ne+zYMdNU2g8ePFi6dKm7u3uLHsMjIgRWe1dcXDxq1CgPD4+6z4k6ceLEpEmT7O3t/fz81q5da2o7\ng4HS0+nAAWsNlYYOo8m++vDDD03zW9TW1u7fv/9Xv/qVnZ2dv7+/hXOoWgKBxQG9Xl9v1jSNRjNz\n5kzTo4CJ6O7dylWrqHdvcnSkQ4esMjstdDAN9lVMTEzdvjI9qMLFxSU+Pv4HYbrXtoPA4obpoZh1\nZ0qrrq7+8ssvx44d2727/6hRxtRU0ume3YxGos5OCx1Sg331+PHjelNpm/FIRGtovyPd4Xm5ubkT\nJ0709vbWaDSXLl3avn373r17XVxc5s6dO2PGDHd397YuELhk6qtdu3bl5eVt3rx5//79PXr0iImJ\niY6OFiauaCcQWJy5ffv2r3/9a71en5eXN3r06Lfffvv111+3s7Nr67qAb3X7Kjw8fPbs2RMmTJBK\npW1dV30ILP5UVVXl5eW5urqaPV0kwPOEvurRo4e493uLC4EFANzASHcA4AYCCwC4gcACAG4gsACA\nGwgsAOAGAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACA\nGwgsAOAGAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACA\nGwgsAOAGAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACA\nGwgsAOAGAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACA\nGwgsAOAGAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4AYCCwC4gcACAG4gsACA\nGwgsAOAGAgsAuIHAAgBuILAAgBsILADgBgILALiBwAIAbiCwAIAbCCwA4Mb/AelZiqmuND8TAAAA\nAElFTkSuQmCC\n", 408 | "text/plain": [ 409 | "" 410 | ] 411 | }, 412 | "execution_count": 30, 413 | "metadata": {}, 414 | "output_type": "execute_result" 415 | } 416 | ], 417 | "source": [ 418 | "Draw.MolsToGridImage(ms, molsPerRow=2)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": { 425 | "collapsed": true 426 | }, 427 | "outputs": [], 428 | "source": [] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": { 434 | "collapsed": true 435 | }, 436 | "outputs": [], 437 | "source": [] 438 | } 439 | ], 440 | "metadata": { 441 | "kernelspec": { 442 | "display_name": "Python 2", 443 | "language": "python", 444 | "name": "python2" 445 | }, 446 | "language_info": { 447 | "codemirror_mode": { 448 | "name": "ipython", 449 | "version": 2 450 | }, 451 | "file_extension": ".py", 452 | "mimetype": "text/x-python", 453 | "name": "python", 454 | "nbconvert_exporter": "python", 455 | "pygments_lexer": "ipython2", 456 | "version": "2.7.6" 457 | } 458 | }, 459 | "nbformat": 4, 460 | "nbformat_minor": 1 461 | } 462 | -------------------------------------------------------------------------------- /data_files/3_fingerprint_analyse_additional.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import 
numpy as np" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 6, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 8, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "pns_count = pd.Series.from_csv(\"fp_files/pns_apfp_count.csv\", header=0)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 10, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "(21160,)" 39 | ] 40 | }, 41 | "execution_count": 10, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "pns_count.shape" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 13, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "APFP\n", 59 | "10552354 5\n", 60 | "10552355 16\n", 61 | "10552356 7\n", 62 | "10552357 6\n", 63 | "10552358 2\n", 64 | "Name: COUNT, dtype: int64" 65 | ] 66 | }, 67 | "execution_count": 13, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "pns_count.head()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 11, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "import matplotlib.pyplot as plt" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 19, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "a = pns_count.sort_values(ascending=False)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 28, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYcAAAD8CAYAAACcjGjIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHYJJREFUeJzt3XuQXGed3vHv090zo/t9JBRJRgJU\n9hoSfJnIIkuRjb3IspNaOQlQplKRylEhspgUVOWyJvuHN7BUYJMsiyqstwxWkCgWYbxQVu3KaLXC\nXmoTZGsMRr5bY2FHUiRrpNH9PjO//NHvmPZMd5+WNDM9o/N8qrr69O+85/R7jmbm0bkrIjAzM6tU\naHYHzMxs7HE4mJnZEA4HMzMbwuFgZmZDOBzMzGwIh4OZmQ3hcDAzsyEyw0HS9ZKeq3idlPR5SbMk\nbZe0J73PTO0lab2kLkm7Jd1SMa81qf0eSWsq6rdKej5Ns16SRmZxzcysEZnhEBGvRsRNEXETcCtw\nFvgR8ACwIyKWAjvSZ4C7gKXptQ54CEDSLOBB4DZgGfDgQKCkNp+qmG7lsCydmZldkdJltr8DeD0i\n3pS0CvitVN8IPAX8HrAK2BTlS693SpohaX5quz0iegAkbQdWSnoKmBYRO1N9E3AP8ES9jsyZMycW\nL158md03M8uvZ5999khEtDfS9nLD4V7ge2l4XkQcTMOHgHlpeAGwr2Ka/alWr76/Sr2uxYsX09nZ\neZndNzPLL0lvNtq24QPSklqB3wF+MHhc2koY8Zs0SVonqVNSZ3d390h/nZlZbl3O2Up3AT+PiLfS\n57fS7iLS++FUPwAsqphuYarVqy+sUh8iIh6OiI6I6Ghvb2jLyMzMrsDlhMMn+fUuJYAtwMAZR2uA\nxyvqq9NZS8uBE2n30zZghaSZ6UD0CmBbGndS0vJ0ltLqinmZmVkTNHTMQdJk4KPApyvKXwEelbQW\neBP4RKpvBe4Guiif2XQfQET0SPoSsCu1++LAwWngM8C3gYmUD0TXPRhtZmYjS+P1eQ4dHR3hA9Jm\nZo2T9GxEdDTS1ldIm5nZEA4HMzMbInfhsH7HHv72NZ8Ga2ZWT+7C4aGnXud/dx1pdjfMzMa03IWD\nBP394/MgvJnZaMldOBSkkb+U28xsnMtdOAjoH6en75qZjZbchQMCZ4OZWX25C4eCnyNkZpYpd+Eg\nebeSmVmW3IVDQfJuJTOzDLkLBx+QNjPLlr9w8KmsZmaZchgOMF7vRGtmNlryFw74VFYzsyy5Cwcf\nkDYzy5a7cPCprGZm2XIXDr63kplZttyFA3jLwcwsS+7CoVAAbzqYmdXXUDhImiHpMUmvSHpZ0ock\nzZK0XdKe9D4ztZWk9ZK6JO2WdEvFfNak9nskramo3yrp+TTNemnkboAk5C0HM7MMjW45fB34cUTc\nAHwQeBl4ANgREUuBHekzwF3A0vRaBzwEIGkW8CBwG7AMeHAgUFKbT1VMt/LqFqs2yRsOZmZZMsNB\n0nTgI8AjABFxMSKOA6uAjanZRuCeNLwK2BRlO4EZkuYDdwLbI6InIo4B24GVady0iNgZ5avTNlXM\na9j5VFYzs2yNbDksAbqB/yXpF5K+JWkyMC8iDqY2h4B5aXgBsK9i+v2pVq++v0p9RPjeSmZm2RoJ\nhxJwC/BQRNwMnOHXu5AASP/jH/G/uJLWSeqU1Nnd3X2F8/BuJTOzLI2Ew35gf0Q8nT4/Rjks3kq7\nhEjvh9P4A8CiiukXplq9+sIq9SEi4uGI6IiIjvb29ga6PpQk31vJzCxDZjhExCFgn6TrU+kO4CVg\nCzBwxtEa4PE0vAVYnc5aWg6cSLuftgErJM1MB6JXANvSuJOSlqezlFZXzGvYFfyYUDOzTKUG2/07\n4LuSWoG9wH2Ug+VRSWuBN4FPpLZbgbuBLuBsaktE9Ej6ErArtftiRPSk4c8A3wYmAk+k14jwqaxm\nZtkaCoeIeA7oqDLqjiptA7i/xnw2ABuq1DuBDzTSl6slbzmYmWXK3RXSftiPmVm2/IUDftiPmVmW\n3IVDoeDdSmZmWXIXDj4gbWaWLXfhUPBFcGZmmXIXDkj0Ox3MzOrKXTj4gLSZWbbchUNhxJ4UYWZ2\n7chdOEg+IG1mliV34eB7K5mZZctdOPhUVjOzbLkLB7zlYGaWKXfh4OsczMyy5S4chB/2Y2aWJX/h\n4N1KZmaZchcOBd+y28wsU+7CQcJnK5mZZchdOIB3K5mZZcldOPhJcGZm2XIXDoXynfea3Q0zszEt\nd+Eg8C27zcwyNBQOkt6Q9Lyk5yR1ptosSdsl7UnvM1NdktZL6pK0W9ItFfNZk9rvkbSmon5rmn9X\nmnbE7p1a3q3kdDAzq+dythz+SUTcFBEd6fMDwI6IWArsSJ8B7gKWptc64CEohwnwIHAbsAx4cCBQ\nUptPVUy38oqXKINvvGdmlu1qdiutAjam4Y3APRX1TVG2E5ghaT5wJ7A9Inoi4hiwHViZxk2LiJ1R\nvnR5U8W8RoCfBGdmlqXRcAjgryU9K2ldqs2LiINp+BAwLw0vAPZVTLs/1erV91epj4jyFdJOBzOz\nekoNtvtwRByQNBfYLumVypEREZJG/C9uCqZ1ANddd92VzWM4O2Rmdo1qaMshIg6k98PAjygfM3gr\n7RIivR9OzQ8AiyomX5hq9eoLq9Sr9ePhiOiIiI729vZGuj5EQfIxBzOzDJnhIGmypKkDw8AK4AVg\nCzBwxtEa4PE0vAVYnc5aWg6cSLuftgErJM1MB6JXANvSuJOSlqezlFZXzGvY+fYZZmbZGtmtNA/4\nUTq7tAT8eUT8WNIu4FFJa4E3gU+k9luBu4Eu4CxwH0BE9Ej6ErArtftiRPSk4c8A3wYmAk+k14iQ\nn+dgZpYpMxwiYi/wwSr1o8AdVeoB3F9jXhuADVXqncAHGujvVZP8PAczsyy5vELa2WBmVl/+wsE3\n3jMzy5S7cCj4Ogczs0y5CwffeM/MLFv+wsE33jMzy5S/cMAHpM3MsuQvHHyFtJlZphyGgw9Im5ll\nyV04FOQD0mZmWXIXDi3FAr39/c3uhpnZmJbLcLjY63AwM6snd+FQLPhJcGZmWXIXDqWC6HM6mJnV\nlbtwKDgczMwy5S4cSgX5gLSZWYbchcPAMQdf62BmVlv+wqH8RDvvWjIzqyN/4VBM4eAtBzOzmvIX\nDt5yMDPLlL9wKJTDodfhYGZWU8PhIKko6ReS/jJ9XiLpaUldkr4vqTXV29LnrjR+ccU8vpDqr0q6\ns6K+MtW6JD0wfIs31EA49DsczMxqupwth88BL1d8/irwtYh4H3AMWJvqa4Fjqf611A5JNwL3Au8H\nVgJ/mgKnCHwDuAu4EfhkajsiSt5yMDPL1FA4SFoI/FPgW+mzgNuBx1KTjcA9aXhV+kwaf0dqvwrY\nHBEXIuJXQBewLL26ImJvRFwENqe2I6JYKC+ytxzMzGpr
dMvhT4D/BAxcPTYbOB4RvenzfmBBGl4A\n7ANI40+k9m/XB01Tqz4iimmJveVgZlZbZjhI+mfA4Yh4dhT6k9WXdZI6JXV2d3df0TwGthx8tpKZ\nWW2NbDn8JvA7kt6gvMvnduDrwAxJpdRmIXAgDR8AFgGk8dOBo5X1QdPUqg8REQ9HREdEdLS3tzfQ\n9aEGthwcDmZmtWWGQ0R8ISIWRsRiygeUfxIR/wp4EvhYarYGeDwNb0mfSeN/EuV7VWwB7k1nMy0B\nlgLPALuApensp9b0HVuGZemqGNhy8G4lM7PaStlNavo9YLOkPwR+ATyS6o8A35HUBfRQ/mNPRLwo\n6VHgJaAXuD8i+gAkfRbYBhSBDRHx4lX0q66Bi+D6fYW0mVlNlxUOEfEU8FQa3kv5TKPBbc4DH68x\n/ZeBL1epbwW2Xk5frtTbF8H1ORzMzGrJ3RXSA9c5+JiDmVltuQuHgS0H33jPzKy2/IaDH/hjZlZT\njsOhyR0xMxvDchsOflSomVltuQ0HH5A2M6vN4WBmZkPkLhx8KquZWbbchUPBjwk1M8uUu3AoFR0O\nZmZZchcOA/dW8o33zMxqy184FHzjPTOzLLkNB994z8ysttyGg++tZGZWW+7CoeTHhJqZZcpdOKRs\n8AFpM7M6chcOA1sO/Q4HM7OachcOPpXVzCxb/sIhXQTnLQczs9ryFw7ecjAzy5S/cPCT4MzMMmWG\ng6QJkp6R9EtJL0r6L6m+RNLTkrokfV9Sa6q3pc9dafziinl9IdVflXRnRX1lqnVJemD4F/PXWtJu\npUu+CM7MrKZGthwuALdHxAeBm4CVkpYDXwW+FhHvA44Ba1P7tcCxVP9aaoekG4F7gfcDK4E/lVSU\nVAS+AdwF3Ah8MrUdEZIoFuQnwZmZ1ZEZDlF2On1sSa8AbgceS/WNwD1peFX6TBp/hySl+uaIuBAR\nvwK6gGXp1RUReyPiIrA5tR0xpYJ8+wwzszoaOuaQ/of/HHAY2A68DhyPiN7UZD+wIA0vAPYBpPEn\ngNmV9UHT1KpX68c6SZ2SOru7uxvpelUtxYJ3K5mZ1dFQOEREX0TcBCyk/D/9G0a0V7X78XBEdERE\nR3t7+xXPp1T0biUzs3ou62yliDgOPAl8CJghqZRGLQQOpOEDwCKANH46cLSyPmiaWvURUyp4y8HM\nrJ5GzlZqlzQjDU8EPgq8TDkkPpaarQEeT8Nb0mfS+J9ERKT6velspiXAUuAZYBewNJ391Er5oPWW\n4Vi4WlqKorfPWw5mZrWUspswH9iYzioqAI9GxF9KegnYLOkPgV8Aj6T2jwDfkdQF9FD+Y09EvCjp\nUeAloBe4PyL6ACR9FtgGFIENEfHisC1hFeXdSt5yMDOrJTMcImI3cHOV+l7Kxx8G188DH68xry8D\nX65S3wpsbaC/w6KlUOCStxzMzGrK3RXSkLYcfMzBzKymXIZD+VRWbzmYmdWSy3BoKxW40OtwMDOr\nJafhUORCb1+zu2FmNmblMhwmtBQ4f8lbDmZmteQ0HIqcu+QtBzOzWnIZDpNaS5y76HAwM6slp+FQ\n5OzF3uyGZmY5lc9waCtyxlsOZmY15TMcWkpc7O33/ZXMzGrIZThMbisCcNYHpc3MqsplOExsLYeD\nD0qbmVWXy3CYlMLhzAUflDYzqyaX4TClrQWA0w4HM7OqchkO0yeWw+H42UtN7omZ2diUy3B4+4C0\njzmYmVWVy3CY1Fp+xtG5S96tZGZWTU7DYeCAtLcczMyqyWU4TG5LWw7erWRmVlUuw2FiS9py8P2V\nzMyqygwHSYskPSnpJUkvSvpcqs+StF3SnvQ+M9Ulab2kLkm7Jd1SMa81qf0eSWsq6rdKej5Ns16S\nRmJhBxQLYmJL0QekzcxqaGTLoRf49xFxI7AcuF/SjcADwI6IWArsSJ8B7gKWptc64CEohwnwIHAb\nsAx4cCBQUptPVUy38uoXrb7JbUVOnfeWg5lZNZnhEBEHI+LnafgU8DKwAFgFbEzNNgL3pOFVwKYo\n2wnMkDQfuBPYHhE9EXEM2A6sTOOmRcTOiAhgU8W8RsyMSa0cP3txpL/GzGxcuqxjDpIWAzcDTwPz\nIuJgGnUImJeGFwD7Kibbn2r16vur1EfU5FY/Dc7MrJaGw0HSFOAvgM9HxMnKcel//DHMfavWh3WS\nOiV1dnd3X9W8JviYg5lZTQ2Fg6QWysHw3Yj4YSq/lXYJkd4Pp/oBYFHF5AtTrV59YZX6EBHxcER0\nRERHe3t7I12vaWJrkfPecjAzq6qRs5UEPAK8HBF/XDFqCzBwxtEa4PGK+up01tJy4ETa/bQNWCFp\nZjoQvQLYlsadlLQ8fdfqinmNmPKjQh0OZmbVlBpo85vAvwael/Rcqv1n4CvAo5LWAm8Cn0jjtgJ3\nA13AWeA+gIjokfQlYFdq98WI6EnDnwG+DUwEnkivETWhpeiL4MzMasgMh4j4O6DWdQd3VGkfwP01\n5rUB2FCl3gl8IKsvw2lii3crmZnVkssrpMG7lczM6sltOExsKZ/KWt7QMTOzSvkNh3Tbbm89mJkN\nldtwGHga3MnzfhqcmdlguQ2HKRPKWw5n/BxpM7Mh8hsO6VGhvvmemdlQuQ2H6RNbATh+zruVzMwG\ny204zJvWBsChE+eb3BMzs7Ent+Ewd+oEAI6cutDknpiZjT25DYfWUoGpE0ocOe1wMDMbLLfhADB3\nahuHveVgZjZErsNhxqRWX+dgZlZFrsNhSluJ0z6V1cxsiFyHw9QJJV/nYGZWRa7DYdbkVo6eudjs\nbpiZjTm5DoeBYw59/b4zq5lZpXyHw8QWIuCkr5I2M3uHXIfD7CnlW2j4Wgczs3fKdTi0Ty3fQqPb\n4WBm9g75DocpKRx8IZyZ2TvkOxzSlsOR0z5jycysUmY4SNog6bCkFypqsyRtl7Qnvc9MdUlaL6lL\n0m5Jt1RMsya13yNpTUX9VknPp2nWS9JwL2Qt0ye20FKUtxzMzAZpZMvh28DKQbUHgB0RsRTYkT4D\n3AUsTa91wENQDhPgQeA2YBnw4ECgpDafqphu8HeNGEnMmdLmA9JmZoNkhkNE/BToGVReBWxMwxuB\neyrqm6JsJzBD0nzgTmB7RPRExDFgO7AyjZsWETsjIoBNFfMaFXOmtHnLwcxskCs95jAvIg6m4UPA\nvDS8ANhX0W5/qtWr769Sr0rSOkmdkjq7u7uvsOvv1O47s5qZDXHVB6TT//hH5RLjiHg4IjoioqO9\nvX1Y5nndrEn836NnKC+GmZnBlYfDW2mXEOn9cKofABZVtFuYavXqC6vUR83i2ZM4c7HPZyyZmVW4\n0nDYAgyccbQGeLyivjqdtbQcOJF2P20DVkiamQ5ErwC2pXEnJS1PZymtrpjXqHj3nMkAvHH0zGh+\nrZnZmNbIqazfA34GXC9pv6S1wFeAj0raA/x2+gywFdgLdAHfBD4DEBE9wJeAXen1xVQjtflWmuZ1\n4InhWbTGvHf
OFABeP3x6NL/WzGxMK2U1iIhP1hh1R5W2AdxfYz4bgA1V6p3AB7L6MVIWzJxIW6lA\nl8PBzOxtub5CGqBYEO9tn0JXt8PBzGxA7sMB4H1zp7DnLYeDmdkAhwOwdO4UDhw/R4+fCmdmBjgc\nAPit6+cC8OQrhzNampnlg8MB+I35U5naVuKp14bnqmszs/HO4QCUigXuXbaIv9r9/zh86nyzu2Nm\n1nQOh+STy65DEt/86d5md8XMrOkcDsl72qdw5/vnsfmZfZy50Nvs7piZNZXDocLaDy/h1IVeHnt2\nf3ZjM7NrmMOhwq3vnsXN183gW3+3l96+/mZ3x8ysaRwOg3z6I+9hX885tr5wqNldMTNrGofDIL/9\nG/N4b/tk/tu2V3zswcxyy+EwSKlY4L/+i3/A/mPn+KMfv9Ls7piZNYXDoYplS2ax5kOL2fizN+l8\nY/Djs83Mrn0Ohxr+453X8/emT+D+P/85+3rONrs7ZmajyuFQw+S2Ehvu+4ecv9TPx//sZ3QdPtXs\nLpmZjRqHQx03vGsa3//0cnr7g3/50M94/LkD9PdHs7tlZjbiHA4ZbnjXNH74u/+IJXMm87nNz/GP\n//uT/MnfvOZdTWZ2TVP5yZ7jT0dHR3R2do7a913q6+evdh/kB8/u4/+8fpQIuPm6Gfz9BdO54V3T\nuGH+VK6fN5XJbZlPXjUzawpJz0ZER0NtHQ6Xb/+xs/zw5wf429e6efXQKU5XXA8xZ0obi2dP4rrZ\nk1g4YyKzp7QxY1IL7VPbWDBjIgtnTqJYUFP6bWb5Ni7DQdJK4OtAEfhWRHylXvtmhkOl/v7gwPFz\nvHLoFK8eOsm+nnO8cfQMbx49y+FT5xl8iKK1VGDJ7Mm8e/Yk5k5rY9bkNuZMaWX25DZmTW4tD09p\nY/rEFoeImQ2rywmHMbEPRFIR+AbwUWA/sEvSloh4qbk9y1YoiEWzJrFo1iQ+euO8d4zr7evnxLlL\nHDt7kcMnL7Dv2Fn2dp/h9e7TvHH0DLve6OH4uUtUy2cJZk5qZdbkVuZPn8C7pk3gXdMnMHdqG7On\nlINk4DV1Qom2UnGUltjM8mBMhAOwDOiKiL0AkjYDq4AxHw71lIoFZk8p/zF/39ypVdv09QdHz1zg\n2JlLHD19gSNnLtJz+gI9Zy/Rc+YCR05d5OCJc7x66BRHTl8YsiUyoK1UYEpbiQktRSa2FpnYUmRC\nS4EJLUXaSuX3CQO1UpG2lgJtpSKtpQKtxQKtpQKlgigVC7QURalQoFTU28PFgt750q+HSwVRKIiC\nREFQkJCgmGpKtYHxGtROpPfKYcrtyu/lYTMbPWMlHBYA+yo+7wdua1JfRlWxIOZOncDcqROA6gEy\noLevn56zFzly6iLHzl7k6JmLHDtzkVPnL3Hi3CXOXuzj3KU+zqX3C5f6OX2hlyOn+7lwqY/zl/o4\n39tffr/UVzNoxqrK0CikIOHtWgobLi9U6o2uN2W9+WbFWP0u1Zlvxozr97fedCP1nVce6HX7O8aW\nJXMph/lnbNakVh79tx/K+tarNlbCoSGS1gHrAK677rom92b0lYqFiiC5er19/Vzs6+dib/nV2x/0\n9gWX+vvL730DtX76+oO+iPL7oFdvf9AfQQT0R9Cf3qNiuD8of+6vrA3UIShPH2k+QZX6oFp/Gubt\n9kPnV0+94231Jq0336g7Zda0VzZd1tR1v3MMLkvd773i78xYlrrTXtl0Wd9bd9o6I6dOGJ0/22Ml\nHA4Aiyo+L0y1d4iIh4GHoXxAenS6du0qFQuUigUmtTa7J2Y21oyVi+B2AUslLZHUCtwLbGlyn8zM\ncmtMbDlERK+kzwLbKJ/KuiEiXmxyt8zMcmtMhANARGwFtja7H2ZmNnZ2K5mZ2RjicDAzsyEcDmZm\nNoTDwczMhnA4mJnZEGPmrqyXS1I38OYVTj4HODKM3bnWeP3U5/VTn9dPbc1eN++OiPZGGo7bcLga\nkjobvW1tHnn91Of1U5/XT23jad14t5KZmQ3hcDAzsyHyGg4PN7sDY5zXT31eP/V5/dQ2btZNLo85\nmJlZfXndcjAzszpyFQ6SVkp6VVKXpAea3Z/RJOkNSc9Lek5SZ6rNkrRd0p70PjPVJWl9Wk+7Jd1S\nMZ81qf0eSWuatTxXS9IGSYclvVBRG7b1IenWtL670rTj6jmnNdbPH0g6kH6GnpN0d8W4L6RlfVXS\nnRX1qr9z6fb8T6f699Ot+scNSYskPSnpJUkvSvpcql87P0ORnth1rb8o3wr8deA9QCvwS+DGZvdr\nFJf/DWDOoNofAQ+k4QeAr6bhu4EnKD/FcDnwdKrPAvam95lpeGazl+0K18dHgFuAF0ZifQDPpLZK\n097V7GUehvXzB8B/qNL2xvT71AYsSb9nxXq/c8CjwL1p+M+A3232Ml/m+pkP3JKGpwKvpfVwzfwM\n5WnLYRnQFRF7I+IisBlY1eQ+NdsqYGMa3gjcU1HfFGU7gRmS5gN3AtsjoicijgHbgZWj3enhEBE/\nBXoGlYdlfaRx0yJiZ5R/yzdVzGtcqLF+alkFbI6ICxHxK6CL8u9b1d+59D/g24HH0vSV63pciIiD\nEfHzNHwKeBlYwDX0M5SncFgA7Kv4vD/V8iKAv5b0bHoWN8C8iDiYhg8B89JwrXV1ra/D4VofC9Lw\n4Pq14LNpt8iGgV0mXP76mQ0cj4jeQfVxSdJi4Gbgaa6hn6E8hUPefTgibgHuAu6X9JHKkel/Jz51\nLfH6qOoh4L3ATcBB4H80tzvNJ2kK8BfA5yPiZOW48f4zlKdwOAAsqvi8MNVyISIOpPfDwI8ob/K/\nlTZfSe+HU/Na6+paX4fDtT4OpOHB9XEtIt6KiL6I6Ae+SflnCC5//RylvFulNKg+rkhqoRwM342I\nH6byNfMzlKdw2AUsTWdJtAL3Alua3KdRIWmypKkDw8AK4AXKyz9wdsQa4PE0vAVYnc6wWA6cSJvK\n24AVkmamXQorUu1aMSzrI407KWl52r++umJe49bAH73kn1P+GYLy+rlXUpukJcBSygdTq/7Opf9R\nPwl8LE1fua7HhfTv+gjwckT8ccWoa+dnqNlH/UfzRfmMgdcon0Hx+83uzygu93sonynyS+DFgWWn\nvO93B7AH+BtgVqoL+EZaT88DHRXz+jeUDzh2Afc1e9muYp18j/KukUuU9+euHc71AXRQ/uP5OvA/\nSRecjpdXjfXznbT8uyn/sZtf0f7307K+SsVZNbV+59LP5DNpvf0AaGv2Ml/m+vkw5V1Gu4Hn0uvu\na+lnyFdIm5nZEHnarWRmZg1yOJiZ2RAOBzMzG8LhYGZmQzgczMxsCIeDmZkN4XAwM7MhHA5mZjbE\n/wf3xMIpnDEfowAAAABJRU5ErkJggg==\n", 109 | "text/plain": [ 110 | "
" 111 | ] 112 | }, 113 | "metadata": {}, 114 | "output_type": "display_data" 115 | } 116 | ], 117 | "source": [ 118 | "plt.plot(a.values)\n", 119 | "plt.show()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 32, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHItJREFUeJzt3Xl8VOW9x/HPbyYbCQlrCEsImyCC\nyBYWAbUuIFqXutTlal2qpdZatdqF2vZqr7223lqtS22lSt1LtcXrUiwigmyCBGRHCCAICEnYw06S\n5/6RgRuRkEmYyTln8n2/XnnlcObM5HceZ76eec5zzmPOOUREJDhCXhcgIiK1o+AWEQkYBbeISMAo\nuEVEAkbBLSISMApuEZGAUXCLiASMgltEJGCSotnIzNYCpUA5UOacy49nUSIiUr2ogjvibOfclmg2\nbNmypevYsWPdKhIRaYDmzZu3xTmXHc22tQnuqHXs2JGCgoJ4vLSISEIys3XRbhttH7cD3jOzeWY2\nqm5liYhILER7xD3MObfRzFoBk8zsU+fctKobRAJ9FEBeXl6MyxQRkcOiOuJ2zm2M/C4G3gAGHmOb\nMc65fOdcfnZ2VN00IiJSBzUGt5llmFnm4WVgBLAk3oWJiMixRdNVkgO8YWaHt3/VOffvuFYlIiLV\nqjG4nXNrgN71UIuIiERBV06KiASMr4L7icmFfLiyxOsyRER8zVfB/cyHq5m6otjrMkREfM1XwZ2Z\nlszu/WVelyEi4ms+C+4kShXcIiLH5b/gPnDI6zJERHzNZ8GdrCNuEZEa+Cy41VUiIlITnwV3MqX7\n1VUiInI8vgrurLQkdumIW0TkuPwV3I2SOVhWwf5D5V6XIiLiW74K7laZqQAU7zrgcSUiIv7lq+DO\nyUoDoKh0v8eViIj4lz+De5eCW0SkOj4L7squkiJ1lYiIVMtXwd2kUTIpSSGKdcQtIlItXwW3mZGT\nlcpmBbeISLV8FdwArbPS1MctInIcvgvuVllpGg4oInIcvgvu1llpbNixj4oK53UpIiK+5Lvg7pyd\nwcGyCtZv3+t1KSIivuS74O6X1wyABet3eFyJiIg/+S64u2Q3JilkfLq51OtSRER8yXfBnZIUIq9F\nOmu37PG6FBERX/JdcAO0b5auPm4RkWr4MrjzmqezbutenNPIEhGRo/kyuLtkZ1C6v4yS3RrPLSJy\nNF8Gd7ecTACWb9IJShGRo/kyuE/NbQLAIg0JFBH5Cl8Gd1ZaMl2yM1i4YafXpYiI+I4vgxugR9sm\nrCja5XUZIiK+49vg7taqMeu37WPvQc36LiJSVdTBbWZhM/vEzN6JZ0GHdY2coCws2l0ff05EJDBq\nc8R9F7A8XoUcrVtOYwBWFmlkiYhIVVEFt5nlAl8Hno1vOf+vQ4sMUpJCFBbriFtEpKpoj7j/APwE\nqIhjLV8SDhldshvriFtE5Cg1BreZXQQUO+fm1bDdKDMrMLOCkpKSmBR3ck5jlm/SyBIRkaqiOeIe\nClxiZmuBccA5Zvby0Rs558Y45/Kdc/nZ2dkxKa5P+6YU7TrAxh37YvJ6IiKJoMbgds79zDmX65zr\nCFwDfOCcuz7ulQH5HZsDULB2W338ORGRQPDtOG6A7q0zSU8JM3/ddq9LERHxjaTabOycmwpMjUsl\nx5AUDtE7tynzPldwi4gc5usjboD+HZqxfFOprqAUEYnwfXAP7NSc8grHzFVbvS5FRMQXfB/cp3dp\nQXpKmHcXb/K6FBERX/B9cCeHQ1yV3563Fn5B0a79XpcjIuI53wc3wM1DO1LuHK/MXud1KSIingtE\ncHdokcG53VvxypzPOVBW7nU5IiKeCkRwA9xweke27jnIe0uLvC5FRMRTgQnuoSe1pG2TNF6ft8Hr\nUkREPBWY4A6HjCv65zK9sIRNO3XvEhFpuAIT3ABX9s/FORg/f6PXpYiIeCZQwd2hRQYDOzbn9YL1\nOOe8LkdExBOBCm6AK/NzWbt1L3PX6v4lItIwBS64v96rDekpYf4+d73XpYiIeCJwwZ2RmsSlfdry\nzqIvdJJSRBqkwAU3wK1ndOZAWQXvLNT9S0Sk4QlkcHfJbkx+h2a8NHsd5RU6SSkiDUsggxvg5qGd\n+HzbXiYt05WUItKwBDa4z++ZQ/vmjfjL9DVelyIiUq8CG9xJ4RC3DO3EvHXbmbdOkwmLSMMR2OAG\n+GZ+e5qlJ/ObCZ/qghwRaTACHdwZqUn8+PzuFKzbzjuLNMJERBqGQAc3wNUD2tMtpzFPflDIofIK\nr8sREYm7wAd3OGTcM/xkVhbt5qkPVnldjohI3AU+uAFGntqay/q244kPCvn3ks1elyMiElcJEdwA\nD13Wi965Tblz3CcaZSIiCS1hgrtRSpi/3jSAVpmp3P33BZSUHvC6JBGRuEiY4AZolpHCk9f2paT0\nAJc9PZPlm3Z5XZKISMwlVHAD9M1rxrhRp3OovILLnp7JmGmrdT8TEUkoCRfcAH3aN+XtHwxj2Ekt\neWjCp1z+9ExWFpV6XZaISEwkZHADtMpM4y835PP4NX3YsH0fFz85g99MWM7+Q+VelyYickISNrgB\nzIxL+7Tj3bvP4MJebXhm2houeHw6M1dt0SXyIhJYCR3ch7XKTOOxq/vw8i2DKK9wXPfsHG7661w+\nWr1V/d8iEjhW05GnmaUB04BUIAn4h3Pu/uM9Jz8/3xUUFMSsyFjad7Ccl2ev4/HJhew+UEbbJml8\n58zOXDeoAylJDeL/YyLiQ2Y2zzmXH9W2UQS3ARnOud1mlgzMAO5yzs2u7jl+Du7D9h4s4/3lxbwy\nex1zPttGt5zGPHfjANo3T/e6NBFpgGoT3DUeYrpKuyP/TI78BL5/IT0liUt6t2XcqME8d2M+m3fu\n54o/zWLJxp1elyYiclxR9Q2YWdjMFgDFwCTn3Jz4llV/zIxzT8lh3KjTCYeMq575iKkrir0uS0Sk\nWlEFt3Ou3DnXB8gFBprZqUdvY2ajzKzAzApKSkpiXWfc9WibxZvfH0pe83RueaGAX729lKJd+70u\nS0TkK2p1Ns45twOYAow8xmNjnHP5zrn87OzsWNVXr1plpfH6badzZb9cXpi1lrMfmcofp6ziYJnu\n8y0i/lFjcJtZtpk1jSw3AoYDn8a7MK
9kpiXz8JWn8f49Z3FG15b8buIKhvz2AyYs1gw7IuIP0Rxx\ntwGmmNkiYC6VfdzvxLcs73XObswz38rnpVsG0rZpGre/Mp/bXprHjr0HvS5NRBq4GocD1kUQhgPW\nxsGyCsZMW80Tk1eR1SiZe4Z34+oB7QmHzOvSRCRBxHQ4oEBKUog7zunK67edTscW6dz3xmKuf3YO\nxTp5KSIeUHDXQu/2TXn9ttP5zeW9KFi3jeGPTaNgrWbbEZH6peCuJTPj2oF5TLjzDJpnpHDriwUU\n6paxIlKPFNx11DUnk7E3DSApZFwzZjafb93rdUki0kAouE9Ap5YZjBs1mEPlFdz0/Me6YEdE6oWC\n+wSd1CqT524aQNHO/dz817ns3HfI65JEJMEpuGNgQMfmPHVdP1YWlfIff5mt8BaRuFJwx8jZJ7fi\nT9f3Z8XmUq4dM5udexXeIhIfCu4YGt4jh7/ckM/KolKu/PMshbeIxIWCO8bO7t6K528eyGdb9nD7\nq/Mo3a/wFpHYUnDHwbCuLfntFacxe802Ln96lkabiEhMKbjj5Mr+ufz1pgFs3LGPa8bMZvNOhbeI\nxIaCO47O7JbN2JsGsGnnPi57eiYzCrd4XZKIJAAFd5wN7tyCv31nMMnhENc/N4cbx37M1t0HvC5L\nRAJMwV0P+uY14927zuCnI7vz0ZqtjHx8OhOXbva6LBEJKAV3PclITeJ7X+vCG7cPIT0lzHdfmscd\nr85n2x5NzCAitaPgrmc92zZh4t1n8oNzTuLdJZs5+5GpfLR6q9dliUiAKLg9kJYc5t4RJ/PG7UNo\nlp7Mdc/OZvz8DV6XJSIBoeD20Gm5TXn9tiH079CMe15byCMTV1BREfup5EQksSi4PZadmcpLtwzi\n0j5teWrKKv7j2dls0agTETkOBbcPpCWH+cPVffjF10+hYO12LnlyBnM1JZqIVEPB7RNmxq1ndOaV\nWwdR7hxXP/MRf5yyCufUdSIiX6bg9plBnVvw3g/P4pzurfjdxBXcOW6BblQlIl+i4PahJo2SeeZb\n+Xz/7C68vfALbhz7MWtKdntdloj4hILbp8Ih48fnd+eRb/ZmyRe7GPn4dF6YtVZdJyKi4Pa7K/vn\nMvmes+if14z731rKLS8UsH6bZpQXacgU3AHQvnk6r9w6iHuHd2NG4RZGPDaN1wrWe12WiHhEwR0Q\noZDxg3O78v49Z9GlVQY/+ccirh0zm2JN0iDS4Ci4AyavRTrjvzeUH57XjdmfbeXsR6byv59s9Los\nEalHCu4ASkkKcdd5XXnl1kFkpCZx998XcOHj01m0YYfXpYlIPVBwB9iQLi2Z/tOzufPcrqwoKuWS\np2Yy+p+LKNf9TkQSmoI74FKTwtwzvBszfno23XIaM27uekb+YZpGnogksBqD28zam9kUM1tmZkvN\n7K76KExqp02TRvz7rjO578LuFBbv5oz/mcJ/vb2MPQfKvC5NRGIsmiPuMuBe51wPYDDwfTPrEd+y\npC5CIWPUmV145wfD6Nk2i7EzP6Pvg5N4bNJKDpSVe12eiMRIjcHtnNvknJsfWS4FlgPt4l2Y1N2p\n7ZrwrzvP4Onr+tEsPZnHJxfS/8H3mblKs8yLJIJa9XGbWUegLzAnHsVIbF3Yqw1z7juPh6/oxcGy\nCq57dg63PD9X/d8iAWfR3vvCzBoDHwL/7Zwbf4zHRwGjAPLy8vqvW7culnXKCSou3c994xfz/vJi\nAC7p3ZZfXtSD7MxUjysTEQAzm+ecy49q22iC28ySgXeAic65R2vaPj8/3xUUFETz96WeLd6wkwfe\nXsq8ddsB+NbgDvz0gu40Tk3yuDKRhi2mwW1mBrwAbHPO3R3Niyq4/W/W6i38bPxi1m2t7Da5aUhH\nfnheN5qkJ3tcmUjDFOvgHgZMBxYDFZHV9znnJlT3HAV3cLy7eBO/ensZmyP3PBnZszUPX3GaAlyk\nnsW8q6S2FNzB83rBekaPX3zkqstffP0Ubj2js8dViTQctQluXTkpAHwzvz2Fv76Ae4Z3A+DX/1rO\nub+fSpHuPijiOwpuOSIUMu48tysf/ewc2jVtxOqSPQx6aDKj/7mI/Yd0AY+IXyi45SvaNGnEzNHn\ncP/FlRfIjpu7nu6//LcmbxDxCQW3VOvmoZ1Y9d8X8M3+uQD85B+L6PXARN5csFFzX4p4SCcnJSob\nd+zjR68t5KM1WwFolBzmlxf14NqB7akcMSoiJ0KjSiRuVpfs5ldvL2PayhIAMlLCTPzhmeQ2S/e4\nMpFg06gSiZsu2Y158dsD+fjn59ItpzF7DpYz7OEpjHqxQLeQFaknCm6pk1aZabz3w7N4+IpeALy3\nrIie90/kicmFVGgGHpG4UnDLCbl6QB5rHrqQm4Z0BODRSSvpfN8Elmzc6W1hIglMwS0nLBQyHrik\nJwv+czj98poCcNGTM7j86ZmsKt7tcXUiiUfBLTHTND2F8bcP5U/X9SMzNYn5n+/gvEc/5Jt/nsX2\nPQe9Lk8kYSi4JeYu6NWGRQ+M4Mlr+5KVlsTctdvp++AkfvG/izlUXlHzC4jIcSm4JS7MjIt7t2Xh\n/SP4xddPAeDl2Z/T9efv8scpq3QJvcgJUHBLXJkZt57RmRW/HslFp7UB4HcTV9D9l//mmQ9XawSK\nSB3oAhypV7sPlPHQhOW8OufzI+sevao3l/fL9bAqEe/pyknxvR17D3LHq58wIzLzfFLIeOzqPlzc\nu63HlYl4Q1dOiu81TU/h5VsH8fHPz+WsbtmUVTh+8LdPOOeRqcwo3KIuFJHj0BG3+MLSL3Zy3/jF\nLNxQeeFO66w0fnVpT87v2drjykTqh7pKJLBmrdrCneMWsGX3AQB6ts3iRyNOZkCn5pqJXhKaglsC\nzTnHko27+P2kFUxdUXJk/T3Du3Fhrzac1Kqxh9WJxIeCWxLGzFVbmLSsiOdnrT2y7rtndebHI04m\nKaxTNJI4FNyScEpKDzB25mf8aepqANJTwtwyrBP3jjjZ48pEYkPBLQmrdP8h/vPNpbzxyUYAwiHj\nmev7M7hLC/WBS6ApuCXhrS7ZzZgP1/D3KhMY//N7Qzi1XRapSWEPKxOpGwW3NBhTVhQzdsZnTC+s\nvJBnUKfm3DO8G71ym5CeoiNwCQ4FtzQoh8ormLNmGzeMncPh63a+0act3zq9Ax1bZNCicaq3BYpE\nQcEtDVLRrv2sLtnNg+8sZ/mmXUDlOPAnr+1LZloy2ZkKcPEvBbc0aJt37mdlUSnj5n7OhMWbgcqT\nmLNGn0NOVprH1YkcW22CW52AknBaN0mjdZM0Tm3XhPN7tmZ18W6e+GAVgx6aDMD1g/P49Td6eVyl\nSN0puCVhNc9I4dI+7ThYVkFmWjK7D5Txr8WbGD9/I8u+2EVSOMT9F/egZ9smXpcqUisKbkl4KUkh\nvnNmZwBOaZPFK3PW4RzMWLWFR99bycBOzclqlMzV+e0JhczjakVqpuCWBmXkqa0ZeWrlHQfPfmQ
q\nkz8tZvKnxQB0y8mkZ9ssQmakJOlyevGvGk9OmtlY4CKg2Dl3ajQvqpOTEgRl5RUcLK9gycZdXPXM\nR0fWm8Ez1/dnhG4pK/Uo1icnnweeAl48kaJE/CYpHCIpHCK/QzMe/MaplO4/hHOVc2K+NHsdKzaX\nAtCxZYZm5hFfqTG4nXPTzKxj/EsR8UYoZHxrcIcj/3574RdML9xy5GpMMxjRM0eX0otvqI9b5Cjv\n3nUG5ZFLMF+Z8zn3v7WUy5+eRVLkxOVJrTL5/VW9vSxRGriYnYExs1FmVmBmBSUlJTU/QcSnzOxI\nN8rXTs5mRI8csjNTaZaRQumBMv45fwMHysq9LlMasKiunIx0lbyjk5PS0L300Vp++eZSmqUnE7Iv\nDx3snJ3Ba989HTMNKZTa05WTInEyomdr1mzZw6Hyii+tX76plLlrt7PvULnuSihxV+M7zMz+BnwN\naGlmG4D7nXPPxbswET/KyUrj/ot7fmX9y7PXMW/ddi55auaRvnCA1OQwj13Vm87ZmidTYieaUSXX\n1kchIkF2VrdsLu7dloNV+r73HixneuEWFm3YqeCWmNJ3OpEYaN88nSev7fuldcW79jPwocm8t2wz\nW3YfOObz8js2p0/7pvVRoiQQBbdInDRJT6Zl4xQmLN585PayR+ud24Q37xhWz5VJ0Cm4ReIkNSnM\nrNHnVjt08MevL2JlcWk9VyWJQMEtEkcpSaFqb1iVmZbExu37+Pbzc4/5+IgeOVwzMC+e5UlAKbhF\nPHLuKTmsKCqlpPSr/d9rt+6huHS/gluOScEt4pGqt5g92u2vzGNl0e56rkiCQsEt4kNpyWFKSg/w\n5w9XV7tNr3ZNGHpSy3qsSvxCwS3iQye1asz4+Rv57bufVrtNu6aNmDn6nHqsSvxCwS3iQ7d/7SRu\nHtKp2sf/651lTFx67CGGkvgU3CI+1Sil+vt/p6eEOVhWUe3jktgU3CIBlJIUYt+hch5/vzCq7VOT\nQ1w3KI/MtOQ4Vyb1QcEtEkDdchpT4RyPvb8y6ue0bdqISzQFW0JQcIsE0GV9c7m0d7uotl2/fS9n\n/W4qBw5p8odEoeAWCahQKLoJGw7PlXmovOZJUyQYYjZ1mYj4U3K4MuDLKnQyM1HoiFskwSVH7pVS\nsHY7jZJrP1N9vw7N6KL7ifuKglskwTVKDpOVlsRbC7/grYVf1Pr5Q7q04NXvDI5DZVJXCm6RBJcc\nDjFj9Dns3Huo1s+997WF7NNJTd9RcIs0AFlpyWTVYQx3RmqYrXsU3H6jk5MiUq1wKESZRqP4joJb\nRKqVFDLKKxTcfqPgFpFqhcOmYYQ+pD5uEalWUsjYd7CcwqLYz41pBp1aNiYc5YVE8v8U3CJSrfSU\nJL7YuZ/hj02Ly+t//+wu/Pj87nF57USm4BaRat07ohtDT2oRl9f+2fjFbNtT+yGKouAWkeNo2TiV\ni06Lzx0FH3xnGc7pxGdd6OSkiHgibBqxUlcKbhHxhJmh3K4bBbeIeCIUggp1ldSJgltEPBE2U3DX\nkYJbRDwRUldJnSm4RcQToZBRoeSuk6iC28xGmtkKM1tlZqPjXZSIJL6QqY+7rmoMbjMLA38ELgB6\nANeaWY94FyYiiS2k4YB1Fs0R90BglXNujXPuIDAOuDS+ZYlIolMfd91Fc+VkO2B9lX9vAAbFpxwR\naShCIfho9RaGP/qh16XETLP0FF677fS4/52YXfJuZqOAUQB5eXmxelkRSVA3DenEB58WeV1GTNVl\nlqG6iCa4NwLtq/w7N7LuS5xzY4AxAPn5+foCJCLHdWX/XK7sn+t1GYEUTR/3XKCrmXUysxTgGuCt\n+JYlIiLVqfGI2zlXZmZ3ABOBMDDWObc07pWJiMgxRdXH7ZybAEyIcy0iIhIFXTkpIhIwCm4RkYBR\ncIuIBIyCW0QkYBTcIiIBY/GYrNPMSoB1dXx6S2BLDMtJNGqf41P7HJ/ap3pet00H51x2NBvGJbhP\nhJkVOOfyva7Dr9Q+x6f2OT61T/WC1DbqKhERCRgFt4hIwPgxuMd4XYDPqX2OT+1zfGqf6gWmbXzX\nxy0iIsfnxyNuERE5Dt8Ed0OekNjM1prZYjNbYGYFkXXNzWySmRVGfjeLrDczeyLSTovMrF+V17kx\nsn2hmd3o1f6cKDMba2bFZrakyrqYtYeZ9Y+096rIc61+9/DEVNM+D5jZxsh7aIGZXVjlsZ9F9nWF\nmZ1fZf0xP3ORWzjPiaz/e+R2zoFhZu3NbIqZLTOzpWZ2V2R94ryHnHOe/1B5u9jVQGcgBVgI9PC6\nrnrc/7VAy6PW/Q8wOrI8Gng4snwh8C5gwGBgTmR9c2BN5HezyHIzr/etju1xJtAPWBKP9gA+jmxr\nkede4PU+x6B9HgB+dIxte0Q+T6lAp8jnLHy8zxzwGnBNZPnPwPe83udatk8boF9kORNYGWmHhHkP\n+eWIWxMSf9WlwAuR5ReAb1RZ/6KrNBtoamZtgPOBSc65bc657cAkYGR9Fx0LzrlpwLajVsekPSKP\nZTnnZrvKT+CLVV4rEKppn+pcCoxzzh1wzn0GrKLy83bMz1zkyPEc4B+R51dt60Bwzm1yzs2PLJcC\ny6mcOzdh3kN+Ce5jTUjczqNavOCA98xsXmTuToAc59ymyPJmICeyXF1bJXobxqo92kWWj16fCO6I\nfNUfe7gbgNq3Twtgh3Ou7Kj1gWRmHYG+wBwS6D3kl+Bu6IY55/oBFwDfN7Mzqz4Y+b+6hv9EqD2O\n6U9AF6APsAn4vbfleM/MGgP/BO52zu2q+ljQ30N+Ce6oJiROVM65jZHfxcAbVH6NLYp8JSPyuziy\neXVtlehtGKv22BhZPnp9oDnnipxz5c65CuAvVL6HoPbts5XKroKko9YHipklUxnarzjnxkdWJ8x7\nyC/B3WAnJDazDDPLPLwMjACWULn/h89i3wi8GVl+C7ghciZ8MLAz8vVvIjDCzJpFviaPiKxLFDFp\nj8hju8xscKQ/94YqrxVYhwMp4jIq30NQ2T7XmFmqmXUCulJ5Yu2Yn7nIkegU4MrI86u2dSBE/rs+\nByx3zj1a5aHEeQ95fQb48A+VZ3ZXUnmm++de11OP+92ZyjP6C4Glh/edyr7GyUAh8D7QPLLegD9G\n2mkxkF/ltb5N5cmnVcDNXu/bCbTJ36j8un+Iyv7DW2LZHkA+lcG2GniKyIVoQfmppn1eiuz/IiqD\nqE2V7X8e2dcVVBn9UN1nLvKe/DjSbq8DqV7vcy3bZxiV3SCLgAWRnwsT6T2kKydFRALGL10lIiIS\nJQW3iEjAKLhFRAJGwS0iEjAKbhGRgFFwi4gEjIJbRCRgFNwiIgHzfwvnQoSGc4mTAAAAAElFTkSu\nQmCC\n", 130 | "text/plain": [ 131 | "
" 132 | ] 133 | }, 134 | "metadata": {}, 135 | "output_type": "display_data" 136 | } 137 | ], 138 | "source": [ 139 | "plt.plot(np.log10(a.values))\n", 140 | "plt.show()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 2", 161 | "language": "python", 162 | "name": "python2" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 2 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython2", 174 | "version": "2.7.6" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | -------------------------------------------------------------------------------- /data_files/chembl_preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from __future__ import division\n", 12 | "\n", 13 | "import os\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "from scipy import stats\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "%matplotlib inline" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 30, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "from rdkit import Chem\n", 30 | "from rdkit.Chem import Draw\n", 31 | "from rdkit.Chem.Draw import IPythonConsole\n", 32 | "from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "txt_dir = \"chembl_source\"" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [ 53 | { 54 | "name": "stderr", 55 | "output_type": "stream", 56 | "text": [ 57 | "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (1,3,6,7,8,9,11,14,16,19,23,27,31,34,35,38,44,48,50,52,53,54,55,56,57,58) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", 58 | "  interactivity=interactivity, compiler=compiler, result=result)\n" 59 | ] 60 | }, 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "(1235867, 59)" 65 | ] 66 | }, 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "# read all chembl bioactivity records\n", 74 | "chembl = pd.read_csv(os.path.join(txt_dir, \"inhibitor_2017_06_08.csv\"), delimiter=\"\\t\")\n", 75 | "chembl.shape" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": { 82 | "collapsed": false, 83 | "scrolled": true 84 | }, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "(1230260, 59)" 90 | ] 91 | }, 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "# remove records that have no canonical smiles\n", 99 | "m = chembl[\"CANONICAL_SMILES\"].isnull()\n", 100 | "chembl = chembl[~m]\n", 101 | "chembl.shape" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 21, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "# save inhibitors' smiles and apfp\n", 113 | "smiles = chembl[[\"CMPD_CHEMBLID\", \"CANONICAL_SMILES\"]].copy()\n", 114 | "smiles.drop_duplicates(subset=\"CMPD_CHEMBLID\", inplace=True)\n", 115 | "smiles.set_index(keys=\"CMPD_CHEMBLID\", drop=True, inplace=True)\n", 116 | "smiles.to_csv(txt_dir + \"/inhibitor_smiles.csv\")" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 31, 122 | "metadata": { 123 | "collapsed": false, 124 | "scrolled": true 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "CHEMBL1161633\n", 132 | "CHEMBL2097021\n", 133 | "CHEMBL471869\n", 134 | "CHEMBL1161635\n", 135 | "CHEMBL181124\n", 136 | "CHEMBL1161637\n", 137 | "CHEMBL181880\n", 138 | "CHEMBL3593577\n", 139 | "CHEMBL450200\n", 140 | "CMPD_CHEMBLID\n", 141 | "CHEMBL450642\n", 142 | "CHEMBL2205792\n", 143 | "CHEMBL2205793\n", 144 | "CHEMBL490121\n", 145 | "CHEMBL523281\n", 146 | "CHEMBL463327\n", 147 | "CHEMBL522826\n", 148 | "CHEMBL2205790\n", 149 | "CHEMBL495469\n", 150 | "CHEMBL2205791\n", 151 | "CHEMBL492602\n", 152 | "CHEMBL2205788\n", 153 | "CHEMBL2205787\n", 154 | "CHEMBL452133\n", 155 | "CHEMBL2205785\n", 156 | "CHEMBL508580\n", 157 | "CHEMBL508803\n", 158 | "CHEMBL2205789\n", 159 | "CHEMBL2205786\n", 160 | "CHEMBL493431\n", 161 | "CHEMBL2087763\n", 162 | "CHEMBL2087764\n", 163 | "CHEMBL2179461\n", 164 | "CHEMBL2179458\n", 165 | "CHEMBL2179464\n", 166 | "CHEMBL2179462\n", 167 | "CHEMBL2179459\n", 168 | "CHEMBL2179463\n", 169 | "CHEMBL1083554\n", 170 | "CHEMBL2179460\n", 171 | "CHEMBL3327018\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "def dict_2_str(d):\n", 177 | "    keylist = d.keys()\n", 178 | "    keylist.sort()\n", 179 | "    kv_list = [\"{}: {}\".format(k, d[k]) for k in keylist] \n", 180 | "    return \", \".join(kv_list)\n", 181 | "\n", 182 | "apfp_file = open(txt_dir + \"/inhibitor_apfp.csv\", \"w\")\n", 183 | "for id_, row in smiles.iterrows():\n", 184 | "    m = Chem.MolFromSmiles(row.values[0])\n", 185 | "    if m is None:\n", 186 | "        print id_\n", 187 | "        continue\n", 188 | "    apfps = GetAtomPairFingerprint(Chem.RemoveHs(m)).GetNonzeroElements()\n", 189 | "    apfp_file.write(\"%s\\t{%s}\\n\" % (id_, dict_2_str(apfps)))\n", 190 | "apfp_file.close()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 6, 
196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "# calculate some molecules's weight\n", 202 | "def molwt(x):\n", 203 | " try:\n", 204 | " value = Chem.Descriptors.MolWt(Chem.MolFromSmiles(x))\n", 205 | " except:\n", 206 | " value = np.nan\n", 207 | " return value\n", 208 | "\n", 209 | "m = chembl[\"MOLWEIGHT\"].isnull()\n", 210 | "chembl.loc[m, \"MOLWEIGHT\"] = chembl.loc[m, \"CANONICAL_SMILES\"].apply(molwt)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 8, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "(1223639, 59)" 224 | ] 225 | }, 226 | "execution_count": 8, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "# remove molecules that has no \"MOLWEIGHT\"\n", 233 | "m = chembl[\"MOLWEIGHT\"].isnull()\n", 234 | "chembl = chembl[~m]\n", 235 | "chembl.shape" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 32, 241 | "metadata": { 242 | "collapsed": false, 243 | "scrolled": false 244 | }, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/plain": [ 249 | "(835299, 59)" 250 | ] 251 | }, 252 | "execution_count": 32, 253 | "metadata": {}, 254 | "output_type": "execute_result" 255 | } 256 | ], 257 | "source": [ 258 | "# pick out inhibitor records\n", 259 | "inhibitor = chembl[chembl[\"STANDARD_TYPE\"].isin([\"IC50\", \"Ki\", \"EC50\"])]\n", 260 | "\n", 261 | "# inhibitor records: all IC50, a part of Ki and EC50 with \"inhibit\" in \"DESCRIPTION\"\n", 262 | "m0 = inhibitor[\"STANDARD_TYPE\"].isin([\"IC50\"]) \n", 263 | "m1 = inhibitor[\"STANDARD_TYPE\"].isin([\"Ki\", \"EC50\"]) \n", 264 | "m2 = inhibitor[\"DESCRIPTION\"].apply(lambda x: \"inhibit\" in x.lower())\n", 265 | "m = m0 | (m1 & m2)\n", 266 | "\n", 267 | "inhibitor = inhibitor[m]\n", 268 | "inhibitor.shape" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 33, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "(716442, 59)" 282 | ] 283 | }, 284 | "execution_count": 33, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "# some records without \"STANDARD_VALUE\" should be cleared away\n", 291 | "m = inhibitor[\"STANDARD_VALUE\"].isnull()\n", 292 | "inhibitor = inhibitor[~m]\n", 293 | "inhibitor.shape" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 34, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "Outside typical range 26411\n", 307 | "Potential transcription error 378\n", 308 | "Non standard unit for type 370\n", 309 | "Manually validated 163\n", 310 | "Name: DATA_VALIDITY_COMMENT, dtype: int64" 311 | ] 312 | }, 313 | "execution_count": 34, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "inhibitor[\"DATA_VALIDITY_COMMENT\"].value_counts()" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 35, 325 | "metadata": { 326 | "collapsed": false 327 | }, 328 | "outputs": [ 329 | { 330 | "data": { 331 | "text/plain": [ 332 | "(690031, 59)" 333 | ] 334 | }, 335 | "execution_count": 35, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "# some records with abnormal data also should be cleared away\n", 342 | "#error_comment = 
[\"Outside typical range\", \"Non standard unit for type\", \"Potential transcription error\"]\n", 343 | "error_comment = [\"Outside typical range\"]\n", 344 | "m = inhibitor[\"DATA_VALIDITY_COMMENT\"].isin(error_comment)\n", 345 | "inhibitor = inhibitor[~m]\n", 346 | "inhibitor.shape" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 36, 352 | "metadata": { 353 | "collapsed": false 354 | }, 355 | "outputs": [ 356 | { 357 | "data": { 358 | "text/plain": [ 359 | "(689725, 59)" 360 | ] 361 | }, 362 | "execution_count": 36, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | } 366 | ], 367 | "source": [ 368 | "# correct some STANDARD_UNITS\n", 369 | "m = inhibitor[\"STANDARD_UNITS\"].isin([\"/uM\"])\n", 370 | "inhibitor.loc[m, \"STANDARD_VALUE\"] = inhibitor.loc[m, \"STANDARD_VALUE\"].astype(float).values * 1000\n", 371 | "inhibitor.loc[m, \"STANDARD_UNITS\"] = \"nM\"\n", 372 | "\n", 373 | "m = inhibitor[\"STANDARD_UNITS\"].isin([\"/nM\", \"ug nM-1\", \"Ke nM-1\"])\n", 374 | "inhibitor.loc[m, \"STANDARD_UNITS\"] = \"nM\"\n", 375 | "\n", 376 | "m = inhibitor[\"STANDARD_UNITS\"].isin([\"ug.mL-1\"])\n", 377 | "inhibitor.loc[m, \"STANDARD_VALUE\"] = inhibitor.loc[m, \"STANDARD_VALUE\"].astype(float) / inhibitor.loc[m, \"MOLWEIGHT\"].astype(float) * 10**6\n", 378 | "inhibitor.loc[m, \"STANDARD_UNITS\"] = \"nM\"\n", 379 | "\n", 380 | "m = inhibitor[\"STANDARD_UNITS\"].isin([\"nM\"])\n", 381 | "inhibitor = inhibitor[m]\n", 382 | "inhibitor.shape" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 37, 388 | "metadata": { 389 | "collapsed": false 390 | }, 391 | "outputs": [ 392 | { 393 | "data": { 394 | "text/plain": [ 395 | "(662788, 59)" 396 | ] 397 | }, 398 | "execution_count": 37, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "# remove duplicates\n", 405 | "m = inhibitor[\"POTENTIAL_DUPLICATE\"].fillna(0).astype(int) == 0\n", 406 | "inhibitor = inhibitor[m]\n", 407 | "inhibitor.shape" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 39, 413 | "metadata": { 414 | "collapsed": false 415 | }, 416 | "outputs": [], 417 | "source": [ 418 | "inhibitor.to_csv(txt_dir + \"/inhibitor_clean_2017_06_08.csv\", index=False)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": { 425 | "collapsed": true 426 | }, 427 | "outputs": [], 428 | "source": [] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": { 434 | "collapsed": true 435 | }, 436 | "outputs": [], 437 | "source": [ 438 | "# judge a record's clf label\n", 439 | "def is_pos(row):\n", 440 | " r = row[\"RELATION\"]\n", 441 | " v = np.float32(row[\"STANDARD_VALUE\"])\n", 442 | " if r == \"<\" or r == \"<=\":\n", 443 | " return 1 if v <= 10000 else np.nan\n", 444 | " elif r == \">\" or r == \">=\":\n", 445 | " return -1 if v >= 10000 else np.nan\n", 446 | " elif r == \"=\":\n", 447 | " return 1 if v <= 10000 else -1\n", 448 | " else:\n", 449 | " return np.nan" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 89, 455 | "metadata": { 456 | "collapsed": true 457 | }, 458 | "outputs": [], 459 | "source": [ 460 | "inhibitor[\"CLF_LABEL\"] = inhibitor.apply(is_pos, axis=1)\n", 461 | "inhibitor = inhibitor[~inhibitor[\"CLF_LABEL\"].isnull()]\n", 462 | "inhibitor.loc[:, \"YEAR\"] = inhibitor.loc[:, \"YEAR\"].astype(float)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 131, 
468 | "metadata": { 469 | "collapsed": false, 470 | "scrolled": true 471 | }, 472 | "outputs": [ 473 | { 474 | "data": { 475 | "text/html": [ 476 | "
\n", 477 | "\n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 
750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | "
TARGET_CHEMBLIDPREF_NAMECMPD_CHEMBLIDCLF_LABELYEAR
0CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL1092618-1.02010.0
1CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL1092619-1.02010.0
2CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL1093582-1.02010.0
3CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL1093848-1.02010.0
4CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL2398350-1.02013.0
5CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL2398352-1.02013.0
6CHEMBL1075092Glycine receptor subunit alpha-3CHEMBL4646511.02010.0
7CHEMBL1075097Arginase-1CHEMBL10991691.02010.0
8CHEMBL1075101G-protein coupled receptor 81CHEMBL37148171.0NaN
9CHEMBL1075101G-protein coupled receptor 81CHEMBL37148791.0NaN
10CHEMBL1075101G-protein coupled receptor 81CHEMBL37148851.0NaN
11CHEMBL1075101G-protein coupled receptor 81CHEMBL37149091.0NaN
12CHEMBL1075101G-protein coupled receptor 81CHEMBL37149601.0NaN
13CHEMBL1075101G-protein coupled receptor 81CHEMBL37149701.0NaN
14CHEMBL1075101G-protein coupled receptor 81CHEMBL37150041.0NaN
15CHEMBL1075101G-protein coupled receptor 81CHEMBL37150171.0NaN
16CHEMBL1075101G-protein coupled receptor 81CHEMBL37150771.0NaN
17CHEMBL1075101G-protein coupled receptor 81CHEMBL37151551.0NaN
18CHEMBL1075101G-protein coupled receptor 81CHEMBL37151741.0NaN
19CHEMBL1075101G-protein coupled receptor 81CHEMBL37152181.0NaN
20CHEMBL1075101G-protein coupled receptor 81CHEMBL37153591.0NaN
21CHEMBL1075101G-protein coupled receptor 81CHEMBL37153611.0NaN
22CHEMBL1075101G-protein coupled receptor 81CHEMBL37153751.0NaN
23CHEMBL1075101G-protein coupled receptor 81CHEMBL37153941.0NaN
24CHEMBL1075101G-protein coupled receptor 81CHEMBL37154781.0NaN
25CHEMBL1075101G-protein coupled receptor 81CHEMBL37155361.0NaN
26CHEMBL1075101G-protein coupled receptor 81CHEMBL37155581.0NaN
27CHEMBL1075101G-protein coupled receptor 81CHEMBL37155721.0NaN
28CHEMBL1075101G-protein coupled receptor 81CHEMBL37155771.0NaN
29CHEMBL1075101G-protein coupled receptor 81CHEMBL37155991.0NaN
..................
542203CHEMBL6175Lysine-specific demethylase 4CCHEMBL37868621.02016.0
542204CHEMBL6175Lysine-specific demethylase 4CCHEMBL37869521.02016.0
542205CHEMBL6175Lysine-specific demethylase 4CCHEMBL37869631.02016.0
542206CHEMBL6175Lysine-specific demethylase 4CCHEMBL37870201.02016.0
542207CHEMBL6175Lysine-specific demethylase 4CCHEMBL37870441.02016.0
542208CHEMBL6175Lysine-specific demethylase 4CCHEMBL37871331.02016.0
542209CHEMBL6175Lysine-specific demethylase 4CCHEMBL37871931.02016.0
542210CHEMBL6175Lysine-specific demethylase 4CCHEMBL37874381.02016.0
542211CHEMBL6175Lysine-specific demethylase 4CCHEMBL37875161.02016.0
542212CHEMBL6175Lysine-specific demethylase 4CCHEMBL37875341.02016.0
542213CHEMBL6175Lysine-specific demethylase 4CCHEMBL37875481.02016.0
542214CHEMBL6175Lysine-specific demethylase 4CCHEMBL37875561.02016.0
542215CHEMBL6175Lysine-specific demethylase 4CCHEMBL37876641.02016.0
542216CHEMBL6175Lysine-specific demethylase 4CCHEMBL37876691.02016.0
542217CHEMBL6177NAD kinaseCHEMBL233434-1.02008.0
542218CHEMBL6177NAD kinaseCHEMBL538665-1.02009.0
542219CHEMBL6177NAD kinaseCHEMBL5603151.02009.0
542220CHEMBL6177NAD kinaseCHEMBL561654-1.02009.0
542221CHEMBL6177NAD kinaseCHEMBL5620561.02009.0
542222CHEMBL6186Serine/threonine-protein kinase Sgk3CHEMBL2333365-1.02013.0
542223CHEMBL6186Serine/threonine-protein kinase Sgk3CHEMBL3092460-1.02015.0
542224CHEMBL6186Serine/threonine-protein kinase Sgk3CHEMBL30924681.02015.0
542225CHEMBL6186Serine/threonine-protein kinase Sgk3CHEMBL3745885-1.02016.0
542226CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL1190585-1.02007.0
542227CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL12410281.02007.0
542228CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL1241672-1.02007.0
542229CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL1241673-1.02007.0
542230CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL1241765-1.02007.0
542231CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL1241766-1.02007.0
542232CHEMBL6195Ubiquitin carboxyl-terminal hydrolase isozyme L3CHEMBL590-1.02007.0
\n", 979 | "

542233 rows × 5 columns

\n", 980 | "
" 981 | ], 982 | "text/plain": [ 983 | " TARGET_CHEMBLID PREF_NAME \\\n", 984 | "0 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 985 | "1 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 986 | "2 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 987 | "3 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 988 | "4 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 989 | "5 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 990 | "6 CHEMBL1075092 Glycine receptor subunit alpha-3 \n", 991 | "7 CHEMBL1075097 Arginase-1 \n", 992 | "8 CHEMBL1075101 G-protein coupled receptor 81 \n", 993 | "9 CHEMBL1075101 G-protein coupled receptor 81 \n", 994 | "10 CHEMBL1075101 G-protein coupled receptor 81 \n", 995 | "11 CHEMBL1075101 G-protein coupled receptor 81 \n", 996 | "12 CHEMBL1075101 G-protein coupled receptor 81 \n", 997 | "13 CHEMBL1075101 G-protein coupled receptor 81 \n", 998 | "14 CHEMBL1075101 G-protein coupled receptor 81 \n", 999 | "15 CHEMBL1075101 G-protein coupled receptor 81 \n", 1000 | "16 CHEMBL1075101 G-protein coupled receptor 81 \n", 1001 | "17 CHEMBL1075101 G-protein coupled receptor 81 \n", 1002 | "18 CHEMBL1075101 G-protein coupled receptor 81 \n", 1003 | "19 CHEMBL1075101 G-protein coupled receptor 81 \n", 1004 | "20 CHEMBL1075101 G-protein coupled receptor 81 \n", 1005 | "21 CHEMBL1075101 G-protein coupled receptor 81 \n", 1006 | "22 CHEMBL1075101 G-protein coupled receptor 81 \n", 1007 | "23 CHEMBL1075101 G-protein coupled receptor 81 \n", 1008 | "24 CHEMBL1075101 G-protein coupled receptor 81 \n", 1009 | "25 CHEMBL1075101 G-protein coupled receptor 81 \n", 1010 | "26 CHEMBL1075101 G-protein coupled receptor 81 \n", 1011 | "27 CHEMBL1075101 G-protein coupled receptor 81 \n", 1012 | "28 CHEMBL1075101 G-protein coupled receptor 81 \n", 1013 | "29 CHEMBL1075101 G-protein coupled receptor 81 \n", 1014 | "... ... ... 
\n", 1015 | "542203 CHEMBL6175 Lysine-specific demethylase 4C \n", 1016 | "542204 CHEMBL6175 Lysine-specific demethylase 4C \n", 1017 | "542205 CHEMBL6175 Lysine-specific demethylase 4C \n", 1018 | "542206 CHEMBL6175 Lysine-specific demethylase 4C \n", 1019 | "542207 CHEMBL6175 Lysine-specific demethylase 4C \n", 1020 | "542208 CHEMBL6175 Lysine-specific demethylase 4C \n", 1021 | "542209 CHEMBL6175 Lysine-specific demethylase 4C \n", 1022 | "542210 CHEMBL6175 Lysine-specific demethylase 4C \n", 1023 | "542211 CHEMBL6175 Lysine-specific demethylase 4C \n", 1024 | "542212 CHEMBL6175 Lysine-specific demethylase 4C \n", 1025 | "542213 CHEMBL6175 Lysine-specific demethylase 4C \n", 1026 | "542214 CHEMBL6175 Lysine-specific demethylase 4C \n", 1027 | "542215 CHEMBL6175 Lysine-specific demethylase 4C \n", 1028 | "542216 CHEMBL6175 Lysine-specific demethylase 4C \n", 1029 | "542217 CHEMBL6177 NAD kinase \n", 1030 | "542218 CHEMBL6177 NAD kinase \n", 1031 | "542219 CHEMBL6177 NAD kinase \n", 1032 | "542220 CHEMBL6177 NAD kinase \n", 1033 | "542221 CHEMBL6177 NAD kinase \n", 1034 | "542222 CHEMBL6186 Serine/threonine-protein kinase Sgk3 \n", 1035 | "542223 CHEMBL6186 Serine/threonine-protein kinase Sgk3 \n", 1036 | "542224 CHEMBL6186 Serine/threonine-protein kinase Sgk3 \n", 1037 | "542225 CHEMBL6186 Serine/threonine-protein kinase Sgk3 \n", 1038 | "542226 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1039 | "542227 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1040 | "542228 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1041 | "542229 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1042 | "542230 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1043 | "542231 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1044 | "542232 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 \n", 1045 | "\n", 1046 | " CMPD_CHEMBLID CLF_LABEL YEAR \n", 1047 | "0 CHEMBL1092618 -1.0 2010.0 \n", 1048 | "1 CHEMBL1092619 -1.0 2010.0 \n", 1049 | "2 CHEMBL1093582 -1.0 2010.0 \n", 1050 | "3 CHEMBL1093848 -1.0 2010.0 \n", 1051 | "4 CHEMBL2398350 -1.0 2013.0 \n", 1052 | "5 CHEMBL2398352 -1.0 2013.0 \n", 1053 | "6 CHEMBL464651 1.0 2010.0 \n", 1054 | "7 CHEMBL1099169 1.0 2010.0 \n", 1055 | "8 CHEMBL3714817 1.0 NaN \n", 1056 | "9 CHEMBL3714879 1.0 NaN \n", 1057 | "10 CHEMBL3714885 1.0 NaN \n", 1058 | "11 CHEMBL3714909 1.0 NaN \n", 1059 | "12 CHEMBL3714960 1.0 NaN \n", 1060 | "13 CHEMBL3714970 1.0 NaN \n", 1061 | "14 CHEMBL3715004 1.0 NaN \n", 1062 | "15 CHEMBL3715017 1.0 NaN \n", 1063 | "16 CHEMBL3715077 1.0 NaN \n", 1064 | "17 CHEMBL3715155 1.0 NaN \n", 1065 | "18 CHEMBL3715174 1.0 NaN \n", 1066 | "19 CHEMBL3715218 1.0 NaN \n", 1067 | "20 CHEMBL3715359 1.0 NaN \n", 1068 | "21 CHEMBL3715361 1.0 NaN \n", 1069 | "22 CHEMBL3715375 1.0 NaN \n", 1070 | "23 CHEMBL3715394 1.0 NaN \n", 1071 | "24 CHEMBL3715478 1.0 NaN \n", 1072 | "25 CHEMBL3715536 1.0 NaN \n", 1073 | "26 CHEMBL3715558 1.0 NaN \n", 1074 | "27 CHEMBL3715572 1.0 NaN \n", 1075 | "28 CHEMBL3715577 1.0 NaN \n", 1076 | "29 CHEMBL3715599 1.0 NaN \n", 1077 | "... ... ... ... 
\n", 1078 | "542203 CHEMBL3786862 1.0 2016.0 \n", 1079 | "542204 CHEMBL3786952 1.0 2016.0 \n", 1080 | "542205 CHEMBL3786963 1.0 2016.0 \n", 1081 | "542206 CHEMBL3787020 1.0 2016.0 \n", 1082 | "542207 CHEMBL3787044 1.0 2016.0 \n", 1083 | "542208 CHEMBL3787133 1.0 2016.0 \n", 1084 | "542209 CHEMBL3787193 1.0 2016.0 \n", 1085 | "542210 CHEMBL3787438 1.0 2016.0 \n", 1086 | "542211 CHEMBL3787516 1.0 2016.0 \n", 1087 | "542212 CHEMBL3787534 1.0 2016.0 \n", 1088 | "542213 CHEMBL3787548 1.0 2016.0 \n", 1089 | "542214 CHEMBL3787556 1.0 2016.0 \n", 1090 | "542215 CHEMBL3787664 1.0 2016.0 \n", 1091 | "542216 CHEMBL3787669 1.0 2016.0 \n", 1092 | "542217 CHEMBL233434 -1.0 2008.0 \n", 1093 | "542218 CHEMBL538665 -1.0 2009.0 \n", 1094 | "542219 CHEMBL560315 1.0 2009.0 \n", 1095 | "542220 CHEMBL561654 -1.0 2009.0 \n", 1096 | "542221 CHEMBL562056 1.0 2009.0 \n", 1097 | "542222 CHEMBL2333365 -1.0 2013.0 \n", 1098 | "542223 CHEMBL3092460 -1.0 2015.0 \n", 1099 | "542224 CHEMBL3092468 1.0 2015.0 \n", 1100 | "542225 CHEMBL3745885 -1.0 2016.0 \n", 1101 | "542226 CHEMBL1190585 -1.0 2007.0 \n", 1102 | "542227 CHEMBL1241028 1.0 2007.0 \n", 1103 | "542228 CHEMBL1241672 -1.0 2007.0 \n", 1104 | "542229 CHEMBL1241673 -1.0 2007.0 \n", 1105 | "542230 CHEMBL1241765 -1.0 2007.0 \n", 1106 | "542231 CHEMBL1241766 -1.0 2007.0 \n", 1107 | "542232 CHEMBL590 -1.0 2007.0 \n", 1108 | "\n", 1109 | "[542233 rows x 5 columns]" 1110 | ] 1111 | }, 1112 | "execution_count": 131, 1113 | "metadata": {}, 1114 | "output_type": "execute_result" 1115 | } 1116 | ], 1117 | "source": [ 1118 | "# group\n", 1119 | "grouped = inhibitor.groupby(by=[\"TARGET_CHEMBLID\", \"PREF_NAME\", \"CMPD_CHEMBLID\"], as_index=False)\n", 1120 | "# judge one molecule's label by the average label\n", 1121 | "clf_label = grouped[[\"CLF_LABEL\", \"YEAR\"]].mean()\n", 1122 | "clf_label" 1123 | ] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "execution_count": 105, 1128 | "metadata": { 1129 | "collapsed": true 1130 | }, 1131 | "outputs": [], 1132 | "source": [ 1133 | "clf_label.to_csv(txt_dir + \"/inhibitor_clf_label.csv\")" 1134 | ] 1135 | }, 1136 | { 1137 | "cell_type": "code", 1138 | "execution_count": null, 1139 | "metadata": { 1140 | "collapsed": true 1141 | }, 1142 | "outputs": [], 1143 | "source": [] 1144 | }, 1145 | { 1146 | "cell_type": "code", 1147 | "execution_count": 144, 1148 | "metadata": { 1149 | "collapsed": true 1150 | }, 1151 | "outputs": [], 1152 | "source": [ 1153 | "cancer_approved_target = [\"CHEMBL279\", \"CHEMBL203\", \"CHEMBL333\", \"CHEMBL325\", \"CHEMBL267\", \"CHEMBL2842\"]\n", 1154 | "cancer_clinical_target = [\"CHEMBL340\", \"CHEMBL4005\", \"CHEMBL332\"]" 1155 | ] 1156 | }, 1157 | { 1158 | "cell_type": "code", 1159 | "execution_count": 158, 1160 | "metadata": { 1161 | "collapsed": false 1162 | }, 1163 | "outputs": [], 1164 | "source": [ 1165 | "for target in cancer_approved_target + cancer_clinical_target:\n", 1166 | " df = clf_label[clf_label[\"TARGET_CHEMBLID\"] == target]\n", 1167 | " df.to_csv(txt_dir + \"/%s_clf_label.csv\" % target, index=False)" 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "code", 1172 | "execution_count": null, 1173 | "metadata": { 1174 | "collapsed": true 1175 | }, 1176 | "outputs": [], 1177 | "source": [] 1178 | } 1179 | ], 1180 | "metadata": { 1181 | "kernelspec": { 1182 | "display_name": "Python 2", 1183 | "language": "python", 1184 | "name": "python2" 1185 | }, 1186 | "language_info": { 1187 | "codemirror_mode": { 1188 | "name": "ipython", 1189 | "version": 2 1190 | }, 1191 | "file_extension": ".py", 
1192 | "mimetype": "text/x-python", 1193 | "name": "python", 1194 | "nbconvert_exporter": "python", 1195 | "pygments_lexer": "ipython2", 1196 | "version": "2.7.6" 1197 | } 1198 | }, 1199 | "nbformat": 4, 1200 | "nbformat_minor": 1 1201 | } 1202 | --------------------------------------------------------------------------------