├── LICENSE
├── README.md
├── scripts
│   └── create_dataset.py
└── trainer
    ├── device.py
    ├── input_data.py
    ├── main.py
    ├── model_config.json
    ├── ops.py
    ├── resnet.py
    └── util.py

/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) 2019, Haicang Zhang
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 | 
3 | This is the code for [this](https://youtu.be/cw6_OP5An8s) video on YouTube by Siraj Raval about DeepMind's AlphaFold. It is a re-implementation of [Sheng Wang and Jinbo Xu's deep learning model](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005324) for protein contact prediction, a breakthrough in protein structure prediction. Unfortunately, the authors did not open-source their code or models, so everything here is reconstructed from the methods described in the paper. As the paper reports, a deep learning model can significantly improve the accuracy of contact prediction.
4 | 
5 | # Coding Challenge - Due Date, Jan 29, 2019 at 12 PM PST - Replicate AlphaFold as best you can. Post your GitHub links in the comment section of the video! I'll give a shoutout to the top 2 entries
6 | 
7 | Wizards, see page 11 of this [paper](http://predictioncenter.org/casp13/doc/CASP13_Abstracts.pdf) and DeepMind's [blog post](https://deepmind.com/blog/alphafold/) for details on the AlphaFold algorithm. Two residual networks are used, and multiple methods are attempted.
8 | 
9 | This code is closely related to what AlphaFold does; a short sketch of the contact-map target it trains on follows below.
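To make the training target concrete: in `scripts/create_dataset.py` a residue pair is labelled as a contact when its Cβ–Cβ distance is below 8 Å, and the lower triangle plus a short band around the diagonal is set to -1 so the loss in `trainer/resnet.py` ignores those pairs. The snippet below is a minimal sketch of that labelling step, not code from this repo — the helper name and the random toy distance matrix are invented for illustration; the real pipeline reads distances from the `.2dfeat` / `.distcb` files:

```python
import numpy as np

def make_contact_labels(cb_dist, threshold=8.0, tril_offset=5):
    """Turn an L x L C-beta distance matrix into contact labels.

    1  = contact (distance < threshold)
    0  = no contact
    -1 = masked pair (lower triangle and a band near the diagonal),
    mirroring the np.where / np.tril_indices logic in create_dataset.py.
    """
    y = (cb_dist < threshold).astype(np.int16)
    y[np.tril_indices(y.shape[0], tril_offset)] = -1
    return y

if __name__ == '__main__':
    L = 12
    d = np.random.uniform(2.0, 20.0, size=(L, L))
    d = (d + d.T) / 2.0  # a distance matrix is symmetric
    print(make_contact_labels(d))
```

The network in `trainer/resnet.py` then outputs one logit per residue pair and averages the logit map with its transpose, so the predicted contact map is symmetric by construction.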
Here are others that you can build off of 10 | 11 | - https://github.com/carlosmartinezvillar/3DCNNFolds 12 | - https://github.com/igemsoftware2017/AiGEM_TeamHeidelberg2017/blob/master/DeeProtein/DeeProtein_README.md 13 | - https://github.com/pfnet-research/BMI219-2017-ProteinFolding 14 | - https://github.com/5bingstar/Deep-learning-for-contact_map_v2 15 | - https://github.com/Illumina/PrimateAI 16 | 17 | Do the best you can! I'm looking for well documented code. Good luck! 18 | 19 | -------------------------------------------------------------------------------- /scripts/create_dataset.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | import pandas as pd 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | 8 | PADDING_FULL_LEN = 500 9 | SAMPLES_EACH_CHUNK = 512 10 | 11 | def _int64_list_feature(value): 12 | return tf.train.Feature(int64_list=tf.train.Int64List(value=np.reshape(value,-1))) 13 | 14 | def _float_list_feature(value): 15 | return tf.train.Feature(float_list=tf.train.FloatList(value=np.reshape(value,-1))) 16 | 17 | def _bytes_feature(value): 18 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 19 | 20 | def _int64_feature(value): 21 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 22 | 23 | def create_supervised(name_list, oned_feature_dir,\ 24 | twod_feature_dir, output_prefix): 25 | padding_full_num = PADDING_FULL_LEN * PADDING_FULL_LEN 26 | names = pd.read_csv(name_list, names=['name'], header=None) 27 | names = list(names['name']) 28 | random.shuffle(names) 29 | SAMPLES_EACH_CHUNK = 1024 30 | chunk_num = (len(names) - 1) / SAMPLES_EACH_CHUNK + 1 31 | total_example_num = 0 32 | total_pos_num = 0 33 | total_neg_num = 0 34 | for i in range(chunk_num): 35 | with tf.python_io.TFRecordWriter( 36 | '%s_%s.tfrecord' % (output_prefix, i)) as record_writer: 37 | start = i * SAMPLES_EACH_CHUNK 38 | end = min(len(names), (i+1) * SAMPLES_EACH_CHUNK) 39 | X1d = [] 40 | X2d = [] 41 | Y = [] 42 | for name in names[start:end]: 43 | fea1 = pd.read_csv('{}/{}.1dfeat'.format(oned_feature_dir, name), 44 | header=None, sep='\s+') 45 | L = fea1.shape[0] 46 | if L > PADDING_FULL_LEN: 47 | continue 48 | 49 | x1d = fea1.iloc[:,range(23,43) + range(63,69)].values 50 | fea2 = pd.read_csv('{}/{}.2dfeat'.format(twod_feature_dir, name), 51 | header=None, sep='\s+') 52 | assert L == int(np.sqrt(fea2.shape[0])) 53 | data = fea2.iloc[:,[2,4,5,7]].values 54 | data = data.reshape((L, L, -1)) 55 | x2d = data[:L,:L,1:] 56 | 57 | y = data[:L,:L,0].reshape((L,L)) 58 | #np.fill_diagonal(y, -1.0) 59 | y = np.where(y > 0.0, (y < 8.0).astype(np.int16), y).astype(np.int16) 60 | y[np.tril_indices(y.shape[0], 5)] = -1 61 | neg_num = np.sum(y==0) 62 | pos_num = np.sum(y==1) 63 | if pos_num < 100: 64 | continue 65 | total_pos_num += pos_num 66 | total_neg_num += neg_num 67 | total_example_num += 1 68 | 69 | example = tf.train.Example(features = tf.train.Features( 70 | feature={ 71 | 'size': _int64_feature(L), 72 | 'x1d' : _bytes_feature(x1d.astype(np.float32).tobytes()), 73 | 'x2d' : _bytes_feature(x2d.astype(np.float32).tobytes()), 74 | 'y' : _bytes_feature(y.astype(np.int16).tobytes()) 75 | })) 76 | record_writer.write(example.SerializeToString()) 77 | print name 78 | 79 | print total_example_num, total_pos_num + total_neg_num, total_pos_num, total_neg_num 80 | 81 | def create_semi(name_list, oned_feature_dir,\ 82 | twod_feature_dir, output_prefix, labeled=True): 83 | 
padding_full_num = PADDING_FULL_LEN * PADDING_FULL_LEN 84 | names = pd.read_csv(name_list, names=['name'], header=None) 85 | names = list(names['name']) 86 | random.shuffle(names) 87 | SAMPLES_EACH_CHUNK = 1024 88 | chunk_num = (len(names) - 1) / SAMPLES_EACH_CHUNK + 1 89 | for i in range(chunk_num): 90 | with tf.python_io.TFRecordWriter( 91 | '%s_%s.tfrecord' % (output_prefix, i)) as record_writer: 92 | start = i * SAMPLES_EACH_CHUNK 93 | end = min(len(names), (i+1) * SAMPLES_EACH_CHUNK) 94 | X1d = [] 95 | X2d = [] 96 | Y = [] 97 | for name in names[start:end]: 98 | fea1 = pd.read_csv('{}/{}.1dfeat'.format(oned_feature_dir, name), 99 | header=None, sep='\s+') 100 | L = fea1.shape[0] 101 | if L > PADDING_FULL_LEN: 102 | continue 103 | 104 | x1d = fea1.iloc[:,range(23,43) + range(63,69)].values 105 | fea2 = pd.read_csv('{}/{}.2dfeat'.format(twod_feature_dir, name), 106 | header=None, sep='\s+') 107 | assert L == int(np.sqrt(fea2.shape[0])) 108 | data = fea2.iloc[:,[2,4,5,7]].values 109 | data = data.reshape((L, L, -1)) 110 | x2d = data[:L,:L,1:] 111 | feature={ 112 | 'size': _int64_feature(L), 113 | 'x1d' : _bytes_feature(x1d.astype(np.float32).tobytes()), 114 | 'x2d' : _bytes_feature(x2d.astype(np.float32).tobytes()), 115 | } 116 | if labeled: 117 | y = data[:L,:L,0].reshape((L,L)) 118 | y = np.where(y > 0.0, (y < 8.0).astype(np.int16), y).astype(np.int16) 119 | y[np.tril_indices(y.shape[0], 5)] = -1 120 | y_ = np.zeros((L,L,2)) 121 | for i in range(L): 122 | for j in range(L): 123 | if y[i,j] < 0: 124 | y_[i,j,:] = -1 125 | else: 126 | y_[i,j,y[i,j]] = 1 127 | feature.update({'y':_bytes_feature(y_.astype(np.int16).tobytes())}) 128 | example = tf.train.Example(features = tf.train.Features(feature=feature)) 129 | record_writer.write(example.SerializeToString()) 130 | 131 | def create_from_ccmpred(name_list,\ 132 | profile_dir, structure_dir,\ 133 | ccmpred_dir, distcb_dir,\ 134 | output_prefix, chunk_size): 135 | PADDING_FULL_LEN = 250 136 | padding_full_num = PADDING_FULL_LEN * PADDING_FULL_LEN 137 | names = pd.read_csv(name_list, names=['name'], header=None) 138 | names = list(names['name']) 139 | random.shuffle(names) 140 | SAMPLES_EACH_CHUNK = chunk_size 141 | chunk_num = (len(names) - 1) / SAMPLES_EACH_CHUNK + 1 142 | #chunk_num = len(names) / SAMPLES_EACH_CHUNK 143 | total_example_num = 0 144 | total_pos_num = 0 145 | total_neg_num = 0 146 | pbar = tqdm(total=len(names)) 147 | for i in range(chunk_num): 148 | with tf.python_io.TFRecordWriter( 149 | '%s_%s.tfrecord' % (output_prefix, i)) as record_writer: 150 | start = i * SAMPLES_EACH_CHUNK 151 | end = min(len(names), (i+1) * SAMPLES_EACH_CHUNK) 152 | #end = (i+1) * SAMPLES_EACH_CHUNK 153 | if i + 1 == chunk_num: 154 | end = len(names) 155 | 156 | X1d = [] 157 | X2d = [] 158 | Y = [] 159 | for name in names[start:end]: 160 | pbar.update(1) 161 | profile = np.loadtxt('{}/{}.profile'.format(profile_dir, name)) 162 | L = profile.shape[0] 163 | if L > PADDING_FULL_LEN: 164 | continue 165 | 166 | structure = pd.read_csv('{}/{}.structure'.format(structure_dir, name)) 167 | structure = structure.iloc[:, range(3,6) + range(16,19) + range(20,21)].values 168 | 169 | ccmpred = np.loadtxt('{}/{}.cc'.format(ccmpred_dir, name)) 170 | assert L == structure.shape[0] 171 | assert L == ccmpred.shape[0] 172 | x1d = np.concatenate([profile, structure], axis=1) 173 | x2d = ccmpred[:,:,np.newaxis] 174 | #np.fill_diagonal(y, -1.0) 175 | if distcb_dir is not None: 176 | y = np.loadtxt('{}/{}.distcb'.format(distcb_dir, name)) 177 | assert L == y.shape[0] 178 
| y = np.where(y > 0.0, (y < 8.0).astype(np.int16), y).astype(np.int16) 179 | y[np.tril_indices(y.shape[0], 4)] = -1 180 | #np.fill_diagonal(y, -1) 181 | neg_num = np.sum(y==0) 182 | pos_num = np.sum(y==1) 183 | if pos_num < 100: 184 | continue 185 | total_pos_num += pos_num 186 | total_neg_num += neg_num 187 | 188 | example = tf.train.Example(features = tf.train.Features( 189 | feature={ 190 | 'size': _int64_feature(L), 191 | 'x1d' : _bytes_feature(x1d.astype(np.float32).tobytes()), 192 | 'x2d' : _bytes_feature(x2d.astype(np.float32).tobytes()), 193 | 'y' : _bytes_feature(y.astype(np.int16).tobytes()), 194 | 'name': _bytes_feature(name) 195 | })) 196 | else: 197 | example = tf.train.Example(features = tf.train.Features( 198 | feature={ 199 | 'size': _int64_feature(L), 200 | 'x1d' : _bytes_feature(x1d.astype(np.float32).tobytes()), 201 | 'x2d' : _bytes_feature(x2d.astype(np.float32).tobytes()), 202 | 'name': _bytes_feature(name) 203 | })) 204 | record_writer.write(example.SerializeToString()) 205 | total_example_num += 1 206 | pbar.close() 207 | if distcb_dir is not None: 208 | print total_example_num, total_pos_num + total_neg_num,\ 209 | total_pos_num, total_neg_num 210 | else: 211 | print total_example_num 212 | 213 | 214 | def main(): 215 | parser = argparse.ArgumentParser() 216 | parser.add_argument('--op', 217 | choices=['create_from_ccmpred', 'create_semi', 'create_supervised'], required=True) 218 | parser.add_argument('--name_list', type=str) 219 | parser.add_argument('--oned_feature_dir', type=str) 220 | parser.add_argument('--twod_feature_dir', type=str) 221 | parser.add_argument('--output_prefix', type=str) 222 | parser.add_argument('--labeled', action='store_true') 223 | parser.add_argument('--profile_dir', type=str) 224 | parser.add_argument('--structure_dir', type=str) 225 | parser.add_argument('--ccmpred_dir', type=str) 226 | parser.add_argument('--distcb_dir', type=str) 227 | parser.add_argument('--chunk_size', type=int, default=512) 228 | 229 | args = parser.parse_args() 230 | 231 | if args.op == 'create_semi': 232 | create_semi(args.name_list, args.oned_feature_dir,\ 233 | args.twod_feature_dir, args.output_prefix, args.labeled) 234 | if args.op == 'create_supervised': 235 | create_supervised(args.name_list, args.oned_feature_dir,\ 236 | args.twod_feature_dir, args.output_prefix) 237 | if args.op == 'create_from_ccmpred': 238 | create_from_ccmpred(args.name_list,\ 239 | args.profile_dir, args.structure_dir,\ 240 | args.ccmpred_dir, args.distcb_dir,\ 241 | args.output_prefix, args.chunk_size) 242 | if __name__ == '__main__': 243 | main() 244 | -------------------------------------------------------------------------------- /trainer/device.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.client import device_lib 2 | 3 | def get_available_cpus(): 4 | local_device_protos = device_lib.list_local_devices() 5 | return [x.name for x in local_device_protos if x.device_type == 'CPU'] 6 | 7 | def get_available_gpus(): 8 | local_device_protos = device_lib.list_local_devices() 9 | return [x.name for x in local_device_protos if x.device_type == 'GPU'] 10 | 11 | print get_available_gpus() 12 | print get_available_cpus() 13 | -------------------------------------------------------------------------------- /trainer/input_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import random 3 | import numpy as np 4 | import gc 5 | from util import RunMode 6 | 7 | 
class MemDataset:
8 |     pass
9 | 
10 | class TfRecordDataset:
11 |     def __init__(self, train_file_prefix='', chunk_num=0, val_size=1,\
12 |                  test_file_prefix='', unlabel_file_prefix='', unlabel_chunk_num=0):
13 |         self.chunk_num = chunk_num
14 |         self.train_file_prefix = train_file_prefix
15 |         self.test_file_prefix = test_file_prefix
16 |         self.unlabel_file_prefix = unlabel_file_prefix
17 |         self.unlabel_chunk_num = unlabel_chunk_num
18 | 
19 |         idx = range(chunk_num)
20 |         #random.shuffle(idx)
21 |         if type(val_size) == float:
22 |             train_chunk_num = int(np.ceil(chunk_num * (1.0 - val_size)))  # cast: np.ceil returns a float, not a valid slice index
23 |         else:
24 |             train_chunk_num = chunk_num - val_size
25 | 
26 |         self.train_chunks = idx[:train_chunk_num]
27 |         self.val_chunks = idx[train_chunk_num:]
28 | 
29 |     def get_chunks(self, mode):
30 |         input_file_prefix = self.train_file_prefix
31 |         if mode == RunMode.TRAIN:
32 |             chunks = self.train_chunks
33 |             random.shuffle(chunks)
34 |         elif mode == RunMode.VALIDATE:
35 |             chunks = self.val_chunks
36 |         elif mode == RunMode.TEST:
37 |             input_file_prefix = self.test_file_prefix
38 |             chunks = [0]
39 |         elif mode == RunMode.UNLABEL:  # was `model`, an undefined name
40 |             input_file_prefix = self.unlabel_file_prefix
41 |             chunks = range(self.unlabel_chunk_num)
42 |             random.shuffle(chunks)
43 |         return ['{}_{:d}.tfrecord'.format(input_file_prefix, c)\
44 |                 for c in chunks]
45 | 
--------------------------------------------------------------------------------
/trainer/main.py:
--------------------------------------------------------------------------------
1 | import resnet
2 | import input_data
3 | import argparse
4 | import tensorflow as tf
5 | import numpy as np
6 | import logging
7 | import sys
8 | import json
9 | from tensorflow.python.lib.io import file_io
10 | import os
11 | 
12 | def parse_model_config(json_file):
13 |     # the built-in 'open' cannot read Google Cloud Storage paths, so use file_io
14 |     with file_io.FileIO(json_file, 'r') as f:
15 |         config = json.load(f)
16 |     return config
17 | 
18 | def main():
19 |     parser = argparse.ArgumentParser()
20 |     parser.add_argument('--alg', type=str, choices=['gan', 'resn'], default='resn')
21 |     parser.add_argument('--epoch', type=int, default=30)
22 |     parser.add_argument('--batch_size', type=int, default=2)
23 |     parser.add_argument('--learn_rate', type=float, default=0.0002)
24 |     parser.add_argument('--beta1', type=float, default=0.9)
25 |     parser.add_argument('--l2_reg', type=float, default=0.0001)
26 |     parser.add_argument('--down_weight', type=float, default=1.0)
27 |     parser.add_argument('--keep_prob', type=float, default=0.8)
28 |     parser.add_argument('--op_alg', type=str, choices=['sgd', 'adam'], default='adam')
29 |     parser.add_argument('--train_file_prefix', type=str)
30 |     parser.add_argument('--unlabel_file_prefix', type=str)
31 |     parser.add_argument('--test_file_prefix', type=str, default=None)
32 |     parser.add_argument('--chunk_num', type=int, default=4)
33 |     parser.add_argument('--unlabel_chunk_num', type=int)
34 |     parser.add_argument('--gan_gen_iter', type=int, default=5)
35 |     parser.add_argument('--model_config', type=str, required=True)
36 |     parser.add_argument('--job_dir', type=str)
37 |     parser.add_argument('--log_file', type=str)
38 |     parser.add_argument('--summary_dir', type=str)
39 |     parser.add_argument('--model_dir', type=str)
40 |     parser.add_argument('--output_dir', type=str)
41 |     parser.add_argument('--model_path', type=str)
42 |     parser.add_argument('--mode', type=str, choices=['test', 'train'], default='train')
43 | 
44 |     args = parser.parse_args()
45 |     if args.job_dir is not None:
46 |         os.makedirs(args.job_dir)
47 |         if 
args.summary_dir is None: 48 | args.summary_dir = '{}/summary'.format(args.job_dir) 49 | os.makedirs(args.summary_dir) 50 | if args.model_dir is None: 51 | args.model_dir = '{}/model'.format(args.job_dir) 52 | os.makedirs(args.model_dir) 53 | if args.log_file is None: 54 | args.log_file = '{}/run.log'.format(args.job_dir) 55 | logger = logging.getLogger() 56 | logger.setLevel(logging.DEBUG) 57 | formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') 58 | log_file_stream=file_io.FileIO(args.log_file,'a') 59 | fh = logging.StreamHandler(log_file_stream) 60 | fh.setFormatter(formatter) 61 | logger.addHandler(fh) 62 | ch = logging.StreamHandler(sys.stdout) 63 | ch.setFormatter(formatter) 64 | logger.addHandler(ch) 65 | 66 | model_config = parse_model_config(args.model_config) 67 | logging.info('train_config: {:s}'.format(args)) 68 | logging.info('model_config: {:s}'.format(json.dumps(model_config))) 69 | 70 | if args.alg == 'resn': 71 | with tf.Session() as sess: 72 | if args.mode == 'train': 73 | dataset = input_data.TfRecordDataset( 74 | args.train_file_prefix, args.chunk_num, val_size = 1, 75 | test_file_prefix = args.test_file_prefix) 76 | resn_ = resnet.Resnet(sess, dataset, train_config=args, model_config=model_config) 77 | resn_.train() 78 | elif args.mode == 'test': 79 | dataset = input_data.TfRecordDataset(test_file_prefix = args.test_file_prefix) 80 | resn_ = resnet.Resnet(sess, dataset, train_config=args, model_config=model_config) 81 | resn_.predict(args.output_dir, args.model_path) 82 | 83 | if __name__ == '__main__': 84 | main() 85 | -------------------------------------------------------------------------------- /trainer/model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "1d":{ 3 | "filters":32, 4 | "kernel_size":7, 5 | "block_num":8, 6 | "channel_dim":32 7 | }, 8 | "2d":{ 9 | "filters":16, 10 | "kernel_size":5, 11 | "block_num":2, 12 | "channel_dim":3 13 | }, 14 | "2d_label_size":400, 15 | "1d_label_size":20 16 | } 17 | -------------------------------------------------------------------------------- /trainer/ops.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | from tensorflow.python.framework import ops 6 | 7 | def linear(input_, output_size, scope=None, stddev=0.02, bias_start=0.0, with_w=False): 8 | shape = input_.get_shape().as_list() 9 | 10 | with tf.variable_scope(scope or "Linear"): 11 | matrix = tf.get_variable("Matrix", [shape[1], output_size], tf.float32, 12 | tf.random_normal_initializer(stddev=stddev)) 13 | bias = tf.get_variable("bias", [output_size], 14 | initializer=tf.constant_initializer(bias_start)) 15 | if with_w: 16 | return tf.matmul(input_, matrix) + bias, matrix, bias 17 | else: 18 | return tf.matmul(input_, matrix) + bias 19 | 20 | def deconv2d(input_, output_shape, 21 | k_h=5, k_w=5, d_h=2, d_w=2, stddev=0.02, 22 | name="deconv2d", with_w=False): 23 | with tf.variable_scope(name): 24 | # filter : [height, width, output_channels, in_channels] 25 | w = tf.get_variable('w', [k_h, k_w, output_shape[-1], input_.get_shape()[-1]], 26 | initializer=tf.random_normal_initializer(stddev=stddev)) 27 | 28 | deconv = tf.nn.conv2d_transpose(input_, w, output_shape=output_shape, 29 | strides=[1, d_h, d_w, 1]) 30 | 31 | biases = tf.get_variable('biases', [output_shape[-1]], initializer=tf.constant_initializer(0.0)) 32 | deconv = 
tf.reshape(tf.nn.bias_add(deconv, biases), deconv.get_shape()) 33 | 34 | if with_w: 35 | return deconv, w, biases 36 | else: 37 | return deconv 38 | -------------------------------------------------------------------------------- /trainer/resnet.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import tensorflow as tf 4 | import util 5 | from util import RunMode 6 | import logging 7 | 8 | #PADDING_FULL_LEN = 500 9 | PADDING_FULL_LEN = 250 10 | 11 | class Resnet: 12 | def __init__(self, sess, dataset, train_config, model_config): 13 | self.sess = sess 14 | self.dataset = dataset 15 | self.train_config = train_config 16 | self.model_config = model_config 17 | 18 | self.input_tfrecord_files = tf.placeholder(tf.string, shape=[None]) 19 | self.keep_prob = tf.placeholder(tf.float32) 20 | self.training = tf.placeholder(tf.bool) 21 | 22 | self.x1d_channel_dim = model_config['1d']['channel_dim'] 23 | self.x2d_channel_dim = model_config['2d']['channel_dim'] 24 | 25 | def cnn_with_2dfeature(self, x2d, reuse=False): 26 | with tf.variable_scope('discriminator', reuse=reuse) as scope: 27 | block_num = 8 28 | filters = 16 29 | kernel_size = [4, 4] 30 | act = tf.nn.relu 31 | #kernel_initializer = tf.truncated_normal_initializer(stddev=0.01) 32 | kernel_initializer = tf.glorot_normal_initializer() 33 | #kernel_initializer = None 34 | bias_initializer = tf.zeros_initializer() 35 | #kernel_regularizer = tf.contrib.layers.l2_regularizer(scale=0.001) 36 | kernel_regularizer = None 37 | bias_regularizer = None 38 | 39 | for i in np.arange(block_num): 40 | inputs = x2d if i == 0 else conv_ 41 | conv_ = tf.layers.conv2d(inputs=inputs, filters=filters, 42 | kernel_size=kernel_size, strides=(1,1), padding='same', activation=act, 43 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 44 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer) 45 | 46 | logits = tf.layers.conv2d(inputs=conv_, filters=1, 47 | kernel_size=kernel_size, strides=(1,1), padding='same', 48 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 49 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer) 50 | 51 | logits = tf.reshape(logits, (-1, tf.shape(logits)[1], tf.shape(logits)[2])) 52 | return tf.sigmoid(logits), logits 53 | 54 | def resn_with_2dfeature(self, x2d, reuse=False): 55 | with tf.variable_scope('discriminator', reuse=reuse) as scope: 56 | block_num = 8 57 | filters = 32 58 | kernel_size = [4, 4] 59 | act = tf.nn.relu 60 | #kernel_initializer = tf.truncated_normal_initializer(stddev=0.01) 61 | kernel_initializer = tf.glorot_normal_initializer() 62 | #kernel_initializer = None 63 | bias_initializer = tf.zeros_initializer() 64 | #kernel_regularizer = tf.contrib.layers.l2_regularizer(scale=0.001) 65 | kernel_regularizer = None 66 | bias_regularizer = None 67 | 68 | prev = tf.layers.conv2d(inputs=x2d, filters=filters, 69 | kernel_size=kernel_size, strides=(1,1), padding='same', 70 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 71 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer) 72 | for i in np.arange(block_num): 73 | conv_ = act(prev) 74 | conv_ = tf.layers.conv2d(inputs=conv_, filters=filters, 75 | kernel_size=kernel_size, strides=(1,1), padding='same', 76 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 77 | kernel_regularizer=kernel_regularizer, 
bias_regularizer=bias_regularizer) 78 | conv_ = act(conv_) 79 | conv_ = tf.layers.conv2d(inputs=conv_, filters=filters, 80 | kernel_size=kernel_size, strides=(1,1), padding='same', 81 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 82 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer) 83 | prev = tf.add(conv_, prev) 84 | 85 | logits = tf.layers.conv2d(inputs=prev, filters=1, 86 | kernel_size=kernel_size, strides=(1,1), padding='same', 87 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 88 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer) 89 | 90 | logits = tf.reshape(logits, (-1, tf.shape(logits)[1], tf.shape(logits)[2])) 91 | return tf.sigmoid(logits), logits 92 | 93 | def resn(self, x1d, x2d, reuse=False): 94 | with tf.variable_scope('discriminator', reuse=reuse) as scope: 95 | act = tf.nn.relu 96 | 97 | filters_1d = self.model_config['1d']['filters'] 98 | kernel_size_1d = self.model_config['1d']['kernel_size'] 99 | block_num_1d = self.model_config['1d']['block_num'] 100 | 101 | filters_2d = self.model_config['2d']['filters'] 102 | kernel_size_2d = self.model_config['2d']['kernel_size'] 103 | block_num_2d = self.model_config['2d']['block_num'] 104 | 105 | #kernel_initializer = tf.glorot_normal_initializer() 106 | kernel_initializer = tf.variance_scaling_initializer() 107 | bias_initializer = tf.zeros_initializer() 108 | if self.train_config.l2_reg <= 0.0: 109 | kernel_regularizer = None 110 | else: 111 | kernel_regularizer = tf.contrib.layers.l2_regularizer(scale=self.train_config.l2_reg) 112 | bias_regularizer = None 113 | 114 | prev_1d = tf.layers.conv1d(inputs=x1d, filters=filters_1d, 115 | kernel_size=kernel_size_1d, strides=1, padding='same', 116 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 117 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=False) 118 | for i in np.arange(block_num_1d): 119 | conv_1d = act(prev_1d) 120 | conv_1d = tf.layers.conv1d(inputs=conv_1d, filters=filters_1d, 121 | kernel_size=kernel_size_1d, strides=1, padding='same', 122 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 123 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=False) 124 | conv_1d = act(conv_1d) 125 | conv_1d = tf.layers.conv1d(inputs=conv_1d, filters=filters_1d, 126 | kernel_size=kernel_size_1d, strides=1, padding='same', 127 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 128 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=False) 129 | 130 | prev_1d = tf.add(conv_1d, prev_1d) 131 | 132 | out_1d = tf.expand_dims(prev_1d, axis=3) 133 | ones = tf.ones((1, PADDING_FULL_LEN)) 134 | #left_1d = tf.tensordot(out_1d, ones, [[3], [0]]) 135 | left_1d = tf.einsum('abcd,de->abce', out_1d, ones) 136 | left_1d = tf.transpose(left_1d, perm=[0,1,3,2]) 137 | right_1d = tf.transpose(left_1d, perm=[0,2,1,3]) 138 | print '1d shape', left_1d.shape, right_1d.shape 139 | 140 | input_2d = tf.concat([x2d, left_1d, right_1d], axis=3) 141 | print '2d shape', input_2d.shape 142 | 143 | prev_2d = tf.layers.conv2d(inputs=input_2d, filters=filters_2d, 144 | kernel_size=kernel_size_2d, strides=(1,1), padding='same', 145 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 146 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=False) 147 | for i in np.arange(block_num_2d): 
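            # The loop below is the 2-D residual tower: each iteration is one
            # pre-activation block (ReLU -> conv2d -> ReLU -> conv2d) whose output
            # is added back onto `prev_2d` as an identity shortcut.  Filter count,
            # kernel size and block count come from the "2d" section of
            # model_config.json; strides of (1,1) with 'same' padding keep the
            # full L x L pairwise resolution.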
148 | conv_2d = act(prev_2d) 149 | conv_2d = tf.layers.conv2d(inputs=conv_2d, filters=filters_2d, 150 | kernel_size=kernel_size_2d, strides=(1,1), padding='same', 151 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 152 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=False) 153 | conv_2d = act(conv_2d) 154 | conv_2d = tf.layers.conv2d(inputs=conv_2d, filters=filters_2d, 155 | kernel_size=kernel_size_2d, strides=(1,1), padding='same', 156 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 157 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=False) 158 | prev_2d = tf.add(conv_2d, prev_2d) 159 | 160 | logits = tf.layers.conv2d(inputs=prev_2d, filters=1, 161 | kernel_size=kernel_size_2d, strides=(1,1), padding='same', 162 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 163 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=True) 164 | 165 | #logits = tf.reshape(logits, (-1, tf.shape(logits)[1], tf.shape(logits)[2])) 166 | logits = tf.squeeze(logits, 3) 167 | logits_tran = tf.transpose(logits, perm=[0, 2, 1]) 168 | logits = (logits + logits_tran) / 2.0 169 | return tf.sigmoid(logits), logits 170 | 171 | def evaluate(self, mode): 172 | self.sess.run(self.iterator.initializer,\ 173 | feed_dict={self.input_tfrecord_files:self.dataset.get_chunks(mode)}) 174 | acc = [] 175 | while True: 176 | try: 177 | pred, y, size = self.sess.run([self.pred, self.y, self.size]) 178 | for y_, pred_, size_ in zip(y, pred, size): 179 | #pred_ = (pred_ + np.transpose(pred_)) / 2.0 180 | acc_ = util.TopAccuracy(pred_[:size_, :size_], y_[:size_, :size_]) 181 | acc.append(acc_) 182 | except tf.errors.OutOfRangeError: 183 | break 184 | acc = np.array(acc) 185 | acc = np.mean(acc, axis=0) 186 | acc_str = ' '.join(['%.4f ' % acc_ for acc_ in acc]) 187 | logging.info('{:s} acc: {:s}'.format(mode, acc_str)) 188 | return 189 | 190 | def build_input(self): 191 | with tf.device('/cpu:0'): 192 | def parser(record): 193 | keys_to_features = { 194 | 'x1d' :tf.FixedLenFeature([], tf.string), 195 | 'x2d' :tf.FixedLenFeature([], tf.string), 196 | 'y' :tf.FixedLenFeature([], tf.string), 197 | 'size':tf.FixedLenFeature([], tf.int64)} 198 | parsed = tf.parse_single_example(record, keys_to_features) 199 | x1d = tf.decode_raw(parsed['x1d'], tf.float32) 200 | x2d = tf.decode_raw(parsed['x2d'] ,tf.float32) 201 | size = parsed['size'] 202 | x1d = tf.reshape(x1d, tf.stack([size, -1])) 203 | x2d = tf.reshape(x2d, tf.stack([size, size, -1])) 204 | y = tf.decode_raw(parsed['y'],tf.int16) 205 | y = tf.cast(y, tf.float32) 206 | y = tf.reshape(y, tf.stack([size, size])) 207 | return x1d, x2d, y, size 208 | 209 | dataset = tf.data.TFRecordDataset(self.input_tfrecord_files) 210 | dataset = dataset.map(parser, num_parallel_calls=64) 211 | dataset = dataset.prefetch(1024) 212 | dataset = dataset.shuffle(buffer_size=512) 213 | dataset = dataset.padded_batch(self.train_config.batch_size, 214 | padded_shapes=([PADDING_FULL_LEN, self.x1d_channel_dim], 215 | [PADDING_FULL_LEN, PADDING_FULL_LEN, self.x2d_channel_dim], 216 | [PADDING_FULL_LEN, PADDING_FULL_LEN], []), 217 | padding_values=(0.0, 0.0, -1.0, np.int64(PADDING_FULL_LEN))) 218 | iterator = dataset.make_initializable_iterator() 219 | x1d, x2d, y, size = iterator.get_next() 220 | return x1d, x2d, y, size, iterator 221 | 222 | def train(self): 223 | self.x1d, self.x2d, self.y, self.size, self.iterator = 
self.build_input() 224 | 225 | with tf.device('/gpu:0'): 226 | #self.pred, logits = self.discriminator_cnn(self.x2d) 227 | #self.pred, logits = self.discriminator_resn(self.x2d) 228 | self.pred, logits = self.resn(self.x1d, self.x2d) 229 | if self.train_config.down_weight >= 1.0: 230 | mask = tf.greater_equal(self.y, 0.0) 231 | labels = tf.boolean_mask(self.y, mask) 232 | logits = tf.boolean_mask(logits, mask) 233 | self.loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = labels, logits = logits)) 234 | else: 235 | mask_pos = tf.equal(self.y, 1.0) 236 | label_pos = tf.boolean_mask(self.y, mask_pos) 237 | logit_pos = tf.boolean_mask(logits, mask_pos) 238 | loss_pos = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = label_pos, logits = logit_pos)) 239 | 240 | mask_neg = tf.equal(self.y, 0.0) 241 | label_neg = tf.boolean_mask(self.y, mask_neg) 242 | logit_neg = tf.boolean_mask(logits, mask_neg) 243 | loss_neg = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = label_neg, logits = logit_neg)) 244 | self.loss = loss_neg * self.train_config.down_weight + loss_pos 245 | 246 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)#for batch normalization 247 | with tf.control_dependencies(update_ops): 248 | if self.train_config.op_alg == 'adam': 249 | optim = tf.train.AdamOptimizer(self.train_config.learn_rate, 250 | beta1=self.train_config.beta1).minimize(self.loss) 251 | elif self.train_config.op_alg == 'sgd': 252 | optim = tf.train.GradientDescentOptimizer( 253 | self.train_config.learn_rate).minimize(self.loss) 254 | 255 | tf.summary.scalar('train_loss', self.loss) 256 | merged_summary = tf.summary.merge_all() 257 | train_writer = tf.summary.FileWriter(self.train_config.summary_dir, self.sess.graph) 258 | tf.global_variables_initializer().run() 259 | steps = 0 260 | saver = tf.train.Saver() 261 | for epoch in np.arange(self.train_config.epoch): 262 | self.sess.run(self.iterator.initializer,\ 263 | feed_dict={self.input_tfrecord_files:self.dataset.get_chunks(RunMode.TRAIN)}) 264 | train_loss = 0.0 265 | while True: 266 | try: 267 | _, _loss, summary = self.sess.run([optim, self.loss, merged_summary]) 268 | train_loss += _loss 269 | train_writer.add_summary(summary, steps) 270 | steps += 1 271 | except tf.errors.OutOfRangeError: 272 | break 273 | saver.save(self.sess, '{}/model'.format(self.train_config.model_dir), 274 | global_step=epoch) 275 | logging.info('Epoch= {:d} train_loss= {:.4f}'.format(epoch, train_loss)) 276 | self.evaluate(RunMode.VALIDATE) 277 | if self.train_config.test_file_prefix is not None: 278 | self.evaluate(RunMode.TEST) 279 | train_writer.close() 280 | 281 | def build_input_test(self): 282 | with tf.device('/cpu:0'): 283 | def parser(record): 284 | keys_to_features = { 285 | 'x1d' :tf.FixedLenFeature([], tf.string), 286 | 'x2d' :tf.FixedLenFeature([], tf.string), 287 | 'name':tf.FixedLenFeature([], tf.string), 288 | 'size':tf.FixedLenFeature([], tf.int64)} 289 | parsed = tf.parse_single_example(record, keys_to_features) 290 | x1d = tf.decode_raw(parsed['x1d'], tf.float32) 291 | x2d = tf.decode_raw(parsed['x2d'] ,tf.float32) 292 | size = parsed['size'] 293 | x1d = tf.reshape(x1d, tf.stack([size, -1])) 294 | x2d = tf.reshape(x2d, tf.stack([size, size, -1])) 295 | name = parsed['name'] 296 | return x1d, x2d, name, size 297 | 298 | dataset = tf.data.TFRecordDataset(self.input_tfrecord_files) 299 | dataset = dataset.map(parser, num_parallel_calls=64) 300 | dataset = dataset.prefetch(512) 301 | #dataset = 
dataset.shuffle(buffer_size=512)
302 |             dataset = dataset.padded_batch(self.train_config.batch_size,
303 |                 padded_shapes=([PADDING_FULL_LEN, self.x1d_channel_dim],
304 |                     [PADDING_FULL_LEN, PADDING_FULL_LEN, self.x2d_channel_dim],
305 |                     [], []),
306 |                 padding_values=(0.0, 0.0, "", np.int64(PADDING_FULL_LEN)))
307 |             iterator = dataset.make_initializable_iterator()
308 |             x1d, x2d, name, size = iterator.get_next()
309 |             return x1d, x2d, name, size, iterator
310 | 
311 |     def predict(self, output_dir, model_path):
312 |         x1d, x2d, name, size, iterator = self.build_input_test()
313 |         preds, logits = self.resn(x1d, x2d)
314 |         saver = tf.train.Saver()
315 |         saver.restore(self.sess, model_path)
316 |         self.sess.run(iterator.initializer,
317 |             feed_dict={self.input_tfrecord_files:self.dataset.get_chunks(RunMode.TEST)})
318 |         while True:
319 |             try:
320 |                 preds_, names_, sizes_ = self.sess.run([preds, name, size])
321 |                 for pred_, name_, size_ in zip(preds_, names_, sizes_):
322 |                     pred_ = pred_[:size_, :size_]
323 |                     #inds = np.triu_indices_from(pred_, k=1)
324 |                     #pred_[(inds[1], inds[0])] = pred_[inds]
325 |                     #pred_ = (pred_ + np.transpose(pred_)) / 2.0
326 |                     output_path = '{}/{}.concat'.format(output_dir, name_)
327 |                     np.savetxt(output_path, pred_)
328 |             except tf.errors.OutOfRangeError:
329 |                 break
330 | 
--------------------------------------------------------------------------------
/trainer/util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys  # used by the argument checks in TopAccuracy below
3 | from enum import Enum
4 | class RunMode(Enum):
5 |     TRAIN=1
6 |     VALIDATE=2
7 |     TEST=3
8 |     UNLABEL=4
9 | 
10 | def TopAccuracy(pred=None, truth=None, ratio=[1, 0.5, 0.2, 0.1]):
11 |     if pred is None:
12 |         print 'please provide a predicted contact matrix'
13 |         sys.exit(-1)
14 | 
15 |     if truth is None:
16 |         print 'please provide a true contact matrix'
17 |         sys.exit(-1)
18 | 
19 |     assert pred.shape[0] == pred.shape[1]
20 |     assert pred.shape == truth.shape
21 | 
22 |     pred_truth = np.dstack((pred, truth))
23 | 
24 |     M1s = np.ones_like(truth, dtype=np.int8)
25 |     mask_LR = np.triu(M1s, 24)
26 |     mask_MLR = np.triu(M1s, 12)
27 |     mask_SMLR = np.triu(M1s, 6)
28 |     mask_MR = mask_MLR - mask_LR
29 |     mask_SR = mask_SMLR - mask_MLR
30 | 
31 |     seqLen = pred.shape[0]
32 | 
33 |     accs = []
34 |     for mask in [mask_LR, mask_MR, mask_MLR, mask_SR]:
35 | 
36 |         res = pred_truth[mask.nonzero()]
37 |         res_sorted = res[(-res[:,0]).argsort()]
38 | 
39 |         for r in ratio:
40 |             numTops = int(seqLen * r)
41 |             numTops = min(numTops, res_sorted.shape[0])
42 |             topLabels = res_sorted[:numTops, 1]
43 |             #numCorrects = ( (0
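The listing of `trainer/util.py` is cut off above, so the tail of `TopAccuracy` is not shown. For orientation only, here is a rough, self-contained sketch of the standard top-L/k contact precision that the visible part of the function sets up (sort one sequence-separation range by predicted score, keep the top `seqLen * r` pairs, count the true contacts among them). The function name and the single `min_sep` parameter are simplifications assumed for illustration; the repository's own version instead builds four explicit triangular masks for long-, medium-, medium+long- and short-range pairs:

```python
import numpy as np

def top_k_contact_precision(pred, truth, ratios=(1.0, 0.5, 0.2, 0.1), min_sep=24):
    """Illustrative top-L/k precision for one sequence-separation range.

    pred  : L x L matrix of predicted contact probabilities
    truth : L x L matrix with 1 = contact, 0 = no contact, -1 = masked
    min_sep = 24 corresponds to the long-range mask (np.triu(..., 24)) above.
    """
    L = pred.shape[0]
    i, j = np.triu_indices(L, k=min_sep)      # candidate pairs in this range
    order = np.argsort(-pred[i, j])           # highest predicted score first
    labels = truth[i, j][order]
    precisions = []
    for r in ratios:
        k = max(1, min(int(L * r), labels.shape[0]))
        top = labels[:k]
        precisions.append(float(np.sum(top > 0)) / k)
    return precisions
```

Called on the cropped `pred_[:size_, :size_]` and `y_[:size_, :size_]` matrices used in `Resnet.evaluate`, this returns the top-L, top-L/2, top-L/5 and top-L/10 precisions for the chosen range.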