├── LICENSE
├── README.md
├── scripts
│   └── create_dataset.py
└── trainer
    ├── device.py
    ├── input_data.py
    ├── main.py
    ├── model_config.json
    ├── ops.py
    ├── resnet.py
    └── util.py

/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) 2019, Haicang Zhang
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 | 
3 | This is the code for [this](https://youtu.be/cw6_OP5An8s) video on YouTube by Siraj Raval about DeepMind's AlphaFold. It is a re-implementation of [Sheng Wang and Jinbo Xu's deep learning model](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005324) for protein contact prediction, a breakthrough in protein structure prediction. Unfortunately, the authors did not open-source their code or models, so everything here is reconstructed from the methods described in the paper. As the paper reports, a deep learning model can significantly improve the accuracy of contact prediction.
4 | 
5 | # Coding Challenge - Due Date, Jan 29, 2019 at 12 PM PST - Replicate AlphaFold as best you can. Post your GitHub links in the comment section of the video! I'll give a shoutout to the top 2 entries
6 | 
7 | Wizards, see page 11 of this [paper](http://predictioncenter.org/casp13/doc/CASP13_Abstracts.pdf) and DeepMind's [blog post](https://deepmind.com/blog/alphafold/) for details on the AlphaFold algorithm. Two residual networks are used, and multiple methods are attempted.
8 | 
9 | This code is closely related to what AlphaFold does; a short sketch of the contact-map target it trains on follows below.
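To make the training target concrete: in `scripts/create_dataset.py` a residue pair is labelled as a contact when its Cβ–Cβ distance is below 8 Å, and the lower triangle plus a short band around the diagonal is set to -1 so the loss in `trainer/resnet.py` ignores those pairs. The snippet below is a minimal sketch of that labelling step, not code from this repo — the helper name and the random toy distance matrix are invented for illustration; the real pipeline reads distances from the `.2dfeat` / `.distcb` files:

```python
import numpy as np

def make_contact_labels(cb_dist, threshold=8.0, tril_offset=5):
    """Turn an L x L C-beta distance matrix into contact labels.

    1  = contact (distance < threshold)
    0  = no contact
    -1 = masked pair (lower triangle and a band near the diagonal),
    mirroring the np.where / np.tril_indices logic in create_dataset.py.
    """
    y = (cb_dist < threshold).astype(np.int16)
    y[np.tril_indices(y.shape[0], tril_offset)] = -1
    return y

if __name__ == '__main__':
    L = 12
    d = np.random.uniform(2.0, 20.0, size=(L, L))
    d = (d + d.T) / 2.0  # a distance matrix is symmetric
    print(make_contact_labels(d))
```

The network in `trainer/resnet.py` then outputs one logit per residue pair and averages the logit map with its transpose, so the predicted contact map is symmetric by construction.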
Here are others that you can build off of 10 | 11 | - https://github.com/carlosmartinezvillar/3DCNNFolds 12 | - https://github.com/igemsoftware2017/AiGEM_TeamHeidelberg2017/blob/master/DeeProtein/DeeProtein_README.md 13 | - https://github.com/pfnet-research/BMI219-2017-ProteinFolding 14 | - https://github.com/5bingstar/Deep-learning-for-contact_map_v2 15 | - https://github.com/Illumina/PrimateAI 16 | 17 | Do the best you can! I'm looking for well documented code. Good luck! 18 | 19 | -------------------------------------------------------------------------------- /scripts/create_dataset.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | import pandas as pd 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | 8 | PADDING_FULL_LEN = 500 9 | SAMPLES_EACH_CHUNK = 512 10 | 11 | def _int64_list_feature(value): 12 | return tf.train.Feature(int64_list=tf.train.Int64List(value=np.reshape(value,-1))) 13 | 14 | def _float_list_feature(value): 15 | return tf.train.Feature(float_list=tf.train.FloatList(value=np.reshape(value,-1))) 16 | 17 | def _bytes_feature(value): 18 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 19 | 20 | def _int64_feature(value): 21 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 22 | 23 | def create_supervised(name_list, oned_feature_dir,\ 24 | twod_feature_dir, output_prefix): 25 | padding_full_num = PADDING_FULL_LEN * PADDING_FULL_LEN 26 | names = pd.read_csv(name_list, names=['name'], header=None) 27 | names = list(names['name']) 28 | random.shuffle(names) 29 | SAMPLES_EACH_CHUNK = 1024 30 | chunk_num = (len(names) - 1) / SAMPLES_EACH_CHUNK + 1 31 | total_example_num = 0 32 | total_pos_num = 0 33 | total_neg_num = 0 34 | for i in range(chunk_num): 35 | with tf.python_io.TFRecordWriter( 36 | '%s_%s.tfrecord' % (output_prefix, i)) as record_writer: 37 | start = i * SAMPLES_EACH_CHUNK 38 | end = min(len(names), (i+1) * SAMPLES_EACH_CHUNK) 39 | X1d = [] 40 | X2d = [] 41 | Y = [] 42 | for name in names[start:end]: 43 | fea1 = pd.read_csv('{}/{}.1dfeat'.format(oned_feature_dir, name), 44 | header=None, sep='\s+') 45 | L = fea1.shape[0] 46 | if L > PADDING_FULL_LEN: 47 | continue 48 | 49 | x1d = fea1.iloc[:,range(23,43) + range(63,69)].values 50 | fea2 = pd.read_csv('{}/{}.2dfeat'.format(twod_feature_dir, name), 51 | header=None, sep='\s+') 52 | assert L == int(np.sqrt(fea2.shape[0])) 53 | data = fea2.iloc[:,[2,4,5,7]].values 54 | data = data.reshape((L, L, -1)) 55 | x2d = data[:L,:L,1:] 56 | 57 | y = data[:L,:L,0].reshape((L,L)) 58 | #np.fill_diagonal(y, -1.0) 59 | y = np.where(y > 0.0, (y < 8.0).astype(np.int16), y).astype(np.int16) 60 | y[np.tril_indices(y.shape[0], 5)] = -1 61 | neg_num = np.sum(y==0) 62 | pos_num = np.sum(y==1) 63 | if pos_num < 100: 64 | continue 65 | total_pos_num += pos_num 66 | total_neg_num += neg_num 67 | total_example_num += 1 68 | 69 | example = tf.train.Example(features = tf.train.Features( 70 | feature={ 71 | 'size': _int64_feature(L), 72 | 'x1d' : _bytes_feature(x1d.astype(np.float32).tobytes()), 73 | 'x2d' : _bytes_feature(x2d.astype(np.float32).tobytes()), 74 | 'y' : _bytes_feature(y.astype(np.int16).tobytes()) 75 | })) 76 | record_writer.write(example.SerializeToString()) 77 | print name 78 | 79 | print total_example_num, total_pos_num + total_neg_num, total_pos_num, total_neg_num 80 | 81 | def create_semi(name_list, oned_feature_dir,\ 82 | twod_feature_dir, output_prefix, labeled=True): 83 | 
padding_full_num = PADDING_FULL_LEN * PADDING_FULL_LEN 84 | names = pd.read_csv(name_list, names=['name'], header=None) 85 | names = list(names['name']) 86 | random.shuffle(names) 87 | SAMPLES_EACH_CHUNK = 1024 88 | chunk_num = (len(names) - 1) / SAMPLES_EACH_CHUNK + 1 89 | for i in range(chunk_num): 90 | with tf.python_io.TFRecordWriter( 91 | '%s_%s.tfrecord' % (output_prefix, i)) as record_writer: 92 | start = i * SAMPLES_EACH_CHUNK 93 | end = min(len(names), (i+1) * SAMPLES_EACH_CHUNK) 94 | X1d = [] 95 | X2d = [] 96 | Y = [] 97 | for name in names[start:end]: 98 | fea1 = pd.read_csv('{}/{}.1dfeat'.format(oned_feature_dir, name), 99 | header=None, sep='\s+') 100 | L = fea1.shape[0] 101 | if L > PADDING_FULL_LEN: 102 | continue 103 | 104 | x1d = fea1.iloc[:,range(23,43) + range(63,69)].values 105 | fea2 = pd.read_csv('{}/{}.2dfeat'.format(twod_feature_dir, name), 106 | header=None, sep='\s+') 107 | assert L == int(np.sqrt(fea2.shape[0])) 108 | data = fea2.iloc[:,[2,4,5,7]].values 109 | data = data.reshape((L, L, -1)) 110 | x2d = data[:L,:L,1:] 111 | feature={ 112 | 'size': _int64_feature(L), 113 | 'x1d' : _bytes_feature(x1d.astype(np.float32).tobytes()), 114 | 'x2d' : _bytes_feature(x2d.astype(np.float32).tobytes()), 115 | } 116 | if labeled: 117 | y = data[:L,:L,0].reshape((L,L)) 118 | y = np.where(y > 0.0, (y < 8.0).astype(np.int16), y).astype(np.int16) 119 | y[np.tril_indices(y.shape[0], 5)] = -1 120 | y_ = np.zeros((L,L,2)) 121 | for i in range(L): 122 | for j in range(L): 123 | if y[i,j] < 0: 124 | y_[i,j,:] = -1 125 | else: 126 | y_[i,j,y[i,j]] = 1 127 | feature.update({'y':_bytes_feature(y_.astype(np.int16).tobytes())}) 128 | example = tf.train.Example(features = tf.train.Features(feature=feature)) 129 | record_writer.write(example.SerializeToString()) 130 | 131 | def create_from_ccmpred(name_list,\ 132 | profile_dir, structure_dir,\ 133 | ccmpred_dir, distcb_dir,\ 134 | output_prefix, chunk_size): 135 | PADDING_FULL_LEN = 250 136 | padding_full_num = PADDING_FULL_LEN * PADDING_FULL_LEN 137 | names = pd.read_csv(name_list, names=['name'], header=None) 138 | names = list(names['name']) 139 | random.shuffle(names) 140 | SAMPLES_EACH_CHUNK = chunk_size 141 | chunk_num = (len(names) - 1) / SAMPLES_EACH_CHUNK + 1 142 | #chunk_num = len(names) / SAMPLES_EACH_CHUNK 143 | total_example_num = 0 144 | total_pos_num = 0 145 | total_neg_num = 0 146 | pbar = tqdm(total=len(names)) 147 | for i in range(chunk_num): 148 | with tf.python_io.TFRecordWriter( 149 | '%s_%s.tfrecord' % (output_prefix, i)) as record_writer: 150 | start = i * SAMPLES_EACH_CHUNK 151 | end = min(len(names), (i+1) * SAMPLES_EACH_CHUNK) 152 | #end = (i+1) * SAMPLES_EACH_CHUNK 153 | if i + 1 == chunk_num: 154 | end = len(names) 155 | 156 | X1d = [] 157 | X2d = [] 158 | Y = [] 159 | for name in names[start:end]: 160 | pbar.update(1) 161 | profile = np.loadtxt('{}/{}.profile'.format(profile_dir, name)) 162 | L = profile.shape[0] 163 | if L > PADDING_FULL_LEN: 164 | continue 165 | 166 | structure = pd.read_csv('{}/{}.structure'.format(structure_dir, name)) 167 | structure = structure.iloc[:, range(3,6) + range(16,19) + range(20,21)].values 168 | 169 | ccmpred = np.loadtxt('{}/{}.cc'.format(ccmpred_dir, name)) 170 | assert L == structure.shape[0] 171 | assert L == ccmpred.shape[0] 172 | x1d = np.concatenate([profile, structure], axis=1) 173 | x2d = ccmpred[:,:,np.newaxis] 174 | #np.fill_diagonal(y, -1.0) 175 | if distcb_dir is not None: 176 | y = np.loadtxt('{}/{}.distcb'.format(distcb_dir, name)) 177 | assert L == y.shape[0] 178 
| y = np.where(y > 0.0, (y < 8.0).astype(np.int16), y).astype(np.int16) 179 | y[np.tril_indices(y.shape[0], 4)] = -1 180 | #np.fill_diagonal(y, -1) 181 | neg_num = np.sum(y==0) 182 | pos_num = np.sum(y==1) 183 | if pos_num < 100: 184 | continue 185 | total_pos_num += pos_num 186 | total_neg_num += neg_num 187 | 188 | example = tf.train.Example(features = tf.train.Features( 189 | feature={ 190 | 'size': _int64_feature(L), 191 | 'x1d' : _bytes_feature(x1d.astype(np.float32).tobytes()), 192 | 'x2d' : _bytes_feature(x2d.astype(np.float32).tobytes()), 193 | 'y' : _bytes_feature(y.astype(np.int16).tobytes()), 194 | 'name': _bytes_feature(name) 195 | })) 196 | else: 197 | example = tf.train.Example(features = tf.train.Features( 198 | feature={ 199 | 'size': _int64_feature(L), 200 | 'x1d' : _bytes_feature(x1d.astype(np.float32).tobytes()), 201 | 'x2d' : _bytes_feature(x2d.astype(np.float32).tobytes()), 202 | 'name': _bytes_feature(name) 203 | })) 204 | record_writer.write(example.SerializeToString()) 205 | total_example_num += 1 206 | pbar.close() 207 | if distcb_dir is not None: 208 | print total_example_num, total_pos_num + total_neg_num,\ 209 | total_pos_num, total_neg_num 210 | else: 211 | print total_example_num 212 | 213 | 214 | def main(): 215 | parser = argparse.ArgumentParser() 216 | parser.add_argument('--op', 217 | choices=['create_from_ccmpred', 'create_semi', 'create_supervised'], required=True) 218 | parser.add_argument('--name_list', type=str) 219 | parser.add_argument('--oned_feature_dir', type=str) 220 | parser.add_argument('--twod_feature_dir', type=str) 221 | parser.add_argument('--output_prefix', type=str) 222 | parser.add_argument('--labeled', action='store_true') 223 | parser.add_argument('--profile_dir', type=str) 224 | parser.add_argument('--structure_dir', type=str) 225 | parser.add_argument('--ccmpred_dir', type=str) 226 | parser.add_argument('--distcb_dir', type=str) 227 | parser.add_argument('--chunk_size', type=int, default=512) 228 | 229 | args = parser.parse_args() 230 | 231 | if args.op == 'create_semi': 232 | create_semi(args.name_list, args.oned_feature_dir,\ 233 | args.twod_feature_dir, args.output_prefix, args.labeled) 234 | if args.op == 'create_supervised': 235 | create_supervised(args.name_list, args.oned_feature_dir,\ 236 | args.twod_feature_dir, args.output_prefix) 237 | if args.op == 'create_from_ccmpred': 238 | create_from_ccmpred(args.name_list,\ 239 | args.profile_dir, args.structure_dir,\ 240 | args.ccmpred_dir, args.distcb_dir,\ 241 | args.output_prefix, args.chunk_size) 242 | if __name__ == '__main__': 243 | main() 244 | -------------------------------------------------------------------------------- /trainer/device.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.client import device_lib 2 | 3 | def get_available_cpus(): 4 | local_device_protos = device_lib.list_local_devices() 5 | return [x.name for x in local_device_protos if x.device_type == 'CPU'] 6 | 7 | def get_available_gpus(): 8 | local_device_protos = device_lib.list_local_devices() 9 | return [x.name for x in local_device_protos if x.device_type == 'GPU'] 10 | 11 | print get_available_gpus() 12 | print get_available_cpus() 13 | -------------------------------------------------------------------------------- /trainer/input_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import random 3 | import numpy as np 4 | import gc 5 | from util import RunMode 6 | 7 | 
class MemDataset:
8 |     pass
9 | 
10 | class TfRecordDataset:
11 |     def __init__(self, train_file_prefix='', chunk_num=0, val_size=1,\
12 |                  test_file_prefix='', unlabel_file_prefix='', unlabel_chunk_num=0):
13 |         self.chunk_num = chunk_num
14 |         self.train_file_prefix = train_file_prefix
15 |         self.test_file_prefix = test_file_prefix
16 |         self.unlabel_file_prefix = unlabel_file_prefix
17 |         self.unlabel_chunk_num = unlabel_chunk_num
18 | 
19 |         idx = range(chunk_num)
20 |         #random.shuffle(idx)
21 |         if type(val_size) == float:
22 |             train_chunk_num = int(np.ceil(chunk_num * (1.0 - val_size)))  # cast: np.ceil returns a float, not a valid slice index
23 |         else:
24 |             train_chunk_num = chunk_num - val_size
25 | 
26 |         self.train_chunks = idx[:train_chunk_num]
27 |         self.val_chunks = idx[train_chunk_num:]
28 | 
29 |     def get_chunks(self, mode):
30 |         input_file_prefix = self.train_file_prefix
31 |         if mode == RunMode.TRAIN:
32 |             chunks = self.train_chunks
33 |             random.shuffle(chunks)
34 |         elif mode == RunMode.VALIDATE:
35 |             chunks = self.val_chunks
36 |         elif mode == RunMode.TEST:
37 |             input_file_prefix = self.test_file_prefix
38 |             chunks = [0]
39 |         elif mode == RunMode.UNLABEL:  # was `model`, an undefined name
40 |             input_file_prefix = self.unlabel_file_prefix
41 |             chunks = range(self.unlabel_chunk_num)
42 |             random.shuffle(chunks)
43 |         return ['{}_{:d}.tfrecord'.format(input_file_prefix, c)\
44 |                 for c in chunks]
45 | 
--------------------------------------------------------------------------------
/trainer/main.py:
--------------------------------------------------------------------------------
1 | import resnet
2 | import input_data
3 | import argparse
4 | import tensorflow as tf
5 | import numpy as np
6 | import logging
7 | import sys
8 | import json
9 | from tensorflow.python.lib.io import file_io
10 | import os
11 | 
12 | def parse_model_config(json_file):
13 |     # the built-in 'open' cannot read Google Cloud Storage paths, so use file_io
14 |     with file_io.FileIO(json_file, 'r') as f:
15 |         config = json.load(f)
16 |     return config
17 | 
18 | def main():
19 |     parser = argparse.ArgumentParser()
20 |     parser.add_argument('--alg', type=str, choices=['gan', 'resn'], default='resn')
21 |     parser.add_argument('--epoch', type=int, default=30)
22 |     parser.add_argument('--batch_size', type=int, default=2)
23 |     parser.add_argument('--learn_rate', type=float, default=0.0002)
24 |     parser.add_argument('--beta1', type=float, default=0.9)
25 |     parser.add_argument('--l2_reg', type=float, default=0.0001)
26 |     parser.add_argument('--down_weight', type=float, default=1.0)
27 |     parser.add_argument('--keep_prob', type=float, default=0.8)
28 |     parser.add_argument('--op_alg', type=str, choices=['sgd', 'adam'], default='adam')
29 |     parser.add_argument('--train_file_prefix', type=str)
30 |     parser.add_argument('--unlabel_file_prefix', type=str)
31 |     parser.add_argument('--test_file_prefix', type=str, default=None)
32 |     parser.add_argument('--chunk_num', type=int, default=4)
33 |     parser.add_argument('--unlabel_chunk_num', type=int)
34 |     parser.add_argument('--gan_gen_iter', type=int, default=5)
35 |     parser.add_argument('--model_config', type=str, required=True)
36 |     parser.add_argument('--job_dir', type=str)
37 |     parser.add_argument('--log_file', type=str)
38 |     parser.add_argument('--summary_dir', type=str)
39 |     parser.add_argument('--model_dir', type=str)
40 |     parser.add_argument('--output_dir', type=str)
41 |     parser.add_argument('--model_path', type=str)
42 |     parser.add_argument('--mode', type=str, choices=['test', 'train'], default='train')
43 | 
44 |     args = parser.parse_args()
45 |     if args.job_dir is not None:
46 |         os.makedirs(args.job_dir)
47 |         if 
args.summary_dir is None: 48 | args.summary_dir = '{}/summary'.format(args.job_dir) 49 | os.makedirs(args.summary_dir) 50 | if args.model_dir is None: 51 | args.model_dir = '{}/model'.format(args.job_dir) 52 | os.makedirs(args.model_dir) 53 | if args.log_file is None: 54 | args.log_file = '{}/run.log'.format(args.job_dir) 55 | logger = logging.getLogger() 56 | logger.setLevel(logging.DEBUG) 57 | formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') 58 | log_file_stream=file_io.FileIO(args.log_file,'a') 59 | fh = logging.StreamHandler(log_file_stream) 60 | fh.setFormatter(formatter) 61 | logger.addHandler(fh) 62 | ch = logging.StreamHandler(sys.stdout) 63 | ch.setFormatter(formatter) 64 | logger.addHandler(ch) 65 | 66 | model_config = parse_model_config(args.model_config) 67 | logging.info('train_config: {:s}'.format(args)) 68 | logging.info('model_config: {:s}'.format(json.dumps(model_config))) 69 | 70 | if args.alg == 'resn': 71 | with tf.Session() as sess: 72 | if args.mode == 'train': 73 | dataset = input_data.TfRecordDataset( 74 | args.train_file_prefix, args.chunk_num, val_size = 1, 75 | test_file_prefix = args.test_file_prefix) 76 | resn_ = resnet.Resnet(sess, dataset, train_config=args, model_config=model_config) 77 | resn_.train() 78 | elif args.mode == 'test': 79 | dataset = input_data.TfRecordDataset(test_file_prefix = args.test_file_prefix) 80 | resn_ = resnet.Resnet(sess, dataset, train_config=args, model_config=model_config) 81 | resn_.predict(args.output_dir, args.model_path) 82 | 83 | if __name__ == '__main__': 84 | main() 85 | -------------------------------------------------------------------------------- /trainer/model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "1d":{ 3 | "filters":32, 4 | "kernel_size":7, 5 | "block_num":8, 6 | "channel_dim":32 7 | }, 8 | "2d":{ 9 | "filters":16, 10 | "kernel_size":5, 11 | "block_num":2, 12 | "channel_dim":3 13 | }, 14 | "2d_label_size":400, 15 | "1d_label_size":20 16 | } 17 | -------------------------------------------------------------------------------- /trainer/ops.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | from tensorflow.python.framework import ops 6 | 7 | def linear(input_, output_size, scope=None, stddev=0.02, bias_start=0.0, with_w=False): 8 | shape = input_.get_shape().as_list() 9 | 10 | with tf.variable_scope(scope or "Linear"): 11 | matrix = tf.get_variable("Matrix", [shape[1], output_size], tf.float32, 12 | tf.random_normal_initializer(stddev=stddev)) 13 | bias = tf.get_variable("bias", [output_size], 14 | initializer=tf.constant_initializer(bias_start)) 15 | if with_w: 16 | return tf.matmul(input_, matrix) + bias, matrix, bias 17 | else: 18 | return tf.matmul(input_, matrix) + bias 19 | 20 | def deconv2d(input_, output_shape, 21 | k_h=5, k_w=5, d_h=2, d_w=2, stddev=0.02, 22 | name="deconv2d", with_w=False): 23 | with tf.variable_scope(name): 24 | # filter : [height, width, output_channels, in_channels] 25 | w = tf.get_variable('w', [k_h, k_w, output_shape[-1], input_.get_shape()[-1]], 26 | initializer=tf.random_normal_initializer(stddev=stddev)) 27 | 28 | deconv = tf.nn.conv2d_transpose(input_, w, output_shape=output_shape, 29 | strides=[1, d_h, d_w, 1]) 30 | 31 | biases = tf.get_variable('biases', [output_shape[-1]], initializer=tf.constant_initializer(0.0)) 32 | deconv = 
tf.reshape(tf.nn.bias_add(deconv, biases), deconv.get_shape()) 33 | 34 | if with_w: 35 | return deconv, w, biases 36 | else: 37 | return deconv 38 | -------------------------------------------------------------------------------- /trainer/resnet.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import tensorflow as tf 4 | import util 5 | from util import RunMode 6 | import logging 7 | 8 | #PADDING_FULL_LEN = 500 9 | PADDING_FULL_LEN = 250 10 | 11 | class Resnet: 12 | def __init__(self, sess, dataset, train_config, model_config): 13 | self.sess = sess 14 | self.dataset = dataset 15 | self.train_config = train_config 16 | self.model_config = model_config 17 | 18 | self.input_tfrecord_files = tf.placeholder(tf.string, shape=[None]) 19 | self.keep_prob = tf.placeholder(tf.float32) 20 | self.training = tf.placeholder(tf.bool) 21 | 22 | self.x1d_channel_dim = model_config['1d']['channel_dim'] 23 | self.x2d_channel_dim = model_config['2d']['channel_dim'] 24 | 25 | def cnn_with_2dfeature(self, x2d, reuse=False): 26 | with tf.variable_scope('discriminator', reuse=reuse) as scope: 27 | block_num = 8 28 | filters = 16 29 | kernel_size = [4, 4] 30 | act = tf.nn.relu 31 | #kernel_initializer = tf.truncated_normal_initializer(stddev=0.01) 32 | kernel_initializer = tf.glorot_normal_initializer() 33 | #kernel_initializer = None 34 | bias_initializer = tf.zeros_initializer() 35 | #kernel_regularizer = tf.contrib.layers.l2_regularizer(scale=0.001) 36 | kernel_regularizer = None 37 | bias_regularizer = None 38 | 39 | for i in np.arange(block_num): 40 | inputs = x2d if i == 0 else conv_ 41 | conv_ = tf.layers.conv2d(inputs=inputs, filters=filters, 42 | kernel_size=kernel_size, strides=(1,1), padding='same', activation=act, 43 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 44 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer) 45 | 46 | logits = tf.layers.conv2d(inputs=conv_, filters=1, 47 | kernel_size=kernel_size, strides=(1,1), padding='same', 48 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 49 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer) 50 | 51 | logits = tf.reshape(logits, (-1, tf.shape(logits)[1], tf.shape(logits)[2])) 52 | return tf.sigmoid(logits), logits 53 | 54 | def resn_with_2dfeature(self, x2d, reuse=False): 55 | with tf.variable_scope('discriminator', reuse=reuse) as scope: 56 | block_num = 8 57 | filters = 32 58 | kernel_size = [4, 4] 59 | act = tf.nn.relu 60 | #kernel_initializer = tf.truncated_normal_initializer(stddev=0.01) 61 | kernel_initializer = tf.glorot_normal_initializer() 62 | #kernel_initializer = None 63 | bias_initializer = tf.zeros_initializer() 64 | #kernel_regularizer = tf.contrib.layers.l2_regularizer(scale=0.001) 65 | kernel_regularizer = None 66 | bias_regularizer = None 67 | 68 | prev = tf.layers.conv2d(inputs=x2d, filters=filters, 69 | kernel_size=kernel_size, strides=(1,1), padding='same', 70 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 71 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer) 72 | for i in np.arange(block_num): 73 | conv_ = act(prev) 74 | conv_ = tf.layers.conv2d(inputs=conv_, filters=filters, 75 | kernel_size=kernel_size, strides=(1,1), padding='same', 76 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 77 | kernel_regularizer=kernel_regularizer, 
bias_regularizer=bias_regularizer) 78 | conv_ = act(conv_) 79 | conv_ = tf.layers.conv2d(inputs=conv_, filters=filters, 80 | kernel_size=kernel_size, strides=(1,1), padding='same', 81 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 82 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer) 83 | prev = tf.add(conv_, prev) 84 | 85 | logits = tf.layers.conv2d(inputs=prev, filters=1, 86 | kernel_size=kernel_size, strides=(1,1), padding='same', 87 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 88 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer) 89 | 90 | logits = tf.reshape(logits, (-1, tf.shape(logits)[1], tf.shape(logits)[2])) 91 | return tf.sigmoid(logits), logits 92 | 93 | def resn(self, x1d, x2d, reuse=False): 94 | with tf.variable_scope('discriminator', reuse=reuse) as scope: 95 | act = tf.nn.relu 96 | 97 | filters_1d = self.model_config['1d']['filters'] 98 | kernel_size_1d = self.model_config['1d']['kernel_size'] 99 | block_num_1d = self.model_config['1d']['block_num'] 100 | 101 | filters_2d = self.model_config['2d']['filters'] 102 | kernel_size_2d = self.model_config['2d']['kernel_size'] 103 | block_num_2d = self.model_config['2d']['block_num'] 104 | 105 | #kernel_initializer = tf.glorot_normal_initializer() 106 | kernel_initializer = tf.variance_scaling_initializer() 107 | bias_initializer = tf.zeros_initializer() 108 | if self.train_config.l2_reg <= 0.0: 109 | kernel_regularizer = None 110 | else: 111 | kernel_regularizer = tf.contrib.layers.l2_regularizer(scale=self.train_config.l2_reg) 112 | bias_regularizer = None 113 | 114 | prev_1d = tf.layers.conv1d(inputs=x1d, filters=filters_1d, 115 | kernel_size=kernel_size_1d, strides=1, padding='same', 116 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 117 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=False) 118 | for i in np.arange(block_num_1d): 119 | conv_1d = act(prev_1d) 120 | conv_1d = tf.layers.conv1d(inputs=conv_1d, filters=filters_1d, 121 | kernel_size=kernel_size_1d, strides=1, padding='same', 122 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 123 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=False) 124 | conv_1d = act(conv_1d) 125 | conv_1d = tf.layers.conv1d(inputs=conv_1d, filters=filters_1d, 126 | kernel_size=kernel_size_1d, strides=1, padding='same', 127 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 128 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=False) 129 | 130 | prev_1d = tf.add(conv_1d, prev_1d) 131 | 132 | out_1d = tf.expand_dims(prev_1d, axis=3) 133 | ones = tf.ones((1, PADDING_FULL_LEN)) 134 | #left_1d = tf.tensordot(out_1d, ones, [[3], [0]]) 135 | left_1d = tf.einsum('abcd,de->abce', out_1d, ones) 136 | left_1d = tf.transpose(left_1d, perm=[0,1,3,2]) 137 | right_1d = tf.transpose(left_1d, perm=[0,2,1,3]) 138 | print '1d shape', left_1d.shape, right_1d.shape 139 | 140 | input_2d = tf.concat([x2d, left_1d, right_1d], axis=3) 141 | print '2d shape', input_2d.shape 142 | 143 | prev_2d = tf.layers.conv2d(inputs=input_2d, filters=filters_2d, 144 | kernel_size=kernel_size_2d, strides=(1,1), padding='same', 145 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 146 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=False) 147 | for i in np.arange(block_num_2d): 
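            # The loop below is the 2-D residual tower: each iteration is one
            # pre-activation block (ReLU -> conv2d -> ReLU -> conv2d) whose output
            # is added back onto `prev_2d` as an identity shortcut.  Filter count,
            # kernel size and block count come from the "2d" section of
            # model_config.json; strides of (1,1) with 'same' padding keep the
            # full L x L pairwise resolution.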
148 | conv_2d = act(prev_2d) 149 | conv_2d = tf.layers.conv2d(inputs=conv_2d, filters=filters_2d, 150 | kernel_size=kernel_size_2d, strides=(1,1), padding='same', 151 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 152 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=False) 153 | conv_2d = act(conv_2d) 154 | conv_2d = tf.layers.conv2d(inputs=conv_2d, filters=filters_2d, 155 | kernel_size=kernel_size_2d, strides=(1,1), padding='same', 156 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 157 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=False) 158 | prev_2d = tf.add(conv_2d, prev_2d) 159 | 160 | logits = tf.layers.conv2d(inputs=prev_2d, filters=1, 161 | kernel_size=kernel_size_2d, strides=(1,1), padding='same', 162 | kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, 163 | kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, use_bias=True) 164 | 165 | #logits = tf.reshape(logits, (-1, tf.shape(logits)[1], tf.shape(logits)[2])) 166 | logits = tf.squeeze(logits, 3) 167 | logits_tran = tf.transpose(logits, perm=[0, 2, 1]) 168 | logits = (logits + logits_tran) / 2.0 169 | return tf.sigmoid(logits), logits 170 | 171 | def evaluate(self, mode): 172 | self.sess.run(self.iterator.initializer,\ 173 | feed_dict={self.input_tfrecord_files:self.dataset.get_chunks(mode)}) 174 | acc = [] 175 | while True: 176 | try: 177 | pred, y, size = self.sess.run([self.pred, self.y, self.size]) 178 | for y_, pred_, size_ in zip(y, pred, size): 179 | #pred_ = (pred_ + np.transpose(pred_)) / 2.0 180 | acc_ = util.TopAccuracy(pred_[:size_, :size_], y_[:size_, :size_]) 181 | acc.append(acc_) 182 | except tf.errors.OutOfRangeError: 183 | break 184 | acc = np.array(acc) 185 | acc = np.mean(acc, axis=0) 186 | acc_str = ' '.join(['%.4f ' % acc_ for acc_ in acc]) 187 | logging.info('{:s} acc: {:s}'.format(mode, acc_str)) 188 | return 189 | 190 | def build_input(self): 191 | with tf.device('/cpu:0'): 192 | def parser(record): 193 | keys_to_features = { 194 | 'x1d' :tf.FixedLenFeature([], tf.string), 195 | 'x2d' :tf.FixedLenFeature([], tf.string), 196 | 'y' :tf.FixedLenFeature([], tf.string), 197 | 'size':tf.FixedLenFeature([], tf.int64)} 198 | parsed = tf.parse_single_example(record, keys_to_features) 199 | x1d = tf.decode_raw(parsed['x1d'], tf.float32) 200 | x2d = tf.decode_raw(parsed['x2d'] ,tf.float32) 201 | size = parsed['size'] 202 | x1d = tf.reshape(x1d, tf.stack([size, -1])) 203 | x2d = tf.reshape(x2d, tf.stack([size, size, -1])) 204 | y = tf.decode_raw(parsed['y'],tf.int16) 205 | y = tf.cast(y, tf.float32) 206 | y = tf.reshape(y, tf.stack([size, size])) 207 | return x1d, x2d, y, size 208 | 209 | dataset = tf.data.TFRecordDataset(self.input_tfrecord_files) 210 | dataset = dataset.map(parser, num_parallel_calls=64) 211 | dataset = dataset.prefetch(1024) 212 | dataset = dataset.shuffle(buffer_size=512) 213 | dataset = dataset.padded_batch(self.train_config.batch_size, 214 | padded_shapes=([PADDING_FULL_LEN, self.x1d_channel_dim], 215 | [PADDING_FULL_LEN, PADDING_FULL_LEN, self.x2d_channel_dim], 216 | [PADDING_FULL_LEN, PADDING_FULL_LEN], []), 217 | padding_values=(0.0, 0.0, -1.0, np.int64(PADDING_FULL_LEN))) 218 | iterator = dataset.make_initializable_iterator() 219 | x1d, x2d, y, size = iterator.get_next() 220 | return x1d, x2d, y, size, iterator 221 | 222 | def train(self): 223 | self.x1d, self.x2d, self.y, self.size, self.iterator = 
self.build_input() 224 | 225 | with tf.device('/gpu:0'): 226 | #self.pred, logits = self.discriminator_cnn(self.x2d) 227 | #self.pred, logits = self.discriminator_resn(self.x2d) 228 | self.pred, logits = self.resn(self.x1d, self.x2d) 229 | if self.train_config.down_weight >= 1.0: 230 | mask = tf.greater_equal(self.y, 0.0) 231 | labels = tf.boolean_mask(self.y, mask) 232 | logits = tf.boolean_mask(logits, mask) 233 | self.loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = labels, logits = logits)) 234 | else: 235 | mask_pos = tf.equal(self.y, 1.0) 236 | label_pos = tf.boolean_mask(self.y, mask_pos) 237 | logit_pos = tf.boolean_mask(logits, mask_pos) 238 | loss_pos = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = label_pos, logits = logit_pos)) 239 | 240 | mask_neg = tf.equal(self.y, 0.0) 241 | label_neg = tf.boolean_mask(self.y, mask_neg) 242 | logit_neg = tf.boolean_mask(logits, mask_neg) 243 | loss_neg = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = label_neg, logits = logit_neg)) 244 | self.loss = loss_neg * self.train_config.down_weight + loss_pos 245 | 246 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)#for batch normalization 247 | with tf.control_dependencies(update_ops): 248 | if self.train_config.op_alg == 'adam': 249 | optim = tf.train.AdamOptimizer(self.train_config.learn_rate, 250 | beta1=self.train_config.beta1).minimize(self.loss) 251 | elif self.train_config.op_alg == 'sgd': 252 | optim = tf.train.GradientDescentOptimizer( 253 | self.train_config.learn_rate).minimize(self.loss) 254 | 255 | tf.summary.scalar('train_loss', self.loss) 256 | merged_summary = tf.summary.merge_all() 257 | train_writer = tf.summary.FileWriter(self.train_config.summary_dir, self.sess.graph) 258 | tf.global_variables_initializer().run() 259 | steps = 0 260 | saver = tf.train.Saver() 261 | for epoch in np.arange(self.train_config.epoch): 262 | self.sess.run(self.iterator.initializer,\ 263 | feed_dict={self.input_tfrecord_files:self.dataset.get_chunks(RunMode.TRAIN)}) 264 | train_loss = 0.0 265 | while True: 266 | try: 267 | _, _loss, summary = self.sess.run([optim, self.loss, merged_summary]) 268 | train_loss += _loss 269 | train_writer.add_summary(summary, steps) 270 | steps += 1 271 | except tf.errors.OutOfRangeError: 272 | break 273 | saver.save(self.sess, '{}/model'.format(self.train_config.model_dir), 274 | global_step=epoch) 275 | logging.info('Epoch= {:d} train_loss= {:.4f}'.format(epoch, train_loss)) 276 | self.evaluate(RunMode.VALIDATE) 277 | if self.train_config.test_file_prefix is not None: 278 | self.evaluate(RunMode.TEST) 279 | train_writer.close() 280 | 281 | def build_input_test(self): 282 | with tf.device('/cpu:0'): 283 | def parser(record): 284 | keys_to_features = { 285 | 'x1d' :tf.FixedLenFeature([], tf.string), 286 | 'x2d' :tf.FixedLenFeature([], tf.string), 287 | 'name':tf.FixedLenFeature([], tf.string), 288 | 'size':tf.FixedLenFeature([], tf.int64)} 289 | parsed = tf.parse_single_example(record, keys_to_features) 290 | x1d = tf.decode_raw(parsed['x1d'], tf.float32) 291 | x2d = tf.decode_raw(parsed['x2d'] ,tf.float32) 292 | size = parsed['size'] 293 | x1d = tf.reshape(x1d, tf.stack([size, -1])) 294 | x2d = tf.reshape(x2d, tf.stack([size, size, -1])) 295 | name = parsed['name'] 296 | return x1d, x2d, name, size 297 | 298 | dataset = tf.data.TFRecordDataset(self.input_tfrecord_files) 299 | dataset = dataset.map(parser, num_parallel_calls=64) 300 | dataset = dataset.prefetch(512) 301 | #dataset = 
dataset.shuffle(buffer_size=512)
302 |             dataset = dataset.padded_batch(self.train_config.batch_size,
303 |                 padded_shapes=([PADDING_FULL_LEN, self.x1d_channel_dim],
304 |                     [PADDING_FULL_LEN, PADDING_FULL_LEN, self.x2d_channel_dim],
305 |                     [], []),
306 |                 padding_values=(0.0, 0.0, "", np.int64(PADDING_FULL_LEN)))
307 |             iterator = dataset.make_initializable_iterator()
308 |             x1d, x2d, name, size = iterator.get_next()
309 |             return x1d, x2d, name, size, iterator
310 | 
311 |     def predict(self, output_dir, model_path):
312 |         x1d, x2d, name, size, iterator = self.build_input_test()
313 |         preds, logits = self.resn(x1d, x2d)
314 |         saver = tf.train.Saver()
315 |         saver.restore(self.sess, model_path)
316 |         self.sess.run(iterator.initializer,
317 |             feed_dict={self.input_tfrecord_files:self.dataset.get_chunks(RunMode.TEST)})
318 |         while True:
319 |             try:
320 |                 preds_, names_, sizes_ = self.sess.run([preds, name, size])
321 |                 for pred_, name_, size_ in zip(preds_, names_, sizes_):
322 |                     pred_ = pred_[:size_, :size_]
323 |                     #inds = np.triu_indices_from(pred_, k=1)
324 |                     #pred_[(inds[1], inds[0])] = pred_[inds]
325 |                     #pred_ = (pred_ + np.transpose(pred_)) / 2.0
326 |                     output_path = '{}/{}.concat'.format(output_dir, name_)
327 |                     np.savetxt(output_path, pred_)
328 |             except tf.errors.OutOfRangeError:
329 |                 break
330 | 
--------------------------------------------------------------------------------
/trainer/util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys  # used by the argument checks in TopAccuracy below
3 | from enum import Enum
4 | class RunMode(Enum):
5 |     TRAIN=1
6 |     VALIDATE=2
7 |     TEST=3
8 |     UNLABEL=4
9 | 
10 | def TopAccuracy(pred=None, truth=None, ratio=[1, 0.5, 0.2, 0.1]):
11 |     if pred is None:
12 |         print 'please provide a predicted contact matrix'
13 |         sys.exit(-1)
14 | 
15 |     if truth is None:
16 |         print 'please provide a true contact matrix'
17 |         sys.exit(-1)
18 | 
19 |     assert pred.shape[0] == pred.shape[1]
20 |     assert pred.shape == truth.shape
21 | 
22 |     pred_truth = np.dstack((pred, truth))
23 | 
24 |     M1s = np.ones_like(truth, dtype=np.int8)
25 |     mask_LR = np.triu(M1s, 24)
26 |     mask_MLR = np.triu(M1s, 12)
27 |     mask_SMLR = np.triu(M1s, 6)
28 |     mask_MR = mask_MLR - mask_LR
29 |     mask_SR = mask_SMLR - mask_MLR
30 | 
31 |     seqLen = pred.shape[0]
32 | 
33 |     accs = []
34 |     for mask in [mask_LR, mask_MR, mask_MLR, mask_SR]:
35 | 
36 |         res = pred_truth[mask.nonzero()]
37 |         res_sorted = res[(-res[:,0]).argsort()]
38 | 
39 |         for r in ratio:
40 |             numTops = int(seqLen * r)
41 |             numTops = min(numTops, res_sorted.shape[0])
42 |             topLabels = res_sorted[:numTops, 1]
43 |             #numCorrects = ( (0
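The listing of `trainer/util.py` is cut off above, so the tail of `TopAccuracy` is not shown. For orientation only, here is a rough, self-contained sketch of the standard top-L/k contact precision that the visible part of the function sets up (sort one sequence-separation range by predicted score, keep the top `seqLen * r` pairs, count the true contacts among them). The function name and the single `min_sep` parameter are simplifications assumed for illustration; the repository's own version instead builds four explicit triangular masks for long-, medium-, medium+long- and short-range pairs:

```python
import numpy as np

def top_k_contact_precision(pred, truth, ratios=(1.0, 0.5, 0.2, 0.1), min_sep=24):
    """Illustrative top-L/k precision for one sequence-separation range.

    pred  : L x L matrix of predicted contact probabilities
    truth : L x L matrix with 1 = contact, 0 = no contact, -1 = masked
    min_sep = 24 corresponds to the long-range mask (np.triu(..., 24)) above.
    """
    L = pred.shape[0]
    i, j = np.triu_indices(L, k=min_sep)      # candidate pairs in this range
    order = np.argsort(-pred[i, j])           # highest predicted score first
    labels = truth[i, j][order]
    precisions = []
    for r in ratios:
        k = max(1, min(int(L * r), labels.shape[0]))
        top = labels[:k]
        precisions.append(float(np.sum(top > 0)) / k)
    return precisions
```

Called on the cropped `pred_[:size_, :size_]` and `y_[:size_, :size_]` matrices used in `Resnet.evaluate`, this returns the top-L, top-L/2, top-L/5 and top-L/10 precisions for the chosen range.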