├── .gitignore
├── README.md
├── libs
│   ├── __init__.py
│   ├── config
│   │   ├── __init__.py
│   │   └── config.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── data_factory.py
│   │   └── data_preprocessing.py
│   ├── nets
│   │   ├── __init__.py
│   │   ├── highway_network.py
│   │   └── network.py
│   └── utils
│       ├── __init__.py
│       ├── acc_cal_v1.py
│       └── acc_cal_v2.py
├── models
│   ├── model.ckpt-20000.data-00000-of-00001
│   └── model.ckpt-20000.index
├── read_data_into_tfrecord.py
├── test.py
└── train.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | output
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep-learning-for-contact\_map\_v2
2 | 
3 | This is an **unofficial** implementation of [Accurate De Novo Prediction of Protein Contact Map by Ultra-Deep Learning Model](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005324), a deep learning method for protein contact map prediction that takes contact maps predicted by other software (for example CCMpred, PSICOV, and so on) as input.
4 | 
5 | ## Requirements
6 | 
7 | - [python2.7]()
8 | - [Tensorflow (>= 1.0.0)](https://www.tensorflow.org/install/install_linux)
9 | - [Numpy](https://github.com/numpy/numpy/blob/master/INSTALL.rst.txt)
10 | 
11 | ## Introduction
12 | - Network structure: two networks (a residual network and a highway network) are implemented;
13 | - Batch normalization and L2 regularization are implemented for optimization.
14 | 
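## Expected data format

The pickle files consumed by `read_data_into_tfrecord.py` (see `libs/datasets/data_preprocessing.py`) are expected to hold a list of dicts, one per protein. Below is a minimal sketch of one record; the exact per-feature widths are assumptions, chosen so that the 1D channels sum to the 26 and the 2D channels to the 5 that `train.py` expects:

```python
import numpy as np

L = 120  # example protein length
record = {
    'name': '1abcA',                           # protein identifier
    'sequence': 'M' * L,                       # amino-acid sequence, len == L
    'PSSM': np.zeros((L, 20)),                 # sequence profile
    'SS3': np.zeros((L, 3)),                   # 3-state secondary structure
    'ACC': np.zeros((L, 3)),                   # solvent accessibility
    'ccmpredZ': np.zeros((L, L)),              # Z-scored CCMpred map
    'psicovZ': np.zeros((L, L)),               # Z-scored PSICOV map
    'OtherPairs': np.zeros((L, L, 3)),         # extra pairwise features
    'contactMatrix': np.zeros((L, L), dtype=np.int8),  # contacts in {-1, 0, 1}
}
```
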
15 | ## Need to do
16 | 1. Get protein 1D structure features (for example sequence, SS3, ACC) and 2D features (for example predicted CCMpred, PSICOV and other pairwise features)
17 | 2. Modify `./read_data_into_tfrecord.py`, and use it to convert your data to TFRecord files
18 | 3. Set your own config in `./libs/config/config.py`
19 | 4. Run `python train.py`
20 | 
--------------------------------------------------------------------------------
/libs/__init__.py:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/doubleQ2018/Deep-learning-for-contact_map_v2/a039e99d39ce99ffde42ef4c6a429b5c9055e2b3/libs/__init__.py
--------------------------------------------------------------------------------
/libs/config/__init__.py:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/doubleQ2018/Deep-learning-for-contact_map_v2/a039e99d39ce99ffde42ef4c6a429b5c9055e2b3/libs/config/__init__.py
--------------------------------------------------------------------------------
/libs/config/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import tensorflow as tf
4 | 
5 | # database files
6 | tf.app.flags.DEFINE_string(
7 |     'train_file', 'train.pkl',
8 |     'Pickle file containing the training set.')
9 | 
10 | tf.app.flags.DEFINE_string(
11 |     'valid_file', 'valid.pkl',
12 |     'Pickle file containing the validation set.')
13 | 
14 | tf.app.flags.DEFINE_string(
15 |     'test_file', 'test.pkl',
16 |     'Pickle file containing the test set.')
17 | 
18 | # dir paths
19 | tf.app.flags.DEFINE_string(
20 |     'train_dir', './output/residual_network/',
21 |     'Directory where checkpoints and event logs are written to.')
22 | 
23 | tf.app.flags.DEFINE_string(
24 |     'data_dir', './data/',
25 |     'Directory of the database.')
26 | 
27 | # network building params
28 | tf.app.flags.DEFINE_integer(
29 |     'filter_size_1d', 17,
30 |     'filter size for 1D conv.')
31 | 
32 | tf.app.flags.DEFINE_integer(
33 |     'filter_size_2d', 3,
34 |     'filter size for 2D conv.')
35 | 
36 | tf.app.flags.DEFINE_integer(
37 |     'block_num_1d', 1,
38 |     'number of residual blocks for 1D conv.')
39 | 
40 | tf.app.flags.DEFINE_integer(
41 |     'block_num_2d', 20,
42 |     'number of residual blocks for 2D conv.')
43 | 
44 | # net training params
45 | tf.app.flags.DEFINE_integer(
46 |     'max_iters', 100000,
47 |     'maximum number of training iterations')
48 | 
49 | # restore model
50 | tf.app.flags.DEFINE_bool(
51 |     'restore_previous_if_exists', True,
52 |     'restore the previously trained model if one exists')
53 | 
54 | FLAGS = tf.app.flags.FLAGS
55 | 
--------------------------------------------------------------------------------
/libs/datasets/__init__.py:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/doubleQ2018/Deep-learning-for-contact_map_v2/a039e99d39ce99ffde42ef4c6a429b5c9055e2b3/libs/datasets/__init__.py
--------------------------------------------------------------------------------
/libs/datasets/data_factory.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import tensorflow as tf
4 | import glob
5 | from tensorflow.python.lib.io.tf_record import TFRecordCompressionType
6 | 
7 | def get_dataset(split_name, dataset_dir, file_pattern=None):
8 |     if file_pattern is None:
9 |         file_pattern = split_name + '*.tfrecord'
10 |     tfrecords = glob.glob(dataset_dir + '/records/' + file_pattern)
11 |     name, seqLen, seq_feature, pair_feature, label = read_tfrecord(tfrecords)
12 | 
13 |     return name, seqLen, seq_feature, pair_feature, label
14 | 
15 | 
16 | def read_tfrecord(tfrecords_filename):
17 |     if not isinstance(tfrecords_filename, list):
18 |         tfrecords_filename = [tfrecords_filename]
19 |     filename_queue = tf.train.string_input_producer(
20 |         tfrecords_filename, num_epochs=100)
21 | 
22 |     options = tf.python_io.TFRecordOptions(TFRecordCompressionType.ZLIB)
23 |     reader = tf.TFRecordReader(options=options)
24 |     _, serialized_example = reader.read(filename_queue)
25 |     features = tf.parse_single_example(
26 |         serialized_example,
27 |         features={
28 |             'name': tf.FixedLenFeature([], tf.string),
29 |             'seqLen': tf.FixedLenFeature([], tf.int64),
30 |             'seq_feature': tf.FixedLenFeature([], tf.string),
31 |             'pair_feature': tf.FixedLenFeature([], tf.string),
32 |             'label_matrix': tf.FixedLenFeature([], tf.string),
33 |         })
34 |     name = features['name']
35 |     seqLen = tf.cast(features['seqLen'], tf.int32)
36 |     seq_feature = tf.decode_raw(features['seq_feature'], tf.float32)
37 |     seq_feature = tf.reshape(seq_feature, [seqLen, -1]) # reshape seq feature to shape = (L, feature_maps)
38 |     pair_feature = tf.decode_raw(features['pair_feature'], tf.float32)
39 |     pair_feature = tf.reshape(pair_feature, [seqLen, seqLen, -1]) # reshape pair feature to shape = (L, L, feature_maps)
40 |     label = tf.decode_raw(features['label_matrix'], tf.uint8)
41 |     label = tf.reshape(label, [seqLen, seqLen, 1]) # reshape label to shape = (L, L, 1)
42 | 
43 |     return name, seqLen, seq_feature, pair_feature, label
44 | 
45 | def test():
46 |     dataset_dir = "data/"
47 |     split_name = "train"
48 |     name, seqLen, seq_feature, pair_feature, label = get_dataset(split_name, dataset_dir)
49 | 
50 |     init = tf.initialize_local_variables()
51 |     sess = tf.Session()
52 |     sess.run(init)
53 |     tf.train.start_queue_runners(sess=sess)
54 |     name, seqLen, seq, pair, label = sess.run([name, seqLen, seq_feature, pair_feature, label])
55 |     print name
56 |     print seqLen
57 |     print seq.shape
58 |     print pair.shape
59 |     for l in label:
60 |         print ''.join([str(i) for i in l])
61 | 
62 | #test()
63 | 
--------------------------------------------------------------------------------
/libs/datasets/data_preprocessing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import os
4 | import cPickle as pickle
5 | import numpy as np
6 | import math
7 | import tensorflow as tf
8 | from tensorflow.python.lib.io.tf_record import TFRecordCompressionType
9 | from libs.config.config import *
10 | 
11 | 
12 | def _int64_feature(values):
13 |     if not isinstance(values, (tuple, list)):
14 |         values = [values]
15 |     return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
16 | 
17 | def _bytes_feature(values):
18 |     return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))
19 | 
20 | 
21 | def to_tfexample_raw(name, seqLen, seq_feature, pair_feature, label_data):
22 |     return tf.train.Example(features=tf.train.Features(feature={
23 |         'name': _bytes_feature(name),
24 |         'seqLen': _int64_feature(seqLen),
25 |         'seq_feature': _bytes_feature(seq_feature),   # of shape (L, 26)
26 |         'pair_feature': _bytes_feature(pair_feature), # of shape (L, L, 5)
27 |         'label_matrix': _bytes_feature(label_data),   # of shape (L, L)
28 |     }))
29 | 
30 | def get_dataset_filename(dataset_dir, split_name, shard_id, num_shards):
31 |     output_filename = '%s_%05d-of-%05d.tfrecord' % (split_name, shard_id, num_shards)
32 |     return os.path.join(dataset_dir, output_filename)
33 | 
34 | def extract_single(info):
35 |     name = info['name']
36 |     seq = info['sequence']
37 |     seqLen = len(seq)
38 |     acc = info['ACC']
39 |     ss3 = info['SS3']
40 |     pssm = info['PSSM']
41 |     sequence_profile = np.concatenate((pssm, ss3, acc), axis = 1)
42 |     ccmpred = info['ccmpredZ']
43 |     psicov = info['psicovZ']
44 |     other = info['OtherPairs']
45 |     pairwise_profile = np.dstack((ccmpred, psicov))
46 |     pairwise_profile = np.concatenate((pairwise_profile, other), axis = 2) # shape = (L, L, 5)
47 |     true_contact = info['contactMatrix']
48 |     true_contact[true_contact < 0] = 0 # convert -1 to 0, shape = (L, L)
49 |     true_contact = np.tril(true_contact, k=-6) + np.triu(true_contact, k=6) # remove the near-diagonal (|i-j| < 6) contacts
50 |     true_contact = true_contact.astype(np.uint8)
51 | 
52 |     return name, seqLen, sequence_profile, pairwise_profile, true_contact
53 | 
54 | def add_to_tfrecord(records_dir, split_name, infos):
55 |     """Writes protein features and contact maps to sharded TFRecord files.
56 |     Note: feature matrices lose their shape info after converting to string.
57 |     """
58 |     num_shards = max(1, int(len(infos) / 1000)) # at least one shard, avoids division by zero below
59 |     num_per_shard = int(math.ceil(len(infos) / float(num_shards)))
60 | 
61 |     with tf.Graph().as_default(), tf.device('/cpu:0'):
62 |         with tf.Session('') as sess:
63 |             for shard_id in range(num_shards):
64 |                 record_filename = get_dataset_filename(records_dir, split_name, shard_id, num_shards)
65 |                 options = tf.python_io.TFRecordOptions(TFRecordCompressionType.ZLIB)
66 |                 with tf.python_io.TFRecordWriter(record_filename, options=options) as tfrecord_writer:
67 |                     start_ndx = shard_id * num_per_shard
68 |                     end_ndx = min((shard_id + 1) * num_per_shard, len(infos))
69 |                     print "processing %s_data from %d to %d..." %(split_name, start_ndx, end_ndx)
70 |                     for i in range(start_ndx, end_ndx):
71 |                         info = infos[i]
72 |                         name, seqLen, seq_feature, pair_feature, label = extract_single(info)
73 |                         if seqLen > 300:
74 |                             continue
75 |                         #print "generate tfrecord for %s" %name
76 |                         seq_feature = seq_feature.astype(np.float32)
77 |                         pair_feature = pair_feature.astype(np.float32)
78 |                         label = label.astype(np.uint8)
79 | 
80 |                         example = to_tfexample_raw(name, seqLen, seq_feature.tostring(), pair_feature.tostring(), label.tostring())
81 |                         tfrecord_writer.write(example.SerializeToString())
82 | 
--------------------------------------------------------------------------------
/libs/nets/highway_network.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import tensorflow as tf
4 | import numpy as np
5 | 
6 | def weight_variable(shape, regularizer, name="W"):
7 |     if regularizer == None:
8 |         initial = tf.truncated_normal(shape, stddev=0.1)
9 |         return tf.Variable(initial, name)
10 |     else:
11 |         return tf.get_variable(name, shape,
12 |             initializer=tf.random_normal_initializer(), regularizer=regularizer)
13 | 
14 | def bias_variable(shape, name="b"):
15 |     initial = tf.constant(0.1, shape=shape)
16 |     return tf.Variable(initial, name)
17 | 
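# To make the pairwise construction below concrete: seq2pairwise turns
# per-residue features into pairwise features by concatenating, for every
# pair (i, j), the feature vectors at i, at j and at the midpoint (i+j)/2,
# so C input channels become 3*C output channels. A NumPy sketch of the same
# computation for a single (L, C) matrix (seq2pairwise_np is a hypothetical
# helper, not part of the original file; the TF version below additionally
# carries a batch dimension and swaps the order of the first two axes):
def seq2pairwise_np(x):
    L = x.shape[0]
    i, j = np.meshgrid(np.arange(L), np.arange(L), indexing='ij')
    m = (i + j) // 2
    # out[a, b] = [x[a], x[b], x[(a+b)//2]] along the channel axis
    return np.concatenate([x[i], x[j], x[m]], axis=-1)
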
18 | ### Incoming shape: (batch_size, L (seqLen), feature_num)
19 | ### Output[:, i, j, :] = concatenation of incoming[:, i, :], incoming[:, j, :] and incoming[:, (i+j)/2, :]
20 | def seq2pairwise(incoming):
21 |     L = tf.shape(incoming)[1]
22 |     # save the indexes of each position
23 |     v = tf.range(0, L, 1)
24 |     i, j = tf.meshgrid(v, v)
25 |     m = (i+j)/2
26 |     # switch batch dim with L dim to put L first
27 |     incoming2 = tf.transpose(incoming, perm=[1, 0, 2])
28 |     # gather the rows of incoming2 selected by the index matrices i, j and m
29 |     out1 = tf.nn.embedding_lookup(incoming2, i)
30 |     out2 = tf.nn.embedding_lookup(incoming2, j)
31 |     out3 = tf.nn.embedding_lookup(incoming2, m)
32 |     # concatenate along the final feature dim
33 |     out = tf.concat([out1, out2, out3], axis=3)
34 |     # return to original dims
35 |     output = tf.transpose(out, perm=[2, 0, 1, 3])
36 |     return output
37 | 
38 | def highway_1d(incoming, out_channels, filter_size, \
39 |         regularizer, batch_norm=False, scope=None, name="highway_net1d"):
40 |     net = incoming
41 |     in_channels = incoming.get_shape().as_list()[-1]
42 |     ident = net
43 |     with tf.variable_scope(scope, default_name = name, values=[incoming]) as scope:
44 |         # transform conv (H) and gate conv (T) of the highway block
45 |         W = weight_variable([filter_size, in_channels, out_channels], \
46 |             regularizer, name="W")
47 |         b = bias_variable([out_channels], name="b")
48 |         W_T = weight_variable([filter_size, in_channels, out_channels], \
49 |             regularizer, name="W_T")
50 |         b_T = bias_variable([out_channels], name="b_T")
51 | 
52 |         H = tf.nn.conv1d(net, W, stride=1, padding='SAME') + b
53 |         if batch_norm:
54 |             H = tf.contrib.layers.batch_norm(H)
55 |         H = tf.nn.relu(H)
56 | 
57 |         T = tf.nn.conv1d(net, W_T, stride=1, padding='SAME') + b_T
58 |         if batch_norm:
59 |             T = tf.contrib.layers.batch_norm(T)
60 |         T = tf.nn.relu(T)
61 |         C = tf.subtract(1.0, T, name="carry_gate")
62 | 
63 |         if in_channels != out_channels:
64 |             ch = (out_channels - in_channels)//2
65 |             remain = out_channels-in_channels-ch
66 |             ident = tf.pad(ident, [[0, 0], [0, 0], [ch, remain]])
67 |             in_channels = out_channels
68 | 
69 |         net = tf.add(tf.multiply(H, T), tf.multiply(ident, C))
70 |     return net
71 | 
72 | 
73 | def highway_2d(incoming, out_channels, filter_size, \
74 |         regularizer, batch_norm=False, scope=None, name="highway_net2d"):
75 |     net = incoming
76 |     in_channels = incoming.get_shape().as_list()[-1]
77 |     ident = net
78 |     with tf.variable_scope(scope, default_name = name, values=[incoming]) as scope:
79 |         # transform conv (H) and gate conv (T) of the highway block
80 |         W = weight_variable([filter_size, filter_size, in_channels, out_channels], \
81 |             regularizer, name="W")
82 |         b = bias_variable([out_channels], name="b")
83 |         W_T = weight_variable([filter_size, filter_size, in_channels, out_channels], \
84 |             regularizer, name="W_T")
85 |         b_T = bias_variable([out_channels], name="b_T")
86 | 
87 |         H = tf.nn.conv2d(net, W, strides=[1,1,1,1], padding='SAME') + b
88 |         if batch_norm:
89 |             H = tf.contrib.layers.batch_norm(H)
90 |         H = tf.nn.relu(H)
91 | 
92 |         T = tf.nn.conv2d(net, W_T, strides=[1,1,1,1], padding='SAME') + b_T
93 |         if batch_norm:
94 |             T = tf.contrib.layers.batch_norm(T)
95 |         T = tf.nn.relu(T)
96 |         C = tf.subtract(1.0, T, name="carry_gate")
97 | 
98 |         if in_channels != out_channels:
99 |             ch = (out_channels - in_channels)//2
100 |             remain = out_channels-in_channels-ch
101 |             ident = tf.pad(ident, [[0, 0], [0, 0], [0, 0], [ch, remain]])
102 |             in_channels = out_channels
103 | 
104 |         net = tf.add(tf.multiply(H, T), tf.multiply(ident, C))
105 |     return net
106 | 
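# Quick shape check for the blocks above (a hypothetical smoke test, not part
# of the original file; assumes the same TF 1.x APIs used throughout this
# repo): one highway_1d block grows a 26-channel input to 28 channels, with
# the identity path zero-padded to match.
def _highway_shape_check():
    x = tf.constant(np.random.rand(1, 50, 26).astype(np.float32))
    net = highway_1d(x, 28, 17, regularizer=None)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print sess.run(net).shape  # expected: (1, 50, 28)
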
107 | def one_hot(contact_map):
108 |     # expand the (L, L) contact map into a (L, L, 2) one-hot encoding
109 |     tmp = np.where(contact_map > 0, 0, 1)
110 |     true_contact = np.stack((tmp, contact_map), axis=-1)
111 |     return true_contact.astype(np.float32)
112 | 
113 | def build_loss(output_prob, y, weight=None):
114 |     y = tf.py_func(one_hot, [y], tf.float32)
115 |     los = -tf.reduce_mean(tf.multiply(tf.log(tf.clip_by_value(output_prob,1e-10,1.0)), y))
116 |     return los
117 | 
118 | def build(input_1d, input_2d, label,
119 |         filter_size_1d=17, filter_size_2d=3, block_num_1d=0, block_num_2d=10,
120 |         regulation=True, batch_norm=True):
121 | 
122 |     regularizer = None
123 |     if regulation:
124 |         regularizer = tf.contrib.layers.l2_regularizer(scale=0.1)
125 | 
126 |     net = input_1d
127 | 
128 |     channel_step = 2
129 |     ######## 1d Highway Network ##########
130 |     out_channels = net.get_shape().as_list()[-1]
131 |     for i in xrange(block_num_1d): # build the 1D highway blocks
132 |         out_channels += channel_step
133 |         net = highway_1d(net, out_channels, filter_size_1d,
134 |             regularizer, batch_norm=batch_norm, name="Highway_1D_"+str(i))
135 | 
136 |     #######################################
137 | 
138 |     # Conversion of sequential to pairwise feature
139 |     with tf.name_scope('1d_to_2d'):
140 |         net = seq2pairwise(net)
141 | 
142 |     # Merge coevolution info (pairwise potential) and the feature above
143 |     if block_num_1d == 0:
144 |         net = input_2d
145 |     else:
146 |         net = tf.concat([net, input_2d], axis=3)
147 |     out_channels = net.get_shape().as_list()[-1]
148 | 
149 |     ######## 2d Highway Network ##########
150 |     for i in xrange(block_num_2d): # build the 2D highway blocks
151 |         out_channels += channel_step
152 |         net = highway_2d(net, out_channels, filter_size_2d,
153 |             regularizer, batch_norm=batch_norm, name="Highway_2D_"+str(i))
154 |     #######################################
155 | 
156 |     # softmax the channels of each pair into a contact score
157 |     with tf.variable_scope('softmax_layer', values=[net]) as scope:
158 |         W_out = weight_variable([1, 1, out_channels, 2], regularizer, 'W')
159 |         b_out = bias_variable([2], 'b')
160 |         output_prob = tf.nn.softmax(tf.nn.conv2d(net, W_out, strides=[1,1,1,1], padding='SAME') + b_out)
161 | 
162 |     with tf.name_scope('loss_function'):
163 |         loss = build_loss(output_prob, label)
164 |         if regulation:
165 |             reg_variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
166 |             reg_term = tf.contrib.layers.apply_regularization(regularizer, reg_variables)
167 |             loss += reg_term
168 |     tf.summary.scalar('loss', loss)
169 |     output = {}
170 |     output['output_prob'] = output_prob
171 |     output['loss'] = loss
172 | 
173 |     return output
174 | 
--------------------------------------------------------------------------------
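Both libs/nets/highway_network.py (above) and libs/nets/network.py (below) expose the same build() interface, so either can be dropped into train.py and test.py. A minimal usage sketch mirroring test.py (the placeholder shapes assume 26 sequence channels and 5 pairwise channels, as in train.py):

import tensorflow as tf
import libs.nets.highway_network as network

input_1d = tf.placeholder("float", shape=[None, None, 26])       # (batch, L, 26)
input_2d = tf.placeholder("float", shape=[None, None, None, 5])  # (batch, L, L, 5)
label = tf.placeholder("float", shape=None)
output = network.build(input_1d, input_2d, label,
                       filter_size_1d=17, filter_size_2d=3,
                       block_num_1d=1, block_num_2d=20,
                       regulation=True, batch_norm=True)
prob, loss = output['output_prob'], output['loss']  # prob: (batch, L, L, 2)
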
/libs/nets/network.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import tensorflow as tf
4 | import numpy as np
5 | 
6 | def weight_variable(shape, regularizer, name="W"):
7 |     if regularizer == None:
8 |         initial = tf.truncated_normal(shape, stddev=0.1)
9 |         return tf.Variable(initial, name)
10 |     else:
11 |         return tf.get_variable(name, shape,
12 |             initializer=tf.random_normal_initializer(), regularizer=regularizer)
13 | 
14 | 
15 | def bias_variable(shape, name="b"):
16 |     initial = tf.constant(0.1, shape=shape)
17 |     return tf.Variable(initial, name)
18 | 
19 | ### Incoming shape: (batch_size, L (seqLen), feature_num)
20 | ### Output[:, i, j, :] = concatenation of incoming[:, i, :], incoming[:, j, :] and incoming[:, (i+j)/2, :]
21 | def seq2pairwise(incoming):
22 |     L = tf.shape(incoming)[1]
23 |     # save the indexes of each position
24 |     v = tf.range(0, L, 1)
25 |     i, j = tf.meshgrid(v, v)
26 |     m = (i+j)/2
27 |     # switch batch dim with L dim to put L first
28 |     incoming2 = tf.transpose(incoming, perm=[1, 0, 2])
29 |     # gather the rows of incoming2 selected by the index matrices i, j and m
30 |     out1 = tf.nn.embedding_lookup(incoming2, i)
31 |     out2 = tf.nn.embedding_lookup(incoming2, j)
32 |     out3 = tf.nn.embedding_lookup(incoming2, m)
33 |     # concatenate along the final feature dim
34 |     out = tf.concat([out1, out2, out3], axis=3)
35 |     # return to original dims
36 |     output = tf.transpose(out, perm=[2, 0, 1, 3])
37 |     return output
38 | 
39 | def build_block_1d(incoming, out_channels, filter_size,
40 |         regularizer, batch_norm=False, scope=None, name="ResidualBlock_1d"):
41 | 
42 |     net = incoming
43 |     in_channels = incoming.get_shape().as_list()[-1]
44 |     ident = net
45 |     with tf.variable_scope(scope, default_name = name, values=[incoming]) as scope:
46 |         # 1st conv layer in residual block
47 |         W1 = weight_variable([filter_size, in_channels, out_channels], regularizer, name="W1")
48 |         #variable_summaries(W1)
49 |         b1 = bias_variable([out_channels], name="b1")
50 |         #variable_summaries(b1)
51 |         net = tf.nn.conv1d(net, W1, stride=1, padding='SAME') + b1
52 |         ### Add batch normalization
53 |         if batch_norm:
54 |             net = tf.contrib.layers.batch_norm(net)
55 |         net = tf.nn.relu(net)
56 |         # 2nd conv layer in residual block
57 |         W2 = weight_variable([filter_size, out_channels, out_channels], regularizer, name="W2")
58 |         #variable_summaries(W2)
59 |         b2 = bias_variable([out_channels], name="b2")
60 |         #variable_summaries(b2)
61 |         net = tf.nn.conv1d(net, W2, stride=1, padding='SAME') + b2
62 |         ### Add batch normalization
63 |         if batch_norm:
64 |             net = tf.contrib.layers.batch_norm(net)
65 |         net = tf.nn.relu(net)
66 |         if in_channels != out_channels:
67 |             ch = (out_channels - in_channels)//2
68 |             remain = out_channels-in_channels-ch
69 |             ident = tf.pad(ident, [[0, 0], [0, 0], [ch, remain]])
70 |             in_channels = out_channels
71 |         # Add the original features back to the result (identity shortcut)
72 |         net = net + ident
73 |     return net
74 | 
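# When out_channels exceeds in_channels, the identity shortcut above is
# zero-padded along the channel axis so that net + ident stays well-defined;
# the padding is split as evenly as possible before and after the existing
# channels. Illustrative values (not part of the original file):
#   in_channels, out_channels = 26, 28
#   ch = (28 - 26) // 2        # -> 1 zero channel padded in front
#   remain = 28 - 26 - ch      # -> 1 zero channel padded behind
#   tf.pad(ident, [[0, 0], [0, 0], [ch, remain]])  # (B, L, 26) -> (B, L, 28)
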
75 | def build_block_2d(incoming, out_channels, filter_size,
76 |         regularizer, batch_norm=False, scope=None, name="ResidualBlock_2d"):
77 | 
78 |     net = incoming
79 |     in_channels = incoming.get_shape().as_list()[-1]
80 |     ident = net
81 |     with tf.variable_scope(scope, default_name = name, values=[incoming]) as scope:
82 |         # 1st conv layer in residual block
83 |         W1 = weight_variable([filter_size, filter_size, in_channels, out_channels], regularizer, name="W1")
84 |         #variable_summaries(W1)
85 |         b1 = bias_variable([out_channels], name="b1")
86 |         #variable_summaries(b1)
87 |         net = tf.nn.conv2d(net, W1, strides=[1,1,1,1], padding='SAME') + b1
88 |         ### Add batch normalization
89 |         if batch_norm:
90 |             net = tf.contrib.layers.batch_norm(net)
91 |         net = tf.nn.relu(net)
92 |         ### 2nd conv layer in residual block
93 |         W2 = weight_variable([filter_size, filter_size, out_channels, out_channels], regularizer, name="W2")
94 |         #variable_summaries(W2)
95 |         b2 = bias_variable([out_channels], name="b2")
96 |         #variable_summaries(b2)
97 |         net = tf.nn.conv2d(net, W2, strides=[1,1,1,1], padding='SAME') + b2
98 |         ### Add batch normalization
99 |         if batch_norm:
100 |             net = tf.contrib.layers.batch_norm(net)
101 |         net = tf.nn.relu(net)
102 |         if in_channels != out_channels:
103 |             ch = (out_channels - in_channels)//2
104 |             remain = out_channels-in_channels-ch
105 |             ident = tf.pad(ident, [[0, 0], [0, 0], [0, 0], [ch, remain]])
106 |             in_channels = out_channels
107 |         ### Add the original features back to the result (identity shortcut)
108 |         net = net + ident
109 |     return net
110 | 
111 | def one_hot(contact_map):
112 |     # expand the (L, L) contact map into a (L, L, 2) one-hot encoding
113 |     tmp = np.where(contact_map > 0, 0, 1)
114 |     true_contact = np.stack((tmp, contact_map), axis=-1)
115 |     return true_contact.astype(np.float32)
116 | 
117 | def build_loss(output_prob, y, weight=None):
118 |     y = tf.py_func(one_hot, [y], tf.float32)
119 |     los = -tf.reduce_mean(tf.multiply(tf.log(tf.clip_by_value(output_prob,1e-10,1.0)), y))
120 |     return los
121 | 
122 | def build(input_1d, input_2d, label,
123 |         filter_size_1d=17, filter_size_2d=3, block_num_1d=0, block_num_2d=10,
124 |         regulation=True, batch_norm=True):
125 | 
126 |     regularizer = None
127 |     if regulation:
128 |         regularizer = tf.contrib.layers.l2_regularizer(scale=0.1)
129 | 
130 |     net = input_1d
131 | 
132 |     channel_step = 2
133 |     ######## 1d Residual Network ##########
134 |     out_channels = net.get_shape().as_list()[-1]
135 |     for i in xrange(block_num_1d): # build the 1D residual blocks
136 |         out_channels += channel_step
137 |         net = build_block_1d(net, out_channels, filter_size_1d,
138 |             regularizer, batch_norm=batch_norm, name="ResidualBlock_1D_"+str(i))
139 | 
140 |     #######################################
141 | 
142 |     # Conversion of sequential to pairwise feature
143 |     with tf.name_scope('1d_to_2d'):
144 |         net = seq2pairwise(net)
145 | 
146 |     # Merge coevolution info (pairwise potential) and the feature above
147 |     if block_num_1d == 0:
148 |         net = input_2d
149 |     else:
150 |         net = tf.concat([net, input_2d], axis=3)
151 |     out_channels = net.get_shape().as_list()[-1]
152 | 
153 |     ######## 2d Residual Network ##########
154 |     for i in xrange(block_num_2d): # build the 2D residual blocks
155 |         out_channels += channel_step
156 |         net = build_block_2d(net, out_channels, filter_size_2d,
157 |             regularizer, batch_norm=batch_norm, name="ResidualBlock_2D_"+str(i))
158 |     #######################################
159 | 
160 |     # softmax the channels of each pair into a contact score
161 |     with tf.variable_scope('softmax_layer', values=[net]) as scope:
162 |         W_out = weight_variable([1, 1, out_channels, 2], regularizer, 'W')
163 |         b_out = bias_variable([2], 'b')
164 |         output_prob = tf.nn.softmax(tf.nn.conv2d(net, W_out, strides=[1,1,1,1], padding='SAME') + b_out)
165 | 
166 |     with tf.name_scope('loss_function'):
167 |         loss = build_loss(output_prob, label)
168 |         if regulation:
169 |             reg_variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
170 |             reg_term = tf.contrib.layers.apply_regularization(regularizer, reg_variables)
171 |             loss += reg_term
172 |     tf.summary.scalar('loss', loss)
173 |     output = {}
174 |     output['output_prob'] = output_prob
175 |     output['loss'] = loss
176 | 
177 |     return output
178 | 
179 | 
--------------------------------------------------------------------------------
/libs/utils/__init__.py:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/doubleQ2018/Deep-learning-for-contact_map_v2/a039e99d39ce99ffde42ef4c6a429b5c9055e2b3/libs/utils/__init__.py
--------------------------------------------------------------------------------
/libs/utils/acc_cal_v1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import os
4 | import numpy as np
5 | 
6 | def topKaccuracy(y_out, y, k):
7 |     L = y.shape[0]
8 | 
9 |     m =
np.ones_like(y, dtype=np.int8) 10 | lm = np.triu(m, 24) 11 | mm = np.triu(m, 12) 12 | sm = np.triu(m, 6) 13 | 14 | avg_pred = (y_out + y_out.transpose((1, 0))) / 2.0 15 | truth = np.concatenate((avg_pred[..., np.newaxis], y[..., np.newaxis]), axis=-1) 16 | 17 | accs = [] 18 | for x in [lm, mm, sm]: 19 | selected_truth = truth[x.nonzero()] 20 | selected_truth_sorted = selected_truth[(selected_truth[:, 0]).argsort()[::-1]] 21 | tops_num = min(selected_truth_sorted.shape[0], L/k) 22 | truth_in_pred = selected_truth_sorted[:, 1].astype(np.int8) 23 | corrects_num = np.bincount(truth_in_pred[0: tops_num], minlength=2) 24 | acc = 1.0 * corrects_num[1] / (tops_num + 0.0001) 25 | accs.append(acc) 26 | 27 | return accs 28 | 29 | def topLmatrix(predict_matrix): 30 | m, n = predict_matrix.shape 31 | points = [] 32 | for i in xrange(m): 33 | for j in xrange(i+1, n): 34 | points.append((i, j, predict_matrix[i][j])) 35 | topL = sorted(points, key = lambda x: x[-1], reverse = True)[: max(m, n)] 36 | matrix = np.zeros((m, n), dtype = np.uint8) 37 | for p in topL: 38 | matrix[p[0]][p[1]] = 1 39 | return matrix 40 | 41 | def evaluate(predict_matrix, contact_matrix): 42 | acc_k_1 = topKaccuracy(predict_matrix, contact_matrix, 1) 43 | acc_k_2 = topKaccuracy(predict_matrix, contact_matrix, 2) 44 | acc_k_5 = topKaccuracy(predict_matrix, contact_matrix, 5) 45 | acc_k_10 = topKaccuracy(predict_matrix, contact_matrix, 10) 46 | tmp = [] 47 | tmp.append(acc_k_1) 48 | tmp.append(acc_k_2) 49 | tmp.append(acc_k_5) 50 | tmp.append(acc_k_10) 51 | return tmp 52 | 53 | def output_result(avg_acc): 54 | print "Long Range:" 55 | print "Method L/10 L/5 L/2 L" 56 | print "Acc : %.3f %.3f %.3f %.3f" \ 57 | %(avg_acc[3][0], avg_acc[2][0], avg_acc[1][0], avg_acc[0][0]) 58 | print "Medium Range:" 59 | print "Method L/10 L/5 L/2 L" 60 | print "Acc : %.3f %.3f %.3f %.3f" \ 61 | %(avg_acc[3][1], avg_acc[2][1], avg_acc[1][1], avg_acc[0][1]) 62 | print "Short Range:" 63 | print "Method L/10 L/5 L/2 L" 64 | print "Acc : %.3f %.3f %.3f %.3f" \ 65 | %(avg_acc[3][2], avg_acc[2][2], avg_acc[1][2], avg_acc[0][2]) 66 | 67 | def test(): 68 | with open("data/PSICOV/psicov.list", "r") as fin: 69 | names = [line.rstrip("\n") for line in fin] 70 | 71 | accs = [] 72 | for i in range(len(names)): 73 | name = names[i] 74 | print "processing in %d: %s" %(i+1, name) 75 | 76 | #prediction_path = "data/PSICOV/clm/" 77 | #prediction_path = "data/PSICOV/ccmpred" 78 | #prediction_path = "data/PSICOV/psicov_matrix" 79 | prediction_path = "data/PSICOV/mf_matrix" 80 | f = os.path.join(prediction_path, name + ".mfDCA") 81 | if not os.path.exists(f): 82 | print "not exist..." 
83 | continue 84 | y_out = np.loadtxt(f) 85 | 86 | dist_path = "data/PSICOV/dis/" 87 | y = np.loadtxt(os.path.join(dist_path, name + ".dis")) 88 | y[y > 8] = 0 89 | y[y != 0] = 1 90 | y = y.astype(np.int8) 91 | y = np.tril(y, k=-6) + np.triu(y, k=6) 92 | 93 | acc = evaluate(y_out, y) 94 | accs.append(acc) 95 | accs = np.array(accs) 96 | avg_acc = np.mean(accs, axis=0) 97 | output_result(avg_acc) 98 | 99 | -------------------------------------------------------------------------------- /libs/utils/acc_cal_v2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import numpy as np 5 | 6 | def topKaccuracy(y_out, y, k): 7 | L = y.shape[0] 8 | 9 | m = np.ones_like(y, dtype=np.int8) 10 | lm = np.triu(m, 24) 11 | mm = np.triu(m, 12) 12 | sm = np.triu(m, 6) 13 | 14 | sm = sm - mm 15 | mm = mm - lm 16 | 17 | avg_pred = (y_out + y_out.transpose((1, 0))) / 2.0 18 | truth = np.concatenate((avg_pred[..., np.newaxis], y[..., np.newaxis]), axis=-1) 19 | 20 | accs = [] 21 | for x in [lm, mm, sm]: 22 | selected_truth = truth[x.nonzero()] 23 | selected_truth_sorted = selected_truth[(selected_truth[:, 0]).argsort()[::-1]] 24 | tops_num = min(selected_truth_sorted.shape[0], L/k) 25 | truth_in_pred = selected_truth_sorted[:, 1].astype(np.int8) 26 | corrects_num = np.bincount(truth_in_pred[0: tops_num], minlength=2) 27 | acc = 1.0 * corrects_num[1] / (tops_num + 0.0001) 28 | accs.append(acc) 29 | 30 | return accs 31 | 32 | def evaluate(predict_matrix, contact_matrix): 33 | acc_k_1 = topKaccuracy(predict_matrix, contact_matrix, 1) 34 | acc_k_2 = topKaccuracy(predict_matrix, contact_matrix, 2) 35 | acc_k_5 = topKaccuracy(predict_matrix, contact_matrix, 5) 36 | acc_k_10 = topKaccuracy(predict_matrix, contact_matrix, 10) 37 | tmp = [] 38 | tmp.append(acc_k_1) 39 | tmp.append(acc_k_2) 40 | tmp.append(acc_k_5) 41 | tmp.append(acc_k_10) 42 | return tmp 43 | 44 | def output_result(avg_acc): 45 | print "Long Range(> 24):" 46 | print "Method L/10 L/5 L/2 L" 47 | print "Acc : %.3f %.3f %.3f %.3f" \ 48 | %(avg_acc[3][0], avg_acc[2][0], avg_acc[1][0], avg_acc[0][0]) 49 | print "Medium Range(12 - 24):" 50 | print "Method L/10 L/5 L/2 L" 51 | print "Acc : %.3f %.3f %.3f %.3f" \ 52 | %(avg_acc[3][1], avg_acc[2][1], avg_acc[1][1], avg_acc[0][1]) 53 | print "Short Range(6 - 12):" 54 | print "Method L/10 L/5 L/2 L" 55 | print "Acc : %.3f %.3f %.3f %.3f" \ 56 | %(avg_acc[3][2], avg_acc[2][2], avg_acc[1][2], avg_acc[0][2]) 57 | 58 | def test(): 59 | with open("data/PSICOV/psicov.list", "r") as fin: 60 | names = [line.rstrip("\n") for line in fin] 61 | 62 | accs = [] 63 | for i in range(len(names)): 64 | name = names[i] 65 | print "processing in %d: %s" %(i+1, name) 66 | 67 | #prediction_path = "data/PSICOV/clm/" 68 | prediction_path = "data/PSICOV/new_psicov/" 69 | #prediction_path = "data/PSICOV/psicov_matrix" 70 | #prediction_path = "data/PSICOV/mf_matrix" 71 | #prediction_path = "psicov_result" 72 | f = os.path.join(prediction_path, name + ".ccmpred") 73 | if not os.path.exists(f): 74 | print "not exist..." 
75 | continue 76 | y_out = np.loadtxt(f) 77 | 78 | dist_path = "data/PSICOV/dis/" 79 | y = np.loadtxt(os.path.join(dist_path, name + ".dis")) 80 | y[y > 8] = 0 81 | y[y != 0] = 1 82 | y = y.astype(np.int8) 83 | y = np.tril(y, k=-6) + np.triu(y, k=6) 84 | 85 | acc = evaluate(y_out, y) 86 | accs.append(acc) 87 | accs = np.array(accs) 88 | avg_acc = np.mean(accs, axis=0) 89 | output_result(avg_acc) 90 | 91 | -------------------------------------------------------------------------------- /models/model.ckpt-20000.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doubleQ2018/Deep-learning-for-contact_map_v2/a039e99d39ce99ffde42ef4c6a429b5c9055e2b3/models/model.ckpt-20000.data-00000-of-00001 -------------------------------------------------------------------------------- /models/model.ckpt-20000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doubleQ2018/Deep-learning-for-contact_map_v2/a039e99d39ce99ffde42ef4c6a429b5c9055e2b3/models/model.ckpt-20000.index -------------------------------------------------------------------------------- /read_data_into_tfrecord.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import cPickle as pickle 4 | 5 | from libs.datasets.data_preprocessing import * 6 | from libs.config.config import * 7 | 8 | FLAGS = tf.app.flags.FLAGS 9 | 10 | def read_pkl(name): 11 | with open(name) as fin: 12 | return pickle.load(fin) 13 | 14 | train_infos = read_pkl(FLAGS.train_file) 15 | records_dir = os.path.join(FLAGS.data_dir, 'records/') 16 | add_to_tfrecord(records_dir, 'train', train_infos) 17 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import libs.nets.network as network 4 | import libs.datasets.data_preprocessing as data_preprocess 5 | from libs.config.config import * 6 | from libs.utils.acc_cal_v2 import topKaccuracy, evaluate, output_result 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | import cPickle as pickle 11 | import os 12 | # using GPU numbered 0 13 | os.environ["CUDA_VISIBLE_DEVICES"]='0' 14 | 15 | def load_test_data(): 16 | datafile = "data/pdb25-test-500.release.contactFeatures.pkl" 17 | f = open(datafile) 18 | data = pickle.load(f) 19 | f.close() 20 | return data 21 | 22 | def test(): 23 | # restore graph 24 | input_1d = tf.placeholder("float", shape=[None, None, 26], name="input_x1") 25 | input_2d = tf.placeholder("float", shape=[None, None, None, 1], name="input_x2") 26 | label = tf.placeholder("float", shape=None, name="input_y") 27 | output = network.build(input_1d, input_2d, label, 28 | FLAGS.filter_size_1d, FLAGS.filter_size_2d, 29 | FLAGS.block_num_1d, FLAGS.block_num_2d, 30 | regulation=True, batch_norm=True) 31 | prob = output['output_prob'] 32 | 33 | # restore model 34 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7) 35 | sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 36 | #checkpoint_path = tf.train.latest_checkpoint(FLAGS.train_dir) 37 | checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt-20000") 38 | print "Loading model from %s" %checkpoint_path 39 | restorer = tf.train.Saver() 40 | restorer.restore(sess, checkpoint_path) 41 | 42 | # prediction 43 | data = load_test_data() 44 | input_acc = [] 45 | output_acc 
= []
46 |     for i in range(len(data)):
47 |         d = data[i]
48 |         name, seqLen, sequence_profile, pairwise_profile, true_contact = \
49 |                 data_preprocess.extract_single(d)
50 |         sequence_profile = sequence_profile[np.newaxis, ...]
51 |         pairwise_profile = pairwise_profile[np.newaxis, ...][:,:,:,0:1] # use only the single CCMpred channel
52 |         y_out = sess.run(prob, \
53 |                 feed_dict = {input_1d: sequence_profile, input_2d: pairwise_profile})
54 |         input_acc.append(evaluate(pairwise_profile[0,:,:,0], true_contact))
55 |         output_acc.append(evaluate(y_out[0,:,:,1], true_contact))
56 | 
57 |     print "Input result:"
58 |     output_result(np.mean(np.array(input_acc), axis=0))
59 |     print "\nOutput result:"
60 |     output_result(np.mean(np.array(output_acc), axis=0))
61 | 
62 | if __name__ == "__main__":
63 |     test()
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import libs.nets.network as network
4 | import libs.datasets.data_factory as dataset
5 | from libs.config.config import *
6 | 
7 | import tensorflow as tf
8 | import numpy as np
9 | from time import gmtime, strftime
10 | import time
11 | import os
12 | 
13 | 
14 | # using GPU numbered 1
15 | os.environ["CUDA_VISIBLE_DEVICES"]='1'
16 | 
17 | def restore(sess):
18 |     if FLAGS.restore_previous_if_exists:
19 |         try:
20 |             checkpoint_path = tf.train.latest_checkpoint(FLAGS.train_dir)
21 |             restorer = tf.train.Saver()
22 |             restorer.restore(sess, checkpoint_path)
23 |             print ('restored previous model %s from %s'\
24 |                     %(checkpoint_path, FLAGS.train_dir))
25 |             time.sleep(2)
26 |             return
27 |         except:
28 |             print ('--restore_previous_if_exists is set, but failed to restore in %s %s'\
29 |                     % (FLAGS.train_dir, checkpoint_path))
30 |             time.sleep(2)
31 | 
32 | def train():
33 |     name, seqLen, seq_feature, pair_feature, label = \
34 |             dataset.get_dataset('train', FLAGS.data_dir)
35 |     data_queue = tf.RandomShuffleQueue(capacity=32, min_after_dequeue=16,
36 |             dtypes=(name.dtype, seqLen.dtype,
37 |                 seq_feature.dtype, pair_feature.dtype, label.dtype))
38 |     enqueue_op = data_queue.enqueue((name, seqLen, seq_feature, pair_feature, label))
39 |     data_queue_runner = tf.train.QueueRunner(data_queue, [enqueue_op] * 4)
40 |     tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, data_queue_runner)
41 |     (name, seqLen, seq_feature, pair_feature, label) = data_queue.dequeue()
42 | 
43 |     input_1d = tf.reshape(seq_feature, (1, seqLen, 26))
44 |     input_2d = tf.reshape(pair_feature, (1, seqLen, seqLen, 5))
45 |     label = tf.reshape(label, (1, seqLen, seqLen))
46 | 
47 |     output = network.build(input_1d, input_2d, label,
48 |             FLAGS.filter_size_1d, FLAGS.filter_size_2d,
49 |             FLAGS.block_num_1d, FLAGS.block_num_2d,
50 |             regulation=True, batch_norm=True)
51 |     prob = output['output_prob']
52 |     loss = output['loss']
53 | 
54 |     train_step = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
55 | 
56 |     gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.80)
57 |     sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
58 |     init_op = tf.group(tf.global_variables_initializer(),
59 |             tf.local_variables_initializer())
60 |     sess.run(init_op)
61 | 
62 |     # save log
63 |     summary_op = tf.summary.merge_all()
64 |     logdir = os.path.join(FLAGS.train_dir, strftime('%Y%m%d%H%M%S', gmtime()))
65 |     if not os.path.exists(logdir):
66 |         os.makedirs(logdir)
67 |     summary_writer = tf.summary.FileWriter(logdir, graph=sess.graph)
68 | 
69 |     #restore model
70 |     restore(sess)
71 | 
72 |     # main
loop 73 | coord = tf.train.Coordinator() 74 | threads = [] 75 | for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS): 76 | threads.extend(qr.create_threads(sess, coord=coord, daemon=True, start=True)) 77 | tf.train.start_queue_runners(sess=sess, coord=coord) 78 | 79 | saver = tf.train.Saver(max_to_keep=20) 80 | # train iteration 81 | for step in xrange(FLAGS.max_iters): 82 | _, ids, L, los, output_prob = \ 83 | sess.run([train_step, name, seqLen, loss, prob]) 84 | print "iter %d: id = %s, seqLen = %3d, loss = %.4f" %(step, ids, L, los) 85 | 86 | if step % 100 == 0: 87 | summary_str = sess.run(summary_op) 88 | summary_writer.add_summary(summary_str, step) 89 | summary_writer.flush() 90 | 91 | if (step % 10000 == 0 or step + 1 == FLAGS.max_iters) and step != 0: 92 | checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') 93 | saver.save(sess, checkpoint_path, global_step=step, write_meta_graph=False) 94 | 95 | def test(): 96 | input_1d = tf.constant(np.random.rand(1,10,26).astype(np.float32)) 97 | input_2d = tf.constant(np.random.rand(1,10,10,5).astype(np.float32)) 98 | label = tf.constant(np.random.randint(2, size=(1,10,10))) 99 | 100 | output = network.build(input_1d, input_2d, label) 101 | prob = output['output_prob'] 102 | 103 | init = tf.initialize_all_variables() 104 | sess = tf.Session() 105 | sess.run(init) 106 | print sess.run(prob) 107 | 108 | train() 109 | --------------------------------------------------------------------------------
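For reference, scoring one predicted contact map against the native structure follows the same recipe as the test() helper in libs/utils/acc_cal_v2.py; a standalone sketch (the two input paths below are placeholders):

#!/usr/bin/env python
import numpy as np
from libs.utils.acc_cal_v2 import evaluate, output_result

y_out = np.loadtxt("pred/example.ccmpred")     # predicted contact scores, shape (L, L)
y = np.loadtxt("data/PSICOV/dis/example.dis")  # residue distance matrix, shape (L, L)
y[y > 8] = 0                                   # contact := distance <= 8 Angstroms
y[y != 0] = 1
y = y.astype(np.int8)
y = np.tril(y, k=-6) + np.triu(y, k=6)         # drop |i-j| < 6 pairs, as in training
output_result(np.mean(np.array([evaluate(y_out, y)]), axis=0))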