├── .gitignore
├── LICENSE
├── README.md
├── main.py
└── resnet_model.py

/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
.idea
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016 Ritchie.

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Residual Networks in TensorFlow

## Residual Network in TensorFlow
The entire model is implemented in pure TensorFlow, and it is simple to run with different settings.

## Simple Instructions
- Running Training and Evaluation
    - `python main.py`
    - If you want to modify any parameter, pass it on the command line, for example `python main.py --n_epoch=10`.
    - By default it runs on the CIFAR-10 dataset, and the default configuration is tuned for it.
        - `n_epoch`: number of epochs
            - Default `10`
        - `n_batch`: batch size
            - Default `64`
        - `n_img_row`: dimension of image (row)
            - Default `32`
        - `n_img_col`: dimension of image (col)
            - Default `32`
        - `n_img_channels`: number of channels
            - Default `3`
        - `n_classes`: number of classes
            - Default `10`
        - `lr`: learning rate (momentum optimizer)
            - Default `0.1`
        - `n_resid_units`: number of residual units
            - Default `5`
        - `lr_schedule`: number of epochs after which the learning rate is multiplied by `lr_factor`
            - Default `60`
            - With the default, the learning rate is multiplied by `lr_factor` every 60 epochs (see the short sketch at the end of this section).
        - `lr_factor`: the factor used to reduce the learning rate
            - Default `0.1`
- Running TensorBoard
    - Training logs
        - `tensorboard --logdir=train_log`
    - Evaluation logs
        - `tensorboard --logdir=eval_log`
    - You can use any path you want.
    - If you encounter a `permission denied` error, you can solve it by changing the log directory to `/tmp/train_log` (the path `main.py` already writes to).
        - I ran into this while running on Amazon AWS, and this fix solved it.
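The learning-rate schedule is applied inside the training loop of `main.py` (`if (j + 1) % args.lr_schedule == 0: lr *= args.lr_factor`). As a rough illustration of what the default flags produce (this snippet is only a sketch, not code from the repository):

```python
# Step-decay sketch of the schedule in main.py, assuming the default flags:
# lr=0.1, lr_schedule=60, lr_factor=0.1.
lr, lr_schedule, lr_factor = 0.1, 60, 0.1

for epoch in range(1, 151):
    if epoch % lr_schedule == 0:  # decay happens before the epoch's batches run
        lr *= lr_factor
    # epochs   1-59 : lr = 0.1
    # epochs  60-119: lr = 0.01
    # epochs 120-150: lr = 0.001
```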
## Credits
- The original model was obtained from the TensorFlow [official repository](https://github.com/tensorflow/models/tree/master/resnet).
- The original [Residual Networks paper](https://arxiv.org/abs/1512.03385) on arXiv.

## Dependencies
- To simplify the code, I read the CIFAR-10 dataset using [TensorLayer](https://github.com/zsdonghao/tensorlayer).
    - Simply run `sudo pip install tensorlayer` and you are good to go.
- TensorFlow v0.12
- If you would like to run this code in a few minutes on Amazon AWS, just use the open-source AMI [TFAMI.v3](https://github.com/ritchieng/tensorflow-aws-ami).
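For reference, the TensorLayer call that `main.py` uses to fetch CIFAR-10 is shown below (a minimal sketch; the mean/std normalization happens separately in `main.py`):

```python
# Minimal sketch of the CIFAR-10 loading call used in main.py.
import tensorlayer as tl

x_train, y_train, x_test, y_test = tl.files.load_cifar10_dataset(
    shape=(-1, 32, 32, 3), plotable=False)
print(x_train.shape)  # (50000, 32, 32, 3)
```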
## License
MIT
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import set_keep
import numpy as np
import resnet_model
import argparse

parser = argparse.ArgumentParser(description='Define parameters.')

parser.add_argument('--n_epoch', type=int, default=10)
parser.add_argument('--n_batch', type=int, default=64)
parser.add_argument('--n_img_row', type=int, default=32)
parser.add_argument('--n_img_col', type=int, default=32)
parser.add_argument('--n_img_channels', type=int, default=3)
parser.add_argument('--n_classes', type=int, default=10)
parser.add_argument('--lr', type=float, default=0.1)
parser.add_argument('--n_resid_units', type=int, default=5)
parser.add_argument('--lr_schedule', type=int, default=60)
parser.add_argument('--lr_factor', type=float, default=0.1)

args = parser.parse_args()


class CNNEnv:
    def __init__(self):

        # The data, shuffled and split between train and test sets
        self.x_train, self.y_train, self.x_test, self.y_test = tl.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False)

        # Normalize using the training set statistics
        self.mean = np.mean(self.x_train, axis=0, keepdims=True)
        self.std = np.std(self.x_train)
        self.x_train = (self.x_train - self.mean) / self.std
        self.x_test = (self.x_test - self.mean) / self.std

        print('x_train shape:', self.x_train.shape)
        print('x_test shape:', self.x_test.shape)
        print('y_train shape:', self.y_train.shape)
        print('y_test shape:', self.y_test.shape)

        # For generator
        self.num_examples = self.x_train.shape[0]
        self.index_in_epoch = 0
        self.epochs_completed = 0

        # Basic info
        self.batch_num = args.n_batch
        self.num_epoch = args.n_epoch
        self.img_row = args.n_img_row
        self.img_col = args.n_img_col
        self.img_channels = args.n_img_channels
        self.nb_classes = args.n_classes
        self.num_iter = self.x_train.shape[0] // self.batch_num  # iterations per epoch

    def next_batch(self, batch_size):
        """Return the next `batch_size` examples from this data set."""
        self.batch_size = batch_size

        start = self.index_in_epoch
        self.index_in_epoch += self.batch_size

        if self.index_in_epoch > self.num_examples:
            # Finished epoch
            self.epochs_completed += 1
            # Shuffle the data
            perm = np.arange(self.num_examples)
            np.random.shuffle(perm)
            self.x_train = self.x_train[perm]
            self.y_train = self.y_train[perm]

            # Start next epoch
            start = 0
            self.index_in_epoch = self.batch_size
            assert self.batch_size <= self.num_examples
        end = self.index_in_epoch
        return self.x_train[start:end], self.y_train[start:end]

    def train(self, hps):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.InteractiveSession(config=config)

        img = tf.placeholder(tf.float32, shape=[self.batch_num, 32, 32, 3])
        labels = tf.placeholder(tf.int32, shape=[self.batch_num, ])

        model = resnet_model.ResNet(hps, img, labels, 'train')
        model.build_graph()

        merged = model.summaries
        train_writer = tf.summary.FileWriter("/tmp/train_log", sess.graph)

        sess.run(tf.global_variables_initializer())
        print('Done initializing variables')
        print('Running model...')

        # Set default learning rate for scheduling
        lr = args.lr

        for j in range(self.num_epoch):
            print('Epoch {}'.format(j + 1))

            # Decrease the learning rate by args.lr_factor
            # every args.lr_schedule epochs
            if (j + 1) % args.lr_schedule == 0:
                lr *= args.lr_factor

            for i in range(self.num_iter):
                batch = self.next_batch(self.batch_num)
                # model.lrn_rate is a graph constant; it is overridden here
                # through feed_dict with the scheduled value.
                feed_dict = {img: batch[0],
                             labels: batch[1],
                             model.lrn_rate: lr}
                _, l, ac, summary, lr = sess.run([model.train_op, model.cost, model.acc, merged, model.lrn_rate], feed_dict=feed_dict)
                # Use a running step so TensorBoard points do not overlap across epochs
                train_writer.add_summary(summary, j * self.num_iter + i)

                if i % 200 == 0:
                    print('step', i + 1)
                    print('Training loss', l)
                    print('Training accuracy', ac)
                    print('Learning rate', lr)

            print('Running evaluation...')

            test_loss, test_acc, n_batch = 0, 0, 0
            for batch in tl.iterate.minibatches(inputs=self.x_test,
                                                targets=self.y_test,
                                                batch_size=self.batch_num,
                                                shuffle=False):
                feed_dict_eval = {img: batch[0], labels: batch[1]}

                loss, ac = sess.run([model.cost, model.acc], feed_dict=feed_dict_eval)
                test_loss += loss
                test_acc += ac
                n_batch += 1

            tot_test_loss = test_loss / n_batch
            tot_test_acc = test_acc / n_batch

            print(' Test loss: {}'.format(tot_test_loss))
            print(' Test accuracy: {}'.format(tot_test_acc))

        print('Completed training and evaluation.')


run = CNNEnv()

hps = resnet_model.HParams(batch_size=run.batch_num,
                           num_classes=run.nb_classes,
                           min_lrn_rate=0.0001,
                           lrn_rate=args.lr,
                           num_residual_units=args.n_resid_units,
                           use_bottleneck=False,
                           weight_decay_rate=0.0002,
                           relu_leakiness=0.1,
                           optimizer='mom')

run.train(hps)
--------------------------------------------------------------------------------
/resnet_model.py:
--------------------------------------------------------------------------------
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""ResNet model.

Related papers:
https://arxiv.org/pdf/1603.05027v2.pdf
https://arxiv.org/pdf/1512.03385v1.pdf
https://arxiv.org/pdf/1605.07146v1.pdf
"""
from collections import namedtuple

import numpy as np
import tensorflow as tf

from tensorflow.python.training import moving_averages

HParams = namedtuple('HParams',
                     'batch_size, num_classes, min_lrn_rate, lrn_rate, '
                     'num_residual_units, use_bottleneck, weight_decay_rate, '
                     'relu_leakiness, optimizer')


class ResNet(object):
  """ResNet model."""

  def __init__(self, hps, images, labels, mode):
    """ResNet constructor.

    Args:
      hps: Hyperparameters.
      images: Batches of images. [batch_size, image_size, image_size, 3]
      labels: Batches of sparse labels. [batch_size]
      mode: One of 'train' and 'eval'.
    """
    self.hps = hps
    self._images = images
    self.labels = labels
    self.mode = mode

    self._extra_train_ops = []

  def build_graph(self):
    """Build a whole graph for the model."""
    self.global_step = tf.contrib.framework.get_or_create_global_step()
    self._build_model()
    if self.mode == 'train':
      self._build_train_op()
    self.summaries = tf.summary.merge_all()

  def _stride_arr(self, stride):
    """Map a stride scalar to the stride array for tf.nn.conv2d."""
    return [1, stride, stride, 1]

  def _build_model(self):
    """Build the core model within the graph."""
    with tf.variable_scope('init'):
      x = self._images
      x = self._conv('init_conv', x, 3, 3, 16, self._stride_arr(1))

    strides = [1, 2, 2]
    activate_before_residual = [True, False, False]
    if self.hps.use_bottleneck:
      res_func = self._bottleneck_residual
      filters = [16, 64, 128, 256]
    else:
      res_func = self._residual
      filters = [16, 16, 32, 64]
      # Uncomment the following lines to use the w28-10 wide residual network.
      # It is more memory efficient than a very deep residual network and has
      # comparably good performance.
      # https://arxiv.org/pdf/1605.07146v1.pdf
      # filters = [16, 160, 320, 640]
      # Update hps.num_residual_units to 9

    with tf.variable_scope('unit_1_0'):
      x = res_func(x, filters[0], filters[1],
                   self._stride_arr(strides[0]),
                   activate_before_residual[0])
    for i in range(1, self.hps.num_residual_units):
      with tf.variable_scope('unit_1_%d' % i):
        x = res_func(x, filters[1], filters[1], self._stride_arr(1),
                     False)

    with tf.variable_scope('unit_2_0'):
      x = res_func(x, filters[1], filters[2],
                   self._stride_arr(strides[1]),
                   activate_before_residual[1])
    for i in range(1, self.hps.num_residual_units):
      with tf.variable_scope('unit_2_%d' % i):
        x = res_func(x, filters[2], filters[2], self._stride_arr(1),
                     False)

    with tf.variable_scope('unit_3_0'):
      x = res_func(x, filters[2], filters[3],
                   self._stride_arr(strides[2]),
                   activate_before_residual[2])
    for i in range(1, self.hps.num_residual_units):
      with tf.variable_scope('unit_3_%d' % i):
        x = res_func(x, filters[3], filters[3], self._stride_arr(1),
                     False)

    with tf.variable_scope('unit_last'):
      x = self._batch_norm('final_bn', x)
      x = self._relu(x, self.hps.relu_leakiness)
      x = self._global_avg_pool(x)

    with tf.variable_scope('logit'):
      logits = self._fully_connected(x, self.hps.num_classes)
      self.predictions = tf.nn.softmax(logits)

    with tf.variable_scope('costs'):
      xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=self.labels)
      self.cost = tf.reduce_mean(xent, name='xent')
      self.cost += self._decay()

      tf.summary.scalar('cost', self.cost)

    with tf.variable_scope('acc'):
      correct_prediction = tf.equal(
          tf.cast(tf.argmax(logits, 1), tf.int32), self.labels)
      self.acc = tf.reduce_mean(
          tf.cast(correct_prediction, tf.float32), name='accu')

      tf.summary.scalar('accuracy', self.acc)

  def _build_train_op(self):
    """Build training specific ops for the graph."""
    self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
    tf.summary.scalar('learning rate', self.lrn_rate)

    trainable_variables = tf.trainable_variables()
    grads = tf.gradients(self.cost, trainable_variables)

    if self.hps.optimizer == 'sgd':
      optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
    elif self.hps.optimizer == 'mom':
      optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)

    apply_op = optimizer.apply_gradients(
        zip(grads, trainable_variables),
        global_step=self.global_step, name='train_step')

    train_ops = [apply_op] + self._extra_train_ops
    self.train_op = tf.group(*train_ops)

  # TODO(xpan): Consider batch_norm in contrib/layers/python/layers/layers.py
  def _batch_norm(self, name, x):
    """Batch normalization."""
    with tf.variable_scope(name):
      params_shape = [x.get_shape()[-1]]

      beta = tf.get_variable(
          'beta', params_shape, tf.float32,
          initializer=tf.constant_initializer(0.0, tf.float32))
      gamma = tf.get_variable(
          'gamma', params_shape, tf.float32,
          initializer=tf.constant_initializer(1.0, tf.float32))

      if self.mode == 'train':
        mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')

        moving_mean = tf.get_variable(
            'moving_mean', params_shape, tf.float32,
            initializer=tf.constant_initializer(0.0, tf.float32),
            trainable=False)
        moving_variance = tf.get_variable(
            'moving_variance', params_shape, tf.float32,
            initializer=tf.constant_initializer(1.0, tf.float32),
            trainable=False)

        self._extra_train_ops.append(
            moving_averages.assign_moving_average(
                moving_mean, mean, 0.9))
        self._extra_train_ops.append(
            moving_averages.assign_moving_average(
                moving_variance, variance, 0.9))
      else:
        mean = tf.get_variable(
            'moving_mean', params_shape, tf.float32,
            initializer=tf.constant_initializer(0.0, tf.float32),
            trainable=False)
        variance = tf.get_variable(
            'moving_variance', params_shape, tf.float32,
            initializer=tf.constant_initializer(1.0, tf.float32),
            trainable=False)
        tf.summary.histogram(mean.op.name, mean)
        tf.summary.histogram(variance.op.name, variance)
      # epsilon used to be 1e-5. Maybe 0.001 solves the NaN problem in deeper nets.
      y = tf.nn.batch_normalization(
          x, mean, variance, beta, gamma, 0.001)
      y.set_shape(x.get_shape())
      return y

  def _residual(self, x, in_filter, out_filter, stride,
                activate_before_residual=False):
    """Residual unit with 2 sub layers."""
    if activate_before_residual:
      with tf.variable_scope('shared_activation'):
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)
        orig_x = x
    else:
      with tf.variable_scope('residual_only_activation'):
        orig_x = x
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)

    with tf.variable_scope('sub1'):
      x = self._conv('conv1', x, 3, in_filter, out_filter, stride)

    with tf.variable_scope('sub2'):
      x = self._batch_norm('bn2', x)
      x = self._relu(x, self.hps.relu_leakiness)
      x = self._conv('conv2', x, 3, out_filter, out_filter, [1, 1, 1, 1])

    with tf.variable_scope('sub_add'):
      if in_filter != out_filter:
        orig_x = tf.nn.avg_pool(orig_x, stride, stride, 'VALID')
        orig_x = tf.pad(
            orig_x, [[0, 0], [0, 0], [0, 0],
                     [(out_filter - in_filter) // 2,
                      (out_filter - in_filter) // 2]])
      x += orig_x

    tf.logging.info('image after unit %s', x.get_shape())
    return x

  def _bottleneck_residual(self, x, in_filter, out_filter, stride,
                           activate_before_residual=False):
    """Bottleneck residual unit with 3 sub layers."""
    if activate_before_residual:
      with tf.variable_scope('common_bn_relu'):
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)
        orig_x = x
    else:
      with tf.variable_scope('residual_bn_relu'):
        orig_x = x
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self.hps.relu_leakiness)

    with tf.variable_scope('sub1'):
      x = self._conv('conv1', x, 1, in_filter, out_filter // 4, stride)

    with tf.variable_scope('sub2'):
      x = self._batch_norm('bn2', x)
      x = self._relu(x, self.hps.relu_leakiness)
      x = self._conv('conv2', x, 3, out_filter // 4, out_filter // 4,
                     [1, 1, 1, 1])

    with tf.variable_scope('sub3'):
      x = self._batch_norm('bn3', x)
      x = self._relu(x, self.hps.relu_leakiness)
      x = self._conv('conv3', x, 1, out_filter // 4, out_filter,
                     [1, 1, 1, 1])

    with tf.variable_scope('sub_add'):
      if in_filter != out_filter:
        orig_x = self._conv('project', orig_x, 1, in_filter,
                            out_filter, stride)
      x += orig_x

    tf.logging.info('image after unit %s', x.get_shape())
    return x

  def _decay(self):
    """L2 weight decay loss."""
    costs = []
    for var in tf.trainable_variables():
      if var.op.name.find(r'DW') > 0:
        costs.append(tf.nn.l2_loss(var))
        # tf.histogram_summary(var.op.name, var)

    return tf.mul(self.hps.weight_decay_rate, tf.add_n(costs))

  def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
    """Convolution."""
    with tf.variable_scope(name):
      n = filter_size * filter_size * out_filters
      kernel = tf.get_variable(
          'DW', [filter_size, filter_size, in_filters, out_filters],
          tf.float32, initializer=tf.random_normal_initializer(
              stddev=np.sqrt(2.0 / n)))
      return tf.nn.conv2d(x, kernel, strides, padding='SAME')

  def _relu(self, x, leakiness=0.0):
    """Relu, with optional leaky support."""
    # Note: leakiness is currently unused here; a plain ReLU is applied.
    # return tf.select(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')
    return tf.nn.relu(x)

  def _fully_connected(self, x, out_dim):
    """FullyConnected layer for final output."""
    x = tf.reshape(x, [self.hps.batch_size, -1])
    w = tf.get_variable(
        'DW', [x.get_shape()[1], out_dim],
        initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
    b = tf.get_variable('biases', [out_dim],
                        initializer=tf.constant_initializer())
    return tf.nn.xw_plus_b(x, w, b)

  def _global_avg_pool(self, x):
    assert x.get_shape().ndims == 4
    return tf.reduce_mean(x, [1, 2])
--------------------------------------------------------------------------------