161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CNN Feature Extractor
2 |
3 | - This code extracts penultimate-layer feature vectors from state-of-the-art convolutional neural network (CNN) architectures
4 | pretrained on ImageNet (1 million images). See the usage sketch below.
5 |
6 | - The example uses ResNet-152 v2, but it can easily be modified to use any of the other CNN architectures provided in the models folder, which were originally developed by the TensorFlow-Slim authors:
7 |     - ResNet-152 v2
8 |     - ResNet-152 v1
9 |     - VGG-16
10 |
11 |
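12 | ## Usage
13 |
14 | A minimal sketch of extracting ResNet-152 v2 features with `featureExtractor.FrameFeatures`, assuming the pretrained checkpoint is available under the paths configured in `constant.py` (`PRETRAINED_ROOT` + `RESNET_152_CKPT`); see the bottom of `featureExtractor.py` for the original example that loads an image with TensorFlow ops:
15 |
16 | ```python
17 | import numpy as np
18 | import tensorflow as tf
19 | from featureExtractor import FrameFeatures
20 |
21 | with tf.Session() as sess:
22 |     extractor = FrameFeatures(225, 225)   # builds the ResNet-152 v2 graph
23 |     extractor.restore(sess)               # loads the pretrained ImageNet weights
24 |     frames = np.zeros((1, 225, 225, 3), dtype=np.float32)  # placeholder batch; use real RGB frames in [0, 255]
25 |     features = extractor.getFeatures(frames)  # [batch, 2048] penultimate-layer vectors
26 |     print(features.shape)
27 | ```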
--------------------------------------------------------------------------------
/constant.py:
--------------------------------------------------------------------------------
1 | ## params
2 |
3 | # Reward Settings
4 | REWARD_SALIENCY = True
5 | REWARD_SEGMENTATION = False
6 |
7 | # videographerA3C
8 | EPISODE_BUFFER = 20
9 |
10 | # Saliency
11 | SAL_THRESHOLD = 0.9
12 |
13 | # video360Env
14 | ENV_HEIGHT = 400
15 | ENV_WIDTH = 800
16 |
17 |
18 | EYE_ROOT = "brain/"
19 | HTML = "html/"
20 |
21 | AROUND = "index.html"
22 | NAMESPACE = "/test"
23 |
24 | EQUIRECTANGULAR = 'equirectangular'
25 | CUBEMAP = 'cubemap'
26 |
27 | # modes
28 | TRAIN = 'train'
29 | TEST = 'test'
30 |
31 | # models
32 | RESNET152 = 'resnet152'
33 |
34 | # ckpts
35 | RESNET_152_CKPT = 'resnet/resnet_v2_152.ckpt'
36 | VGG_PRETRAIN_MODEL = 'vgg16/vgg16.npy'
37 |
38 | # model scopes
39 | CINE = 'cinematography'
40 | VIDRNN = 'videographyRNN_'
41 |
42 | GLOBAL = 'global'
43 |
44 | # videos categories
45 | CAT1 = 'dancing'
46 | CAT2 = 'fight'
47 |
48 | # placeholders
49 | FRAME_FEATURE_INPUTS = 'frame_feature_inputs'
50 |
51 | # Dirs
52 | PRETRAINED_ROOT = '../pretrainedWeightsRepo/'
53 | DATA_DIR = '../segmentation_data/images/'
54 | VIDEO_DATA_FOLDER = '../dataset/source/'
55 | FRAMES_DATA_FOLDER = '../dataset/extracted/'
56 |
57 | MODEL_PATH = '../pretrainedWeightsRepo/trained_model/'
58 |
--------------------------------------------------------------------------------
/featureExtractor.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow.contrib.slim as slim
3 | import numpy as np
4 | import constant
5 | from models import resnet_v2 as resnet
6 |
7 |
8 | class FrameFeatures():
9 | def __init__(self, height, width, channel=3, architecture=constant.RESNET152, trainable=False):
10 | self.height = height
11 | self.width = width
12 | self.channel = channel
13 | self.inputs = tf.placeholder(shape=[None, self.height, self.width, self.channel],
14 | name=constant.FRAME_FEATURE_INPUTS,
15 | dtype=tf.float32)
16 | if architecture == constant.RESNET152:
17 | with slim.arg_scope(resnet.resnet_arg_scope()):
18 | self.features, _ = resnet.resnet_v2_152(inputs=self.inputs,
19 | #num_classes=1001,
20 | global_pool=True,
21 | is_training=trainable)
22 |                 self.features = tf.squeeze(self.features, [1, 2])  # [batch, 1, 1, 2048] -> [batch, 2048]
23 | print('Resnet 152 model loaded.')
24 |
25 | def restore_diff_scope(self, sess, scope=''):
26 | self.sess = sess
27 | net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope+'resnet_v2_152')
28 | assignments = []
29 | for i_var in net_vars:
30 | ckpt_var = tf.contrib.framework.load_variable(constant.PRETRAINED_ROOT + constant.RESNET_152_CKPT,
31 | i_var.name.replace(scope, ''))
32 | assignments.append(i_var.assign(ckpt_var))
33 | sess.run(assignments)
34 | print('Resnet 152 checkpoints restored.')
35 |
36 | def restore(self, sess, scope='resnet_v2_152'):
37 | self.sess = sess
38 | self.saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope))
39 | self.saver.restore(sess, constant.PRETRAINED_ROOT + constant.RESNET_152_CKPT)
40 | print('Resnet 152 checkpoints restored.')
41 |
42 | def getFeatures(self, images):
43 | # Images: [batch, height, width, channel]
44 |         images = images / 127.5 - 1.0  # scale pixel values from [0, 255] to [-1, 1]
45 | predictedFeatures = self.sess.run(self.features, feed_dict={self.inputs: images})
46 | return predictedFeatures
47 |
48 |
49 | # Example of usage
50 | if __name__ == '__main__':
51 | with tf.Session() as sess:
52 | with tf.name_scope(constant.RESNET152) as scope:
53 | features = FrameFeatures(225, 225)
54 | features.restore(sess)
55 | img = tf.read_file('./download.jpg')
56 | img = tf.image.decode_jpeg(img, channels=3)
57 | img = tf.image.resize_images(img, [225, 225])
58 | img = tf.expand_dims(img, axis=0)
59 |
60 |             r1 = np.argmax(features.getFeatures(img.eval()))  # index of the strongest feature activation (demo only)
61 | print(r1)
62 |
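63 | # A hedged sketch (not part of the original example) of building the extractor inside another
64 | # variable scope, e.g. the A3C global network, and restoring the same checkpoint into it with
65 | # restore_diff_scope (assuming an active session `sess`):
66 | #
67 | #   with tf.variable_scope(constant.GLOBAL):
68 | #       frame_features = FrameFeatures(225, 225)
69 | #   frame_features.restore_diff_scope(sess, scope=constant.GLOBAL + '/')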
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | # import sys
2 | #
3 | # sys.path.insert(0, "/cs/student/projects4/ml/2016/nmutha/msc_project/code/Around360_debug/brain")
--------------------------------------------------------------------------------
/models/resnet_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Contains building blocks for various versions of Residual Networks.
16 |
17 | Residual networks (ResNets) were proposed in:
18 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
19 | Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015
20 |
21 | More variants were introduced in:
22 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
23 | Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016
24 |
25 | We can obtain different ResNet variants by changing the network depth, width,
26 | and form of residual unit. This module implements the infrastructure for
27 | building them. Concrete ResNet units and full ResNet networks are implemented in
28 | the accompanying resnet_v1.py and resnet_v2.py modules.
29 |
30 | Compared to https://github.com/KaimingHe/deep-residual-networks, in the current
31 | implementation we subsample the output activations in the last residual unit of
32 | each block, instead of subsampling the input activations in the first residual
33 | unit of each block. The two implementations give identical results but our
34 | implementation is more memory efficient.
35 | """
36 | from __future__ import absolute_import
37 | from __future__ import division
38 | from __future__ import print_function
39 |
40 | import collections
41 | import tensorflow as tf
42 |
43 | slim = tf.contrib.slim
44 |
45 |
46 | class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])):
47 | """A named tuple describing a ResNet block.
48 |
49 | Its parts are:
50 | scope: The scope of the `Block`.
51 | unit_fn: The ResNet unit function which takes as input a `Tensor` and
52 | returns another `Tensor` with the output of the ResNet unit.
53 | args: A list of length equal to the number of units in the `Block`. The list
54 | contains one (depth, depth_bottleneck, stride) tuple for each unit in the
55 | block to serve as argument to unit_fn.
56 | """
57 |
58 |
59 | def subsample(inputs, factor, scope=None):
60 | """Subsamples the input along the spatial dimensions.
61 |
62 | Args:
63 | inputs: A `Tensor` of size [batch, height_in, width_in, channels].
64 | factor: The subsampling factor.
65 | scope: Optional variable_scope.
66 |
67 | Returns:
68 | output: A `Tensor` of size [batch, height_out, width_out, channels] with the
69 | input, either intact (if factor == 1) or subsampled (if factor > 1).
70 | """
71 | if factor == 1:
72 | return inputs
73 | else:
74 | return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope)
75 |
76 |
77 | def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None):
78 | """Strided 2-D convolution with 'SAME' padding.
79 |
80 | When stride > 1, then we do explicit zero-padding, followed by conv2d with
81 | 'VALID' padding.
82 |
83 | Note that
84 |
85 | net = conv2d_same(inputs, num_outputs, 3, stride=stride)
86 |
87 | is equivalent to
88 |
89 | net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME')
90 | net = subsample(net, factor=stride)
91 |
92 | whereas
93 |
94 | net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME')
95 |
96 | is different when the input's height or width is even, which is why we add the
97 | current function. For more details, see ResnetUtilsTest.testConv2DSameEven().
98 |
99 | Args:
100 | inputs: A 4-D tensor of size [batch, height_in, width_in, channels].
101 | num_outputs: An integer, the number of output filters.
102 | kernel_size: An int with the kernel_size of the filters.
103 | stride: An integer, the output stride.
104 | rate: An integer, rate for atrous convolution.
105 | scope: Scope.
106 |
107 | Returns:
108 | output: A 4-D tensor of size [batch, height_out, width_out, channels] with
109 | the convolution output.
110 | """
111 | if stride == 1:
112 | return slim.conv2d(inputs, num_outputs, kernel_size, stride=1, rate=rate,
113 | padding='SAME', scope=scope)
114 | else:
115 | kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1)
116 | pad_total = kernel_size_effective - 1
117 | pad_beg = pad_total // 2
118 | pad_end = pad_total - pad_beg
119 | inputs = tf.pad(inputs,
120 | [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
121 | return slim.conv2d(inputs, num_outputs, kernel_size, stride=stride,
122 | rate=rate, padding='VALID', scope=scope)
123 |
124 |
125 | @slim.add_arg_scope
126 | def stack_blocks_dense(net, blocks, output_stride=None,
127 | outputs_collections=None):
128 | """Stacks ResNet `Blocks` and controls output feature density.
129 |
130 | First, this function creates scopes for the ResNet in the form of
131 | 'block_name/unit_1', 'block_name/unit_2', etc.
132 |
133 | Second, this function allows the user to explicitly control the ResNet
134 | output_stride, which is the ratio of the input to output spatial resolution.
135 | This is useful for dense prediction tasks such as semantic segmentation or
136 | object detection.
137 |
138 | Most ResNets consist of 4 ResNet blocks and subsample the activations by a
139 | factor of 2 when transitioning between consecutive ResNet blocks. This results
140 | in a nominal ResNet output_stride equal to 8. If we set the output_stride to
141 | half the nominal network stride (e.g., output_stride=4), then we compute
142 | responses twice.
143 |
144 | Control of the output feature density is implemented by atrous convolution.
145 |
146 | Args:
147 | net: A `Tensor` of size [batch, height, width, channels].
148 | blocks: A list of length equal to the number of ResNet `Blocks`. Each
149 | element is a ResNet `Block` object describing the units in the `Block`.
150 | output_stride: If `None`, then the output will be computed at the nominal
151 | network stride. If output_stride is not `None`, it specifies the requested
152 | ratio of input to output spatial resolution, which needs to be equal to
153 | the product of unit strides from the start up to some level of the ResNet.
154 | For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1,
155 | then valid values for the output_stride are 1, 2, 6, 24 or None (which
156 | is equivalent to output_stride=24).
157 | outputs_collections: Collection to add the ResNet block outputs.
158 |
159 | Returns:
160 | net: Output tensor with stride equal to the specified output_stride.
161 |
162 | Raises:
163 | ValueError: If the target output_stride is not valid.
164 | """
165 | # The current_stride variable keeps track of the effective stride of the
166 | # activations. This allows us to invoke atrous convolution whenever applying
167 | # the next residual unit would result in the activations having stride larger
168 | # than the target output_stride.
169 | current_stride = 1
170 |
171 | # The atrous convolution rate parameter.
172 | rate = 1
173 |
174 | for block in blocks:
175 | with tf.variable_scope(block.scope, 'block', [net]) as sc:
176 | for i, unit in enumerate(block.args):
177 | if output_stride is not None and current_stride > output_stride:
178 | raise ValueError('The target output_stride cannot be reached.')
179 |
180 | with tf.variable_scope('unit_%d' % (i + 1), values=[net]):
181 | # If we have reached the target output_stride, then we need to employ
182 | # atrous convolution with stride=1 and multiply the atrous rate by the
183 | # current unit's stride for use in subsequent layers.
184 | if output_stride is not None and current_stride == output_stride:
185 | net = block.unit_fn(net, rate=rate, **dict(unit, stride=1))
186 | rate *= unit.get('stride', 1)
187 |
188 | else:
189 | net = block.unit_fn(net, rate=1, **unit)
190 | current_stride *= unit.get('stride', 1)
191 | net = slim.utils.collect_named_outputs(outputs_collections, sc.name, net)
192 |
193 | if output_stride is not None and current_stride != output_stride:
194 | raise ValueError('The target output_stride cannot be reached.')
195 |
196 | return net
197 |
198 |
199 | def resnet_arg_scope(weight_decay=0.0001,
200 | batch_norm_decay=0.997,
201 | batch_norm_epsilon=1e-5,
202 | batch_norm_scale=True,
203 | activation_fn=tf.nn.relu,
204 | use_batch_norm=True):
205 | """Defines the default ResNet arg scope.
206 |
207 | TODO(gpapan): The batch-normalization related default values above are
208 | appropriate for use in conjunction with the reference ResNet models
209 | released at https://github.com/KaimingHe/deep-residual-networks. When
210 | training ResNets from scratch, they might need to be tuned.
211 |
212 | Args:
213 | weight_decay: The weight decay to use for regularizing the model.
214 | batch_norm_decay: The moving average decay when estimating layer activation
215 | statistics in batch normalization.
216 | batch_norm_epsilon: Small constant to prevent division by zero when
217 | normalizing activations by their variance in batch normalization.
218 | batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
219 | activations in the batch normalization layer.
220 | activation_fn: The activation function which is used in ResNet.
221 | use_batch_norm: Whether or not to use batch normalization.
222 |
223 | Returns:
224 | An `arg_scope` to use for the resnet models.
225 | """
226 | batch_norm_params = {
227 | 'decay': batch_norm_decay,
228 | 'epsilon': batch_norm_epsilon,
229 | 'scale': batch_norm_scale,
230 | 'updates_collections': tf.GraphKeys.UPDATE_OPS,
231 | }
232 |
233 | with slim.arg_scope(
234 | [slim.conv2d],
235 | weights_regularizer=slim.l2_regularizer(weight_decay),
236 | weights_initializer=slim.variance_scaling_initializer(),
237 | activation_fn=activation_fn,
238 | normalizer_fn=slim.batch_norm if use_batch_norm else None,
239 | normalizer_params=batch_norm_params):
240 | with slim.arg_scope([slim.batch_norm], **batch_norm_params):
241 | # The following implies padding='SAME' for pool1, which makes feature
242 | # alignment easier for dense prediction tasks. This is also used in
243 | # https://github.com/facebook/fb.resnet.torch. However the accompanying
244 | # code of 'Deep Residual Learning for Image Recognition' uses
245 | # padding='VALID' for pool1. You can switch to that choice by setting
246 | # slim.arg_scope([slim.max_pool2d], padding='VALID').
247 | with slim.arg_scope([slim.max_pool2d], padding='SAME') as arg_sc:
248 | return arg_sc
249 |
--------------------------------------------------------------------------------
/models/resnet_v1.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Contains definitions for the original form of Residual Networks.
16 |
17 | The 'v1' residual networks (ResNets) implemented in this module were proposed
18 | by:
19 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
20 | Deep Residual Learning for Image Recognition. arXiv:1512.03385
21 |
22 | Other variants were introduced in:
23 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
24 | Identity Mappings in Deep Residual Networks. arXiv: 1603.05027
25 |
26 | The networks defined in this module utilize the bottleneck building block of
27 | [1] with projection shortcuts only for increasing depths. They employ batch
28 | normalization *after* every weight layer. This is the architecture used by
29 | MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and
30 | ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1'
31 | architecture and the alternative 'v2' architecture of [2] which uses batch
32 | normalization *before* every weight layer in the so-called full pre-activation
33 | units.
34 |
35 | Typical use:
36 |
37 | from tensorflow.contrib.slim.nets import resnet_v1
38 |
39 | ResNet-101 for image classification into 1000 classes:
40 |
41 | # inputs has shape [batch, 224, 224, 3]
42 | with slim.arg_scope(resnet_v1.resnet_arg_scope()):
43 | net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False)
44 |
45 | ResNet-101 for semantic segmentation into 21 classes:
46 |
47 | # inputs has shape [batch, 513, 513, 3]
48 | with slim.arg_scope(resnet_v1.resnet_arg_scope()):
49 | net, end_points = resnet_v1.resnet_v1_101(inputs,
50 | 21,
51 | is_training=False,
52 | global_pool=False,
53 | output_stride=16)
54 | """
55 | from __future__ import absolute_import
56 | from __future__ import division
57 | from __future__ import print_function
58 |
59 | import tensorflow as tf
60 |
61 | from models import resnet_utils
62 |
63 | resnet_arg_scope = resnet_utils.resnet_arg_scope
64 | slim = tf.contrib.slim
65 |
66 |
67 | @slim.add_arg_scope
68 | def bottleneck(inputs,
69 | depth,
70 | depth_bottleneck,
71 | stride,
72 | rate=1,
73 | outputs_collections=None,
74 | scope=None,
75 | use_bounded_activations=False):
76 | """Bottleneck residual unit variant with BN after convolutions.
77 |
78 | This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for
79 | its definition. Note that we use here the bottleneck variant which has an
80 | extra bottleneck layer.
81 |
82 | When putting together two consecutive ResNet blocks that use this unit, one
83 | should use stride = 2 in the last unit of the first block.
84 |
85 | Args:
86 | inputs: A tensor of size [batch, height, width, channels].
87 | depth: The depth of the ResNet unit output.
88 | depth_bottleneck: The depth of the bottleneck layers.
89 | stride: The ResNet unit's stride. Determines the amount of downsampling of
90 | the units output compared to its input.
91 | rate: An integer, rate for atrous convolution.
92 | outputs_collections: Collection to add the ResNet unit output.
93 | scope: Optional variable_scope.
94 | use_bounded_activations: Whether or not to use bounded activations. Bounded
95 | activations better lend themselves to quantized inference.
96 |
97 | Returns:
98 | The ResNet unit's output.
99 | """
100 | with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
101 | depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
102 | if depth == depth_in:
103 | shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
104 | else:
105 | shortcut = slim.conv2d(
106 | inputs,
107 | depth, [1, 1],
108 | stride=stride,
109 | activation_fn=tf.nn.relu6 if use_bounded_activations else None,
110 | scope='shortcut')
111 |
112 | residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1,
113 | scope='conv1')
114 | residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride,
115 | rate=rate, scope='conv2')
116 | residual = slim.conv2d(residual, depth, [1, 1], stride=1,
117 | activation_fn=None, scope='conv3')
118 |
119 | if use_bounded_activations:
120 | # Use clip_by_value to simulate bandpass activation.
121 | residual = tf.clip_by_value(residual, -6.0, 6.0)
122 | output = tf.nn.relu6(shortcut + residual)
123 | else:
124 | output = tf.nn.relu(shortcut + residual)
125 |
126 | return slim.utils.collect_named_outputs(outputs_collections,
127 | sc.original_name_scope,
128 | output)
129 |
130 |
131 | def resnet_v1(inputs,
132 | blocks,
133 | num_classes=None,
134 | is_training=True,
135 | global_pool=True,
136 | output_stride=None,
137 | include_root_block=True,
138 | spatial_squeeze=True,
139 | reuse=None,
140 | scope=None):
141 | """Generator for v1 ResNet models.
142 |
143 | This function generates a family of ResNet v1 models. See the resnet_v1_*()
144 | methods for specific model instantiations, obtained by selecting different
145 | block instantiations that produce ResNets of various depths.
146 |
147 | Training for image classification on Imagenet is usually done with [224, 224]
148 | inputs, resulting in [7, 7] feature maps at the output of the last ResNet
149 | block for the ResNets defined in [1] that have nominal stride equal to 32.
150 | However, for dense prediction tasks we advise that one uses inputs with
151 | spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
152 | this case the feature maps at the ResNet output will have spatial shape
153 | [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
154 | and corners exactly aligned with the input image corners, which greatly
155 | facilitates alignment of the features to the image. Using as input [225, 225]
156 | images results in [8, 8] feature maps at the output of the last ResNet block.
157 |
158 | For dense prediction tasks, the ResNet needs to run in fully-convolutional
159 | (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
160 | have nominal stride equal to 32 and a good choice in FCN mode is to use
161 | output_stride=16 in order to increase the density of the computed features at
162 | small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.
163 |
164 | Args:
165 | inputs: A tensor of size [batch, height_in, width_in, channels].
166 | blocks: A list of length equal to the number of ResNet blocks. Each element
167 | is a resnet_utils.Block object describing the units in the block.
168 | num_classes: Number of predicted classes for classification tasks. If None
169 | we return the features before the logit layer.
170 | is_training: whether is training or not.
171 | global_pool: If True, we perform global average pooling before computing the
172 | logits. Set to True for image classification, False for dense prediction.
173 | output_stride: If None, then the output will be computed at the nominal
174 | network stride. If output_stride is not None, it specifies the requested
175 | ratio of input to output spatial resolution.
176 | include_root_block: If True, include the initial convolution followed by
177 | max-pooling, if False excludes it.
178 | spatial_squeeze: if True, logits is of shape [B, C], if false logits is
179 | of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
180 | To use this parameter, the input images must be smaller than 300x300
181 | pixels, in which case the output logit layer does not contain spatial
182 | information and can be removed.
183 | reuse: whether or not the network and its variables should be reused. To be
184 | able to reuse 'scope' must be given.
185 | scope: Optional variable_scope.
186 |
187 | Returns:
188 | net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
189 | If global_pool is False, then height_out and width_out are reduced by a
190 | factor of output_stride compared to the respective height_in and width_in,
191 | else both height_out and width_out equal one. If num_classes is None, then
192 | net is the output of the last ResNet block, potentially after global
193 | average pooling. If num_classes is not None, net contains the pre-softmax
194 | activations.
195 | end_points: A dictionary from components of the network to the corresponding
196 | activation.
197 |
198 | Raises:
199 | ValueError: If the target output_stride is not valid.
200 | """
201 | with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
202 | end_points_collection = sc.name + '_end_points'
203 | with slim.arg_scope([slim.conv2d, bottleneck,
204 | resnet_utils.stack_blocks_dense],
205 | outputs_collections=end_points_collection):
206 | with slim.arg_scope([slim.batch_norm], is_training=is_training):
207 | net = inputs
208 | if include_root_block:
209 | if output_stride is not None:
210 | if output_stride % 4 != 0:
211 | raise ValueError('The output_stride needs to be a multiple of 4.')
212 | output_stride /= 4
213 | net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
214 | net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')
215 | net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
216 | if global_pool:
217 | # Global average pooling.
218 | net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
219 | if num_classes is not None:
220 | net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
221 | normalizer_fn=None, scope='logits')
222 | if spatial_squeeze:
223 | net = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
224 | # Convert end_points_collection into a dictionary of end_points.
225 | end_points = slim.utils.convert_collection_to_dict(
226 | end_points_collection)
227 | if num_classes is not None:
228 | end_points['predictions'] = slim.softmax(net, scope='predictions')
229 | return net, end_points
230 | resnet_v1.default_image_size = 224
231 |
232 |
233 | def resnet_v1_block(scope, base_depth, num_units, stride):
234 | """Helper function for creating a resnet_v1 bottleneck block.
235 |
236 | Args:
237 | scope: The scope of the block.
238 | base_depth: The depth of the bottleneck layer for each unit.
239 | num_units: The number of units in the block.
240 | stride: The stride of the block, implemented as a stride in the last unit.
241 | All other units have stride=1.
242 |
243 | Returns:
244 | A resnet_v1 bottleneck block.
245 | """
246 | return resnet_utils.Block(scope, bottleneck, [{
247 | 'depth': base_depth * 4,
248 | 'depth_bottleneck': base_depth,
249 | 'stride': 1
250 | }] * (num_units - 1) + [{
251 | 'depth': base_depth * 4,
252 | 'depth_bottleneck': base_depth,
253 | 'stride': stride
254 | }])
255 |
256 |
257 | def resnet_v1_50(inputs,
258 | num_classes=None,
259 | is_training=True,
260 | global_pool=True,
261 | output_stride=None,
262 | spatial_squeeze=True,
263 | reuse=None,
264 | scope='resnet_v1_50'):
265 | """ResNet-50 model of [1]. See resnet_v1() for arg and return description."""
266 | blocks = [
267 | resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
268 | resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
269 | resnet_v1_block('block3', base_depth=256, num_units=6, stride=2),
270 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1),
271 | ]
272 | return resnet_v1(inputs, blocks, num_classes, is_training,
273 | global_pool=global_pool, output_stride=output_stride,
274 | include_root_block=True, spatial_squeeze=spatial_squeeze,
275 | reuse=reuse, scope=scope)
276 | resnet_v1_50.default_image_size = resnet_v1.default_image_size
277 |
278 |
279 | def resnet_v1_101(inputs,
280 | num_classes=None,
281 | is_training=True,
282 | global_pool=True,
283 | output_stride=None,
284 | spatial_squeeze=True,
285 | reuse=None,
286 | scope='resnet_v1_101'):
287 | """ResNet-101 model of [1]. See resnet_v1() for arg and return description."""
288 | blocks = [
289 | resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
290 | resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
291 | resnet_v1_block('block3', base_depth=256, num_units=23, stride=2),
292 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1),
293 | ]
294 | return resnet_v1(inputs, blocks, num_classes, is_training,
295 | global_pool=global_pool, output_stride=output_stride,
296 | include_root_block=True, spatial_squeeze=spatial_squeeze,
297 | reuse=reuse, scope=scope)
298 | resnet_v1_101.default_image_size = resnet_v1.default_image_size
299 |
300 |
301 | def resnet_v1_152(inputs,
302 | num_classes=None,
303 | is_training=True,
304 | global_pool=True,
305 | output_stride=None,
306 | spatial_squeeze=True,
307 | reuse=None,
308 | scope='resnet_v1_152'):
309 | """ResNet-152 model of [1]. See resnet_v1() for arg and return description."""
310 | blocks = [
311 | resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
312 | resnet_v1_block('block2', base_depth=128, num_units=8, stride=2),
313 | resnet_v1_block('block3', base_depth=256, num_units=36, stride=2),
314 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1),
315 | ]
316 | return resnet_v1(inputs, blocks, num_classes, is_training,
317 | global_pool=global_pool, output_stride=output_stride,
318 | include_root_block=True, spatial_squeeze=spatial_squeeze,
319 | reuse=reuse, scope=scope)
320 | resnet_v1_152.default_image_size = resnet_v1.default_image_size
321 |
322 |
323 | def resnet_v1_200(inputs,
324 | num_classes=None,
325 | is_training=True,
326 | global_pool=True,
327 | output_stride=None,
328 | spatial_squeeze=True,
329 | reuse=None,
330 | scope='resnet_v1_200'):
331 | """ResNet-200 model of [2]. See resnet_v1() for arg and return description."""
332 | blocks = [
333 | resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
334 | resnet_v1_block('block2', base_depth=128, num_units=24, stride=2),
335 | resnet_v1_block('block3', base_depth=256, num_units=36, stride=2),
336 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1),
337 | ]
338 | return resnet_v1(inputs, blocks, num_classes, is_training,
339 | global_pool=global_pool, output_stride=output_stride,
340 | include_root_block=True, spatial_squeeze=spatial_squeeze,
341 | reuse=reuse, scope=scope)
342 | resnet_v1_200.default_image_size = resnet_v1.default_image_size
--------------------------------------------------------------------------------
/models/resnet_v2.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Contains definitions for the preactivation form of Residual Networks.
16 |
17 | Residual networks (ResNets) were originally proposed in:
18 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
19 | Deep Residual Learning for Image Recognition. arXiv:1512.03385
20 |
21 | The full preactivation 'v2' ResNet variant implemented in this module was
22 | introduced by:
23 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
24 | Identity Mappings in Deep Residual Networks. arXiv: 1603.05027
25 |
26 | The key difference of the full preactivation 'v2' variant compared to the
27 | 'v1' variant in [1] is the use of batch normalization before every weight layer.
28 |
29 | Typical use:
30 |
31 | from tensorflow.contrib.slim.nets import resnet_v2
32 |
33 | ResNet-101 for image classification into 1000 classes:
34 |
35 | # inputs has shape [batch, 224, 224, 3]
36 | with slim.arg_scope(resnet_v2.resnet_arg_scope()):
37 | net, end_points = resnet_v2.resnet_v2_101(inputs, 1000, is_training=False)
38 |
39 | ResNet-101 for semantic segmentation into 21 classes:
40 |
41 | # inputs has shape [batch, 513, 513, 3]
42 | with slim.arg_scope(resnet_v2.resnet_arg_scope()):
43 | net, end_points = resnet_v2.resnet_v2_101(inputs,
44 | 21,
45 | is_training=False,
46 | global_pool=False,
47 | output_stride=16)
48 | """
49 | from __future__ import absolute_import
50 | from __future__ import division
51 | from __future__ import print_function
52 |
53 | import tensorflow as tf
54 |
55 | from models import resnet_utils
56 |
57 | slim = tf.contrib.slim
58 | resnet_arg_scope = resnet_utils.resnet_arg_scope
59 |
60 |
61 | @slim.add_arg_scope
62 | def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
63 | outputs_collections=None, scope=None):
64 | """Bottleneck residual unit variant with BN before convolutions.
65 |
66 | This is the full preactivation residual unit variant proposed in [2]. See
67 | Fig. 1(b) of [2] for its definition. Note that we use here the bottleneck
68 | variant which has an extra bottleneck layer.
69 |
70 | When putting together two consecutive ResNet blocks that use this unit, one
71 | should use stride = 2 in the last unit of the first block.
72 |
73 | Args:
74 | inputs: A tensor of size [batch, height, width, channels].
75 | depth: The depth of the ResNet unit output.
76 | depth_bottleneck: The depth of the bottleneck layers.
77 | stride: The ResNet unit's stride. Determines the amount of downsampling of
78 | the units output compared to its input.
79 | rate: An integer, rate for atrous convolution.
80 | outputs_collections: Collection to add the ResNet unit output.
81 | scope: Optional variable_scope.
82 |
83 | Returns:
84 | The ResNet unit's output.
85 | """
86 | with tf.variable_scope(scope, 'bottleneck_v2', [inputs]) as sc:
87 | depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
88 | preact = slim.batch_norm(inputs, activation_fn=tf.nn.relu, scope='preact')
89 | if depth == depth_in:
90 | shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
91 | else:
92 | shortcut = slim.conv2d(preact, depth, [1, 1], stride=stride,
93 | normalizer_fn=None, activation_fn=None,
94 | scope='shortcut')
95 |
96 | residual = slim.conv2d(preact, depth_bottleneck, [1, 1], stride=1,
97 | scope='conv1')
98 | residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride,
99 | rate=rate, scope='conv2')
100 | residual = slim.conv2d(residual, depth, [1, 1], stride=1,
101 | normalizer_fn=None, activation_fn=None,
102 | scope='conv3')
103 |
104 | output = shortcut + residual
105 |
106 | return slim.utils.collect_named_outputs(outputs_collections,
107 | sc.original_name_scope,
108 | output)
109 |
110 |
111 | def resnet_v2(inputs,
112 | blocks,
113 | num_classes=None,
114 | is_training=True,
115 | global_pool=True,
116 | output_stride=None,
117 | include_root_block=True,
118 | spatial_squeeze=True,
119 | reuse=None,
120 | scope=None):
121 | """Generator for v2 (preactivation) ResNet models.
122 |
123 | This function generates a family of ResNet v2 models. See the resnet_v2_*()
124 | methods for specific model instantiations, obtained by selecting different
125 | block instantiations that produce ResNets of various depths.
126 |
127 | Training for image classification on Imagenet is usually done with [224, 224]
128 | inputs, resulting in [7, 7] feature maps at the output of the last ResNet
129 | block for the ResNets defined in [1] that have nominal stride equal to 32.
130 | However, for dense prediction tasks we advise that one uses inputs with
131 | spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
132 | this case the feature maps at the ResNet output will have spatial shape
133 | [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
134 | and corners exactly aligned with the input image corners, which greatly
135 | facilitates alignment of the features to the image. Using as input [225, 225]
136 | images results in [8, 8] feature maps at the output of the last ResNet block.
137 |
138 | For dense prediction tasks, the ResNet needs to run in fully-convolutional
139 | (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
140 | have nominal stride equal to 32 and a good choice in FCN mode is to use
141 | output_stride=16 in order to increase the density of the computed features at
142 | small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.
143 |
144 | Args:
145 | inputs: A tensor of size [batch, height_in, width_in, channels].
146 | blocks: A list of length equal to the number of ResNet blocks. Each element
147 | is a resnet_utils.Block object describing the units in the block.
148 | num_classes: Number of predicted classes for classification tasks. If None
149 | we return the features before the logit layer.
150 | is_training: whether is training or not.
151 | global_pool: If True, we perform global average pooling before computing the
152 | logits. Set to True for image classification, False for dense prediction.
153 | output_stride: If None, then the output will be computed at the nominal
154 | network stride. If output_stride is not None, it specifies the requested
155 | ratio of input to output spatial resolution.
156 | include_root_block: If True, include the initial convolution followed by
157 | max-pooling, if False excludes it. If excluded, `inputs` should be the
158 | results of an activation-less convolution.
159 | spatial_squeeze: if True, logits is of shape [B, C], if false logits is
160 | of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
161 | To use this parameter, the input images must be smaller than 300x300
162 | pixels, in which case the output logit layer does not contain spatial
163 | information and can be removed.
164 | reuse: whether or not the network and its variables should be reused. To be
165 | able to reuse 'scope' must be given.
166 | scope: Optional variable_scope.
167 |
168 |
169 | Returns:
170 | net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
171 | If global_pool is False, then height_out and width_out are reduced by a
172 | factor of output_stride compared to the respective height_in and width_in,
173 | else both height_out and width_out equal one. If num_classes is None, then
174 | net is the output of the last ResNet block, potentially after global
175 | average pooling. If num_classes is not None, net contains the pre-softmax
176 | activations.
177 | end_points: A dictionary from components of the network to the corresponding
178 | activation.
179 |
180 | Raises:
181 | ValueError: If the target output_stride is not valid.
182 | """
183 | with tf.variable_scope(scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
184 | end_points_collection = sc.name + '_end_points'
185 | with slim.arg_scope([slim.conv2d, bottleneck,
186 | resnet_utils.stack_blocks_dense],
187 | outputs_collections=end_points_collection):
188 | with slim.arg_scope([slim.batch_norm], is_training=is_training):
189 | net = inputs
190 | if include_root_block:
191 | if output_stride is not None:
192 | if output_stride % 4 != 0:
193 | raise ValueError('The output_stride needs to be a multiple of 4.')
194 | output_stride /= 4
195 | # We do not include batch normalization or activation functions in
196 | # conv1 because the first ResNet unit will perform these. Cf.
197 | # Appendix of [2].
198 | with slim.arg_scope([slim.conv2d],
199 | activation_fn=None, normalizer_fn=None):
200 | net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
201 | net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')
202 | net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
203 | # This is needed because the pre-activation variant does not have batch
204 | # normalization or activation functions in the residual unit output. See
205 | # Appendix of [2].
206 | net = slim.batch_norm(net, activation_fn=tf.nn.relu, scope='postnorm')
207 | if global_pool:
208 | # Global average pooling.
209 | net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
210 | if num_classes is not None:
211 | net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
212 | normalizer_fn=None, scope='logits')
213 | if spatial_squeeze:
214 | net = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
215 | # Convert end_points_collection into a dictionary of end_points.
216 | end_points = slim.utils.convert_collection_to_dict(
217 | end_points_collection)
218 | if num_classes is not None:
219 | end_points['predictions'] = slim.softmax(net, scope='predictions')
220 | return net, end_points
221 | resnet_v2.default_image_size = 224
222 |
223 |
224 | def resnet_v2_block(scope, base_depth, num_units, stride):
225 | """Helper function for creating a resnet_v2 bottleneck block.
226 |
227 | Args:
228 | scope: The scope of the block.
229 | base_depth: The depth of the bottleneck layer for each unit.
230 | num_units: The number of units in the block.
231 | stride: The stride of the block, implemented as a stride in the last unit.
232 | All other units have stride=1.
233 |
234 | Returns:
235 | A resnet_v2 bottleneck block.
236 | """
237 | return resnet_utils.Block(scope, bottleneck, [{
238 | 'depth': base_depth * 4,
239 | 'depth_bottleneck': base_depth,
240 | 'stride': 1
241 | }] * (num_units - 1) + [{
242 | 'depth': base_depth * 4,
243 | 'depth_bottleneck': base_depth,
244 | 'stride': stride
245 | }])
246 | resnet_v2.default_image_size = 224
247 |
248 |
249 | def resnet_v2_50(inputs,
250 | num_classes=None,
251 | is_training=True,
252 | global_pool=True,
253 | output_stride=None,
254 | spatial_squeeze=True,
255 | reuse=None,
256 | scope='resnet_v2_50'):
257 | """ResNet-50 model of [1]. See resnet_v2() for arg and return description."""
258 | blocks = [
259 | resnet_v2_block('block1', base_depth=64, num_units=3, stride=2),
260 | resnet_v2_block('block2', base_depth=128, num_units=4, stride=2),
261 | resnet_v2_block('block3', base_depth=256, num_units=6, stride=2),
262 | resnet_v2_block('block4', base_depth=512, num_units=3, stride=1),
263 | ]
264 | return resnet_v2(inputs, blocks, num_classes, is_training=is_training,
265 | global_pool=global_pool, output_stride=output_stride,
266 | include_root_block=True, spatial_squeeze=spatial_squeeze,
267 | reuse=reuse, scope=scope)
268 | resnet_v2_50.default_image_size = resnet_v2.default_image_size
269 |
270 |
271 | def resnet_v2_101(inputs,
272 | num_classes=None,
273 | is_training=True,
274 | global_pool=True,
275 | output_stride=None,
276 | spatial_squeeze=True,
277 | reuse=None,
278 | scope='resnet_v2_101'):
279 | """ResNet-101 model of [1]. See resnet_v2() for arg and return description."""
280 | blocks = [
281 | resnet_v2_block('block1', base_depth=64, num_units=3, stride=2),
282 | resnet_v2_block('block2', base_depth=128, num_units=4, stride=2),
283 | resnet_v2_block('block3', base_depth=256, num_units=23, stride=2),
284 | resnet_v2_block('block4', base_depth=512, num_units=3, stride=1),
285 | ]
286 | return resnet_v2(inputs, blocks, num_classes, is_training=is_training,
287 | global_pool=global_pool, output_stride=output_stride,
288 | include_root_block=True, spatial_squeeze=spatial_squeeze,
289 | reuse=reuse, scope=scope)
290 | resnet_v2_101.default_image_size = resnet_v2.default_image_size
291 |
292 |
293 | def resnet_v2_152(inputs,
294 | num_classes=None,
295 | is_training=True,
296 | global_pool=True,
297 | output_stride=None,
298 | spatial_squeeze=True,
299 | reuse=None,
300 | scope='resnet_v2_152'):
301 | """ResNet-152 model of [1]. See resnet_v2() for arg and return description."""
302 | blocks = [
303 | resnet_v2_block('block1', base_depth=64, num_units=3, stride=2),
304 | resnet_v2_block('block2', base_depth=128, num_units=8, stride=2),
305 | resnet_v2_block('block3', base_depth=256, num_units=36, stride=2),
306 | resnet_v2_block('block4', base_depth=512, num_units=3, stride=1),
307 | ]
308 | return resnet_v2(inputs, blocks, num_classes, is_training=is_training,
309 | global_pool=global_pool, output_stride=output_stride,
310 | include_root_block=True, spatial_squeeze=spatial_squeeze,
311 | reuse=reuse, scope=scope)
312 | resnet_v2_152.default_image_size = resnet_v2.default_image_size
313 |
314 |
315 | def resnet_v2_200(inputs,
316 | num_classes=None,
317 | is_training=True,
318 | global_pool=True,
319 | output_stride=None,
320 | spatial_squeeze=True,
321 | reuse=None,
322 | scope='resnet_v2_200'):
323 | """ResNet-200 model of [2]. See resnet_v2() for arg and return description."""
324 | blocks = [
325 | resnet_v2_block('block1', base_depth=64, num_units=3, stride=2),
326 | resnet_v2_block('block2', base_depth=128, num_units=24, stride=2),
327 | resnet_v2_block('block3', base_depth=256, num_units=36, stride=2),
328 | resnet_v2_block('block4', base_depth=512, num_units=3, stride=1),
329 | ]
330 | return resnet_v2(inputs, blocks, num_classes, is_training=is_training,
331 | global_pool=global_pool, output_stride=output_stride,
332 | include_root_block=True, spatial_squeeze=spatial_squeeze,
333 | reuse=reuse, scope=scope)
334 | resnet_v2_200.default_image_size = resnet_v2.default_image_size
335 |
--------------------------------------------------------------------------------
/models/vgg.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Contains model definitions for versions of the Oxford VGG network.
16 |
17 | These model definitions were introduced in the following technical report:
18 |
19 | Very Deep Convolutional Networks For Large-Scale Image Recognition
20 | Karen Simonyan and Andrew Zisserman
21 | arXiv technical report, 2015
22 | PDF: http://arxiv.org/pdf/1409.1556.pdf
23 | ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
24 | CC-BY-4.0
25 |
26 | More information can be obtained from the VGG website:
27 | www.robots.ox.ac.uk/~vgg/research/very_deep/
28 |
29 | Usage:
30 | with slim.arg_scope(vgg.vgg_arg_scope()):
31 | outputs, end_points = vgg.vgg_a(inputs)
32 |
33 | with slim.arg_scope(vgg.vgg_arg_scope()):
34 | outputs, end_points = vgg.vgg_16(inputs)
35 |
36 | @@vgg_a
37 | @@vgg_16
38 | @@vgg_19
39 | """
40 | from __future__ import absolute_import
41 | from __future__ import division
42 | from __future__ import print_function
43 |
44 | import tensorflow as tf
45 |
46 | slim = tf.contrib.slim
47 |
48 |
49 | def vgg_arg_scope(weight_decay=0.0005):
50 | """Defines the VGG arg scope.
51 |
52 | Args:
53 | weight_decay: The l2 regularization coefficient.
54 |
55 | Returns:
56 | An arg_scope.
57 | """
58 | with slim.arg_scope([slim.conv2d, slim.fully_connected],
59 | activation_fn=tf.nn.relu,
60 | weights_regularizer=slim.l2_regularizer(weight_decay),
61 | biases_initializer=tf.zeros_initializer()):
62 | with slim.arg_scope([slim.conv2d], padding='SAME') as arg_sc:
63 | return arg_sc
64 |
65 |
66 | def vgg_a(inputs,
67 | num_classes=1000,
68 | is_training=True,
69 | dropout_keep_prob=0.5,
70 | spatial_squeeze=True,
71 | scope='vgg_a',
72 | fc_conv_padding='VALID'):
73 | """Oxford Net VGG 11-Layers version A Example.
74 |
75 | Note: All the fully_connected layers have been transformed to conv2d layers.
76 | To use in classification mode, resize input to 224x224.
77 |
78 | Args:
79 | inputs: a tensor of size [batch_size, height, width, channels].
80 | num_classes: number of predicted classes.
81 | is_training: whether or not the model is being trained.
82 | dropout_keep_prob: the probability that activations are kept in the dropout
83 | layers during training.
84 | spatial_squeeze: whether or not should squeeze the spatial dimensions of the
85 | outputs. Useful to remove unnecessary dimensions for classification.
86 | scope: Optional scope for the variables.
87 | fc_conv_padding: the type of padding to use for the fully connected layer
88 | that is implemented as a convolutional layer. Use 'SAME' padding if you
89 | are applying the network in a fully convolutional manner and want to
90 | get a prediction map downsampled by a factor of 32 as an output.
91 | Otherwise, the output prediction map will be (input / 32) - 6 in case of
92 | 'VALID' padding.
93 |
94 | Returns:
95 | the last op containing the log predictions and end_points dict.
96 | """
97 | with tf.variable_scope(scope, 'vgg_a', [inputs]) as sc:
98 | end_points_collection = sc.name + '_end_points'
99 | # Collect outputs for conv2d, fully_connected and max_pool2d.
100 | with slim.arg_scope([slim.conv2d, slim.max_pool2d],
101 | outputs_collections=end_points_collection):
102 | net = slim.repeat(inputs, 1, slim.conv2d, 64, [3, 3], scope='conv1')
103 | net = slim.max_pool2d(net, [2, 2], scope='pool1')
104 | net = slim.repeat(net, 1, slim.conv2d, 128, [3, 3], scope='conv2')
105 | net = slim.max_pool2d(net, [2, 2], scope='pool2')
106 | net = slim.repeat(net, 2, slim.conv2d, 256, [3, 3], scope='conv3')
107 | net = slim.max_pool2d(net, [2, 2], scope='pool3')
108 | net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv4')
109 | net = slim.max_pool2d(net, [2, 2], scope='pool4')
110 | net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv5')
111 | net = slim.max_pool2d(net, [2, 2], scope='pool5')
112 | # Use conv2d instead of fully_connected layers.
113 | net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
114 | net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
115 | scope='dropout6')
116 | net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
117 | net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
118 | scope='dropout7')
119 | net = slim.conv2d(net, num_classes, [1, 1],
120 | activation_fn=None,
121 | normalizer_fn=None,
122 | scope='fc8')
123 | # Convert end_points_collection into a end_point dict.
124 | end_points = slim.utils.convert_collection_to_dict(end_points_collection)
125 | if spatial_squeeze:
126 | net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
127 | end_points[sc.name + '/fc8'] = net
128 | return net, end_points
129 | vgg_a.default_image_size = 224
130 |
131 |
132 | def vgg_16(inputs,
133 | num_classes=1000,
134 | is_training=True,
135 | dropout_keep_prob=0.5,
136 | spatial_squeeze=True,
137 | scope='vgg_16',
138 | fc_conv_padding='VALID'):
139 | """Oxford Net VGG 16-Layers version D Example.
140 |
141 | Note: All the fully_connected layers have been transformed to conv2d layers.
142 | To use in classification mode, resize input to 224x224.
143 |
144 | Args:
145 | inputs: a tensor of size [batch_size, height, width, channels].
146 | num_classes: number of predicted classes.
147 | is_training: whether or not the model is being trained.
148 | dropout_keep_prob: the probability that activations are kept in the dropout
149 | layers during training.
150 | spatial_squeeze: whether or not should squeeze the spatial dimensions of the
151 | outputs. Useful to remove unnecessary dimensions for classification.
152 | scope: Optional scope for the variables.
153 | fc_conv_padding: the type of padding to use for the fully connected layer
154 | that is implemented as a convolutional layer. Use 'SAME' padding if you
155 | are applying the network in a fully convolutional manner and want to
156 | get a prediction map downsampled by a factor of 32 as an output.
157 | Otherwise, the output prediction map will be (input / 32) - 6 in case of
158 | 'VALID' padding.
159 |
160 | Returns:
161 | the last op containing the log predictions and end_points dict.
162 | """
163 | with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
164 | end_points_collection = sc.name + '_end_points'
165 | # Collect outputs for conv2d, fully_connected and max_pool2d.
166 | with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
167 | outputs_collections=end_points_collection):
168 | net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
169 | net = slim.max_pool2d(net, [2, 2], scope='pool1')
170 | net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
171 | net = slim.max_pool2d(net, [2, 2], scope='pool2')
172 | net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
173 | net = slim.max_pool2d(net, [2, 2], scope='pool3')
174 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
175 | net = slim.max_pool2d(net, [2, 2], scope='pool4')
176 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
177 | net = slim.max_pool2d(net, [2, 2], scope='pool5')
178 | # Use conv2d instead of fully_connected layers.
179 | net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
180 | net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
181 | scope='dropout6')
182 | net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
183 | net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
184 | scope='dropout7')
185 | net = slim.conv2d(net, num_classes, [1, 1],
186 | activation_fn=None,
187 | normalizer_fn=None,
188 | scope='fc8')
189 | # Convert end_points_collection into a end_point dict.
190 | end_points = slim.utils.convert_collection_to_dict(end_points_collection)
191 | if spatial_squeeze:
192 | net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
193 | end_points[sc.name + '/fc8'] = net
194 | return net, end_points
195 | vgg_16.default_image_size = 224
196 |
197 |
198 | def vgg_19(inputs,
199 | num_classes=1000,
200 | is_training=True,
201 | dropout_keep_prob=0.5,
202 | spatial_squeeze=True,
203 | scope='vgg_19',
204 | fc_conv_padding='VALID'):
205 | """Oxford Net VGG 19-Layers version E Example.
206 |
207 | Note: All the fully_connected layers have been transformed to conv2d layers.
208 | To use in classification mode, resize input to 224x224.
209 |
210 | Args:
211 | inputs: a tensor of size [batch_size, height, width, channels].
212 | num_classes: number of predicted classes.
213 | is_training: whether or not the model is being trained.
214 | dropout_keep_prob: the probability that activations are kept in the dropout
215 | layers during training.
216 | spatial_squeeze: whether or not should squeeze the spatial dimensions of the
217 | outputs. Useful to remove unnecessary dimensions for classification.
218 | scope: Optional scope for the variables.
219 | fc_conv_padding: the type of padding to use for the fully connected layer
220 | that is implemented as a convolutional layer. Use 'SAME' padding if you
221 | are applying the network in a fully convolutional manner and want to
222 | get a prediction map downsampled by a factor of 32 as an output.
223 | Otherwise, the output prediction map will be (input / 32) - 6 in case of
224 | 'VALID' padding.
225 |
226 |
227 | Returns:
228 | the last op containing the log predictions and end_points dict.
229 | """
230 | with tf.variable_scope(scope, 'vgg_19', [inputs]) as sc:
231 | end_points_collection = sc.name + '_end_points'
232 | # Collect outputs for conv2d, fully_connected and max_pool2d.
233 | with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
234 | outputs_collections=end_points_collection):
235 | net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
236 | net = slim.max_pool2d(net, [2, 2], scope='pool1')
237 | net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
238 | net = slim.max_pool2d(net, [2, 2], scope='pool2')
239 | net = slim.repeat(net, 4, slim.conv2d, 256, [3, 3], scope='conv3')
240 | net = slim.max_pool2d(net, [2, 2], scope='pool3')
241 | net = slim.repeat(net, 4, slim.conv2d, 512, [3, 3], scope='conv4')
242 | net = slim.max_pool2d(net, [2, 2], scope='pool4')
243 | net = slim.repeat(net, 4, slim.conv2d, 512, [3, 3], scope='conv5')
244 | net = slim.max_pool2d(net, [2, 2], scope='pool5')
245 | # Use conv2d instead of fully_connected layers.
246 | net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
247 | net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
248 | scope='dropout6')
249 | net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
250 | net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
251 | scope='dropout7')
252 | net = slim.conv2d(net, num_classes, [1, 1],
253 | activation_fn=None,
254 | normalizer_fn=None,
255 | scope='fc8')
256 | # Convert end_points_collection into a end_point dict.
257 | end_points = slim.utils.convert_collection_to_dict(end_points_collection)
258 | if spatial_squeeze:
259 | net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
260 | end_points[sc.name + '/fc8'] = net
261 | return net, end_points
262 | vgg_19.default_image_size = 224
263 |
264 | # Alias
265 | vgg_d = vgg_16
266 | vgg_e = vgg_19
--------------------------------------------------------------------------------