├── models
│   ├── __init__.py
│   ├── resnet_utils.py
│   ├── vgg.py
│   ├── resnet_v1.py
│   └── resnet_v2.py
├── .idea
│   ├── vcs.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── CNN-Feature-Extractor.iml
│   └── workspace.xml
├── README.md
├── constant.py
└── featureExtractor.py

--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
# import sys
#
# sys.path.insert(0, "/cs/student/projects4/ml/2016/nmutha/msc_project/code/Around360_debug/brain")

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CNN Feature Extractor

- This code extracts penultimate-layer CNN feature vectors from state-of-the-art convolutional neural network architectures trained on 1 million ImageNet images.

- The code shows an example using ResNet-152 v2, but it can easily be modified to use any of the other CNN architectures provided in the models folder, which were originally developed by the TensorFlow-Slim authors:
  - ResNet 152 v2
  - ResNet 152 v1
  - VGG16

--------------------------------------------------------------------------------
/constant.py:
--------------------------------------------------------------------------------
## params

# Reward Settings
REWARD_SALIENCY = True
REWARD_SEGMENTATION = False

# videographerA3C
EPISODE_BUFFER = 20

# Saliency
SAL_THRESHOLD = 0.9

# video360Env
ENV_HEIGHT = 400
ENV_WIDTH = 800


EYE_ROOT = "brain/"
HTML = "html/"

AROUND = "index.html"
NAMESPACE = "/test"

EQUIRECTANGULAR = 'equirectangular'
CUBEMAP = 'cubemap'

# modes
TRAIN = 'train'
TEST = 'test'

# models
RESNET152 = 'resnet152'

# ckpts
RESNET_152_CKPT = 'resnet/resnet_v2_152.ckpt'
VGG_PRETRAIN_MODEL = 'vgg16/vgg16.npy'

# model scopes
CINE = 'cinematography'
VIDRNN = 'videographyRNN_'

GLOBAL = 'global'

# videos categories
CAT1 = 'dancing'
CAT2 = 'fight'

# placeholders
FRAME_FEATURE_INPUTS = 'frame_feature_inputs'

# Dirs
PRETRAINED_ROOT = '../pretrainedWeightsRepo/'
DATA_DIR = '../segmentation_data/images/'
VIDEO_DATA_FOLDER = '../dataset/source/'
FRAMES_DATA_FOLDER = '../dataset/extracted/'

MODEL_PATH = '../pretrainedWeightsRepo/trained_model/'

--------------------------------------------------------------------------------
/featureExtractor.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import constant
from models import resnet_v2 as resnet


class FrameFeatures():
    def __init__(self, height, width, channel=3, architecture=constant.RESNET152, trainable=False):
        self.height = height
        self.width = width
        self.channel = channel
        self.inputs = tf.placeholder(shape=[None, self.height, self.width, self.channel],
                                     name=constant.FRAME_FEATURE_INPUTS,
                                     dtype=tf.float32)
        if architecture == constant.RESNET152:
            with slim.arg_scope(resnet.resnet_arg_scope()):
                self.features, _ = resnet.resnet_v2_152(inputs=self.inputs,
                                                        # num_classes=1001,
                                                        global_pool=True,
                                                        is_training=trainable)
                self.features = tf.squeeze(self.features, [1, 2])
            print('Resnet 152 model loaded.')

    def restore_diff_scope(self, sess, scope=''):
        self.sess = sess
        net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope + 'resnet_v2_152')
        assignments = []
        for i_var in net_vars:
            ckpt_var = tf.contrib.framework.load_variable(constant.PRETRAINED_ROOT + constant.RESNET_152_CKPT,
                                                          i_var.name.replace(scope, ''))
            assignments.append(i_var.assign(ckpt_var))
        sess.run(assignments)
        print('Resnet 152 checkpoints restored.')

    def restore(self, sess, scope='resnet_v2_152'):
        self.sess = sess
        self.saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope))
        self.saver.restore(sess, constant.PRETRAINED_ROOT + constant.RESNET_152_CKPT)
        print('Resnet 152 checkpoints restored.')

    def getFeatures(self, images):
        # Images: [batch, height, width, channel]
        images = images / 127.5 - 1.0
        predictedFeatures = self.sess.run(self.features, feed_dict={self.inputs: images})
        return predictedFeatures


# Example of usage
if __name__ == '__main__':
    with tf.Session() as sess:
        with tf.name_scope(constant.RESNET152) as scope:
            features = FrameFeatures(225, 225)
            features.restore(sess)
            img = tf.read_file('./download.jpg')
            img = tf.image.decode_jpeg(img, channels=3)
            img = tf.image.resize_images(img, [225, 225])
            img = tf.expand_dims(img, axis=0)

            r1 = np.argmax(features.getFeatures(img.eval()))
            print(r1)

--------------------------------------------------------------------------------
/models/resnet_utils.py:
--------------------------------------------------------------------------------
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains building blocks for various versions of Residual Networks.

Residual networks (ResNets) were proposed in:
  Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
  Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015

More variants were introduced in:
  Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
  Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016

We can obtain different ResNet variants by changing the network depth, width,
and form of residual unit. This module implements the infrastructure for
building them. Concrete ResNet units and full ResNet networks are implemented in
the accompanying resnet_v1.py and resnet_v2.py modules.

Compared to https://github.com/KaimingHe/deep-residual-networks, in the current
implementation we subsample the output activations in the last residual unit of
each block, instead of subsampling the input activations in the first residual
unit of each block. The two implementations give identical results but our
implementation is more memory efficient.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import tensorflow as tf

slim = tf.contrib.slim


class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])):
  """A named tuple describing a ResNet block.

  Its parts are:
    scope: The scope of the `Block`.
    unit_fn: The ResNet unit function which takes as input a `Tensor` and
      returns another `Tensor` with the output of the ResNet unit.
    args: A list of length equal to the number of units in the `Block`. The list
      contains one (depth, depth_bottleneck, stride) tuple for each unit in the
      block to serve as argument to unit_fn.
  """


def subsample(inputs, factor, scope=None):
  """Subsamples the input along the spatial dimensions.

  Args:
    inputs: A `Tensor` of size [batch, height_in, width_in, channels].
    factor: The subsampling factor.
    scope: Optional variable_scope.

  Returns:
    output: A `Tensor` of size [batch, height_out, width_out, channels] with the
      input, either intact (if factor == 1) or subsampled (if factor > 1).
  """
  if factor == 1:
    return inputs
  else:
    return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope)


def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None):
  """Strided 2-D convolution with 'SAME' padding.

  When stride > 1, then we do explicit zero-padding, followed by conv2d with
  'VALID' padding.

  Note that

    net = conv2d_same(inputs, num_outputs, 3, stride=stride)

  is equivalent to

    net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME')
    net = subsample(net, factor=stride)

  whereas

    net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME')

  is different when the input's height or width is even, which is why we add the
  current function. For more details, see ResnetUtilsTest.testConv2DSameEven().

  Args:
    inputs: A 4-D tensor of size [batch, height_in, width_in, channels].
    num_outputs: An integer, the number of output filters.
    kernel_size: An int with the kernel_size of the filters.
    stride: An integer, the output stride.
    rate: An integer, rate for atrous convolution.
    scope: Scope.

  Returns:
    output: A 4-D tensor of size [batch, height_out, width_out, channels] with
      the convolution output.
  """
  if stride == 1:
    return slim.conv2d(inputs, num_outputs, kernel_size, stride=1, rate=rate,
                       padding='SAME', scope=scope)
  else:
    kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1)
    pad_total = kernel_size_effective - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    inputs = tf.pad(inputs,
                    [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
    return slim.conv2d(inputs, num_outputs, kernel_size, stride=stride,
                       rate=rate, padding='VALID', scope=scope)


@slim.add_arg_scope
def stack_blocks_dense(net, blocks, output_stride=None,
                       outputs_collections=None):
  """Stacks ResNet `Blocks` and controls output feature density.

  First, this function creates scopes for the ResNet in the form of
  'block_name/unit_1', 'block_name/unit_2', etc.

  Second, this function allows the user to explicitly control the ResNet
  output_stride, which is the ratio of the input to output spatial resolution.
  This is useful for dense prediction tasks such as semantic segmentation or
  object detection.

  Most ResNets consist of 4 ResNet blocks and subsample the activations by a
  factor of 2 when transitioning between consecutive ResNet blocks. This results
  to a nominal ResNet output_stride equal to 8. If we set the output_stride to
  half the nominal network stride (e.g., output_stride=4), then we compute
  responses twice.

  Control of the output feature density is implemented by atrous convolution.

  Args:
    net: A `Tensor` of size [batch, height, width, channels].
    blocks: A list of length equal to the number of ResNet `Blocks`. Each
      element is a ResNet `Block` object describing the units in the `Block`.
    output_stride: If `None`, then the output will be computed at the nominal
      network stride. If output_stride is not `None`, it specifies the requested
      ratio of input to output spatial resolution, which needs to be equal to
      the product of unit strides from the start up to some level of the ResNet.
      For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1,
      then valid values for the output_stride are 1, 2, 6, 24 or None (which
      is equivalent to output_stride=24).
    outputs_collections: Collection to add the ResNet block outputs.

  Returns:
    net: Output tensor with stride equal to the specified output_stride.

  Raises:
    ValueError: If the target output_stride is not valid.
  """
  # The current_stride variable keeps track of the effective stride of the
  # activations. This allows us to invoke atrous convolution whenever applying
  # the next residual unit would result in the activations having stride larger
  # than the target output_stride.
  current_stride = 1

  # The atrous convolution rate parameter.
  rate = 1

  for block in blocks:
    with tf.variable_scope(block.scope, 'block', [net]) as sc:
      for i, unit in enumerate(block.args):
        if output_stride is not None and current_stride > output_stride:
          raise ValueError('The target output_stride cannot be reached.')

        with tf.variable_scope('unit_%d' % (i + 1), values=[net]):
          # If we have reached the target output_stride, then we need to employ
          # atrous convolution with stride=1 and multiply the atrous rate by the
          # current unit's stride for use in subsequent layers.
          if output_stride is not None and current_stride == output_stride:
            net = block.unit_fn(net, rate=rate, **dict(unit, stride=1))
            rate *= unit.get('stride', 1)

          else:
            net = block.unit_fn(net, rate=1, **unit)
            current_stride *= unit.get('stride', 1)
      net = slim.utils.collect_named_outputs(outputs_collections, sc.name, net)

  if output_stride is not None and current_stride != output_stride:
    raise ValueError('The target output_stride cannot be reached.')

  return net


def resnet_arg_scope(weight_decay=0.0001,
                     batch_norm_decay=0.997,
                     batch_norm_epsilon=1e-5,
                     batch_norm_scale=True,
                     activation_fn=tf.nn.relu,
                     use_batch_norm=True):
  """Defines the default ResNet arg scope.

  TODO(gpapan): The batch-normalization related default values above are
    appropriate for use in conjunction with the reference ResNet models
    released at https://github.com/KaimingHe/deep-residual-networks. When
    training ResNets from scratch, they might need to be tuned.

  Args:
    weight_decay: The weight decay to use for regularizing the model.
    batch_norm_decay: The moving average decay when estimating layer activation
      statistics in batch normalization.
    batch_norm_epsilon: Small constant to prevent division by zero when
      normalizing activations by their variance in batch normalization.
    batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
      activations in the batch normalization layer.
    activation_fn: The activation function which is used in ResNet.
    use_batch_norm: Whether or not to use batch normalization.

  Returns:
    An `arg_scope` to use for the resnet models.
  """
  batch_norm_params = {
      'decay': batch_norm_decay,
      'epsilon': batch_norm_epsilon,
      'scale': batch_norm_scale,
      'updates_collections': tf.GraphKeys.UPDATE_OPS,
  }

  with slim.arg_scope(
      [slim.conv2d],
      weights_regularizer=slim.l2_regularizer(weight_decay),
      weights_initializer=slim.variance_scaling_initializer(),
      activation_fn=activation_fn,
      normalizer_fn=slim.batch_norm if use_batch_norm else None,
      normalizer_params=batch_norm_params):
    with slim.arg_scope([slim.batch_norm], **batch_norm_params):
      # The following implies padding='SAME' for pool1, which makes feature
      # alignment easier for dense prediction tasks. This is also used in
      # https://github.com/facebook/fb.resnet.torch. However the accompanying
      # code of 'Deep Residual Learning for Image Recognition' uses
      # padding='VALID' for pool1. You can switch to that choice by setting
      # slim.arg_scope([slim.max_pool2d], padding='VALID').
      with slim.arg_scope([slim.max_pool2d], padding='SAME') as arg_sc:
        return arg_sc
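
Usage note (a minimal sketch, not a file from this repository): the `__main__` example in featureExtractor.py builds TensorFlow image ops and calls .eval() just to load a single JPEG. The sketch below shows one alternative way to drive FrameFeatures on a batch of frames preprocessed with NumPy/Pillow instead. The frame paths and the 224x224 input size are illustrative assumptions, and it presumes TensorFlow 1.x with tf.contrib.slim, Pillow, and the ResNet-152 v2 checkpoint referenced in constant.py are available.

# Hypothetical usage sketch (not part of the repository).
import numpy as np
import tensorflow as tf
from PIL import Image

from featureExtractor import FrameFeatures

# Illustrative placeholder paths; replace with real extracted video frames.
image_paths = ['frames/frame_0001.jpg', 'frames/frame_0002.jpg']

with tf.Session() as sess:
    # Build the ResNet-152 v2 graph with a [None, 224, 224, 3] input placeholder.
    extractor = FrameFeatures(height=224, width=224)
    extractor.restore(sess)

    # Decode and resize the frames outside the graph, then stack into one batch.
    # getFeatures() rescales raw 0-255 pixel values to [-1, 1] internally.
    batch = np.stack([
        np.asarray(Image.open(p).convert('RGB').resize((224, 224)), dtype=np.float32)
        for p in image_paths
    ])

    features = extractor.getFeatures(batch)
    print(features.shape)  # expected: (len(image_paths), 2048)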