├── Detector ├── RetinaNet.py ├── input_producer.py └── layers.py ├── LICENSE ├── README.md ├── results.PNG ├── test.py ├── tfrecord ├── tfrecord_VOC.py └── tfrecord_utils.py ├── train.py ├── utils ├── bbox.py └── preprocess.py └── weights └── readme.md /Detector/RetinaNet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import collections 3 | 4 | from Detector.layers import * 5 | from utils.bbox import iou, change_box_order, box_iou 6 | from utils.preprocess import * 7 | 8 | #from Detector import Network 9 | from tensorflow.contrib import learn 10 | from Detector.input_producer import InputProducer 11 | 12 | FLAGS = tf.app.flags.FLAGS 13 | slim = tf.contrib.slim 14 | resnet_version = {"resnet50": [3, 4, 6, 3], 15 | "resnet101": [3, 4, 23, 3], 16 | "resnet152": [3, 8, 36, 3], 17 | "se-resnet50": [3, 4, 6, 3], 18 | "se-resnet101": [3, 4, 23, 3]} 19 | 20 | class RetinaNet(): 21 | def __init__(self, backbone, loss_fn=None): 22 | #super().__init__(out_charset) 23 | 24 | # Set tune scope 25 | self.scope="resnet_model|FPN|head" 26 | 27 | assert backbone in resnet_version.keys() 28 | self.backbone = backbone 29 | 30 | self.use_se_block = "se-resnet" in backbone 31 | 32 | self.input_size = FLAGS.input_size 33 | self.input_shape = np.array([self.input_size, self.input_size]) 34 | 35 | self.num_classes = FLAGS.num_classes 36 | 37 | self.use_bn = FLAGS.use_bn 38 | self.probability = 0.01 39 | self.cls_thresh = FLAGS.cls_thresh 40 | self.nms_thresh = FLAGS.nms_thresh 41 | self.max_detect = FLAGS.max_detect 42 | 43 | self.anchor_areas = [32*32., 64*64., 128*128., 256*256., 512*512.] # p3 -> p7 44 | self.aspect_ratios = [1/2., 1/1., 2/1.] 45 | self.scale_ratios = [1., pow(2,1/3.), pow(2,2/3.)] 46 | self.num_anchors = len(self.aspect_ratios) * len(self.scale_ratios) 47 | self.anchor_boxes = self._get_anchor_boxes() 48 | 49 | print("backbone : ", self.backbone) 50 | print("use_bn : ", self.use_bn) 51 | print("use_se_block : ", self.use_se_block) 52 | print("input_size : ", self.input_size) 53 | print("num_classes : ", self.num_classes) 54 | 55 | def preprocess_image(self, image, boxes, labels, is_train=True): 56 | """ pre-process / Augmentation """ 57 | if is_train: 58 | image, boxes, labels = distorted_bounding_box_crop(image, boxes, labels) 59 | 60 | image, boxes = random_horizontal_flip(image, boxes) 61 | image, boxes = random_vertical_flip(image, boxes) 62 | 63 | image, boxes = resize_image_and_boxes(image, boxes, self.input_size) 64 | image = normalize_image(image) 65 | 66 | image = random_adjust_brightness(image) 67 | image = random_adjust_contrast(image) 68 | image = random_adjust_hue(image) 69 | image = random_adjust_saturation(image) 70 | 71 | else: 72 | image, boxes, labels = distorted_bounding_box_crop(image, boxes, labels) 73 | 74 | image, boxes = resize_image_and_boxes(image, boxes, self.input_size) 75 | image = normalize_image(image) 76 | 77 | return image, boxes, labels 78 | 79 | def get_logits(self, inputs, mode, **kwargs): 80 | """Get RetinaNet logits(output)""" 81 | features_resnet = self.resnet(inputs, mode, self.use_bn) 82 | features = self.FPN(features_resnet, mode) 83 | 84 | with tf.variable_scope("head"): 85 | box_subnet = [] 86 | class_subnet = [] 87 | for n, feature in enumerate(features): 88 | _box = self.head(feature, self.num_anchors * 4, "C%d_loc_head" % (n+3)) # add linear? 
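# NOTE: a separate loc/cls head is built for every pyramid level (variable scopes C3..C7); the reshapes below flatten each level to [batch, H*W*num_anchors, 4] and [batch, H*W*num_anchors, num_classes] before all levels are concatenated along the anchor axis.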
89 | _class = self.head(feature, self.num_anchors * self.num_classes, "C%d_cls_head" % (n+3)) 90 | 91 | _box = tf.reshape(_box, [FLAGS.batch_size, -1, 4]) 92 | _class = tf.reshape(_class, [FLAGS.batch_size, -1, self.num_classes]) 93 | 94 | box_subnet.append(_box) 95 | class_subnet.append(_class) 96 | 97 | logits = tf.concat(box_subnet, axis=1), tf.concat(class_subnet, axis=1) 98 | 99 | return logits 100 | 101 | 102 | def resnet(self, inputs, mode, use_bn): 103 | """Build convolutional network layers attached to the given input tensor""" 104 | training = (mode == learn.ModeKeys.TRAIN) and not FLAGS.bn_freeze 105 | 106 | blocks = resnet_version[self.backbone] 107 | 108 | with tf.variable_scope("resnet_model"): 109 | ## stage 1 110 | C1 = conv_layer(inputs, 64, kernel_size=7, strides=2) 111 | C1 = norm_layer(C1, training, use_bn) 112 | C1 = pool_layer(C1, (3, 3), stride=(2, 2)) 113 | 114 | ## stage2 115 | C2 = res_block(C1, [64, 64, 256], training, use_bn, self.use_se_block, strides=1, downsample=True) 116 | for i in range(blocks[0] - 1): 117 | C2 = res_block(C2, [64, 64, 256], training, use_bn, self.use_se_block) 118 | 119 | ## stage3 120 | C3 = res_block(C2, [128, 128, 512], training, use_bn, self.use_se_block, strides=2, downsample=True) 121 | for i in range(blocks[1] - 1): 122 | C3 = res_block(C3, [128, 128, 512], training, use_bn, self.use_se_block) 123 | 124 | ## stage4 125 | C4 = res_block(C3, [256, 256, 1024], training, use_bn, self.use_se_block, strides=2, downsample=True) 126 | for i in range(blocks[2] - 1): 127 | C4 = res_block(C4, [256, 256, 1024], training, use_bn, self.use_se_block) 128 | 129 | ## stage5 130 | C5 = res_block(C4, [512, 512, 2048], training, use_bn, self.use_se_block, strides=2, downsample=True) 131 | for i in range(blocks[3] - 1): 132 | C5 = res_block(C5, [512, 512, 2048], training, use_bn, self.use_se_block) 133 | 134 | return [None, C1, C2, C3, C4, C5] 135 | 136 | def FPN(self, C, mode): 137 | 138 | with tf.variable_scope("FPN"): #TO do... 
check FPN for ReinaNet 139 | P5 = conv_layer(C[5], 256, kernel_size=1) 140 | P4 = upsampling(P5, size=(2, 2)) + conv_layer(C[4], 256, kernel_size=1) 141 | P4 = conv_layer(P4, 256, kernel_size=3) 142 | 143 | P3 = upsampling(P4, size=(2, 2)) + conv_layer(C[3], 256, kernel_size=1) 144 | P3 = conv_layer(P3, 256, kernel_size=3) 145 | 146 | P6 = conv_layer(C[5], 256, kernel_size=3, strides=2) 147 | P7 = relu(P6) 148 | P7 = conv_layer(P7, 256, kernel_size=3, strides=2) 149 | 150 | return P3, P4, P5, P6, P7 151 | 152 | def head(self, feature, out, scope): 153 | with tf.variable_scope(scope): 154 | _kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.01) 155 | for _ in range(4): 156 | feature = conv_layer(feature, 256, kernel_size=3, use_bias=False, kernel_initializer=_kernel_initializer) 157 | feature = relu(feature) 158 | 159 | if "cls" in scope: #cls_subnet 160 | #feature = conv_layer(feature, out, kernel_size=3, kernel_initializer=tf.zeros_initializer()) # cls subnet -> bias = -log((1-pi)/pi) , pi=0.01 161 | feature = conv_layer(feature, out, kernel_size=3, kernel_initializer=_kernel_initializer) 162 | bias_initial = tf.ones(out, dtype=tf.float32) * -tf.log((1 - self.probability) / self.probability) 163 | feature = tf.nn.bias_add(feature, bias_initial) 164 | 165 | elif "loc" in scope: #loc_subnet 166 | feature = conv_layer(feature, out, kernel_size=3, kernel_initializer=_kernel_initializer) 167 | return feature 168 | 169 | def get_loss(self, y_pred, y_true, alpha=0.25, gamma=2.0): 170 | 171 | def regression_loss(pred_boxes, gt_boxes, weights=1.0): 172 | """Regression loss (Smooth L1 loss (=huber loss)) 173 | pred_boxes: [# anchors, 4] 174 | gt_boxes: [# anchors, 4] 175 | weights: Tensor of weights multiplied by loss with shape [# anchors] 176 | """ 177 | #loc_loss = tf.losses.huber_loss(labels=gt_boxes, predictions=pred_boxes, 178 | #weights=weights, scope='box_loss') 179 | #return loc_loss 180 | x = tf.abs(pred_boxes-gt_boxes) 181 | x = tf.where(tf.less(x, 1.0), 0.5*x**2, x-0.5) 182 | x = tf.reduce_sum(x) 183 | return x 184 | 185 | def focal_loss(preds_cls, gt_cls, 186 | alpha=0.25, gamma=2.0, name=None, scope=None): 187 | """Compute sigmoid focal loss between logits and onehot labels""" 188 | 189 | #with tf.name_scope(scope, 'focal_loss', [preds_cls_onehot, gt_cls_onehot]) as sc: 190 | #gt_cls = tf.one_hot(indices=gt_cls - 1, depth=FLAGS.num_classes, dtype=tf.float32) 191 | gt_cls = tf.one_hot(gt_cls, FLAGS.num_classes+1, dtype=tf.float32) 192 | gt_cls = gt_cls[:, 1:] 193 | 194 | preds_cls = tf.nn.sigmoid(preds_cls) 195 | # cross-entropy -> if y=1 : pt=p / otherwise : pt=1-p 196 | predictions_pt = tf.where(tf.equal(gt_cls, 1.0), preds_cls, 1.0 - preds_cls) 197 | 198 | # add small value to avoid 0 199 | epsilon = 1e-8 200 | alpha_t = tf.scalar_mul(alpha, tf.ones_like(predictions_pt, dtype=tf.float32)) 201 | alpha_t = tf.where(tf.equal(gt_cls, 1.0), alpha_t, 1.0 - alpha_t) 202 | gamma_t = tf.scalar_mul(gamma, tf.ones_like(predictions_pt, tf.float32)) 203 | 204 | focal_losses = alpha_t * (-tf.pow(1.0 - predictions_pt, gamma_t) * tf.log(predictions_pt)) 205 | #focal_losses = alpha_t * tf.pow(1. 
- predictions_pt, gamma) * -tf.log(predictions_pt + epsilon) 206 | focal_losses = tf.reduce_sum(focal_losses, axis=1) 207 | return focal_losses 208 | 209 | loc_preds, cls_preds = y_pred 210 | loc_gt, cls_gt = y_true 211 | 212 | # number of positive anchors 213 | valid_anchor_indices = tf.where(tf.greater(cls_gt, 0)) 214 | gt_anchor_nums = tf.shape(valid_anchor_indices)[0] 215 | 216 | """Location Regression loss""" 217 | # skip negative and ignored anchors 218 | valid_loc_preds = tf.gather_nd(loc_preds, valid_anchor_indices) 219 | valid_loc_gt = tf.gather_nd(loc_gt, valid_anchor_indices) 220 | 221 | loc_loss = regression_loss(valid_loc_preds, valid_loc_gt) 222 | loc_loss = tf.truediv(tf.reduce_sum(loc_loss), tf.to_float(gt_anchor_nums)) 223 | 224 | """Classification loss""" 225 | valid_cls_indices = tf.where(tf.greater(cls_gt, -1)) 226 | 227 | # skip ignored anchors (iou belong to 0.4 to 0.5) 228 | valid_cls_preds = tf.gather_nd(cls_preds, valid_cls_indices) 229 | valid_cls_gt = tf.gather_nd(cls_gt, valid_cls_indices) 230 | 231 | cls_loss = focal_loss(valid_cls_preds, valid_cls_gt) 232 | cls_loss = tf.truediv(tf.reduce_sum(cls_loss), tf.to_float(gt_anchor_nums)) 233 | 234 | """Variables""" 235 | scope = self.scope or FLAGS.tune_scope 236 | scope = '|'.join(['train_tower_[0-9]+/' + s for s in scope.split('|')]) 237 | 238 | tvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope) 239 | extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 240 | 241 | return loc_loss, cls_loss, tvars, extra_update_ops 242 | 243 | def _get_anchor_hw(self): 244 | 245 | anchor_hw = [] 246 | for s in self.anchor_areas: 247 | for ar in self.aspect_ratios: # w/h = ar 248 | h = np.sqrt(s/ar) 249 | w = ar * h 250 | for sr in self.scale_ratios: # scale 251 | anchor_h = h*sr 252 | anchor_w = w*sr 253 | anchor_hw.append([anchor_h, anchor_w]) 254 | num_fms = len(self.anchor_areas) 255 | anchor_hw = np.array(anchor_hw) 256 | return anchor_hw.reshape(num_fms, -1, 2) 257 | 258 | def _get_anchor_boxes(self): 259 | anchor_hw = self._get_anchor_hw() 260 | num_fms = len(self.anchor_areas) 261 | fm_sizes = [np.ceil(self.input_shape/pow(2.,i+3)) for i in range(num_fms)] # p3 -> p7 feature map sizes 262 | 263 | boxes = [] 264 | for i in range(num_fms): 265 | fm_size = fm_sizes[i] 266 | grid_size = self.input_shape / fm_size 267 | fm_h, fm_w = int(fm_size[0]), int(fm_size[1]) # fm_h == fm_w : True 268 | 269 | meshgrid_x = (np.arange(0, fm_w) + 0.5) * grid_size[0] 270 | meshgrid_y = (np.arange(0, fm_h) + 0.5) * grid_size[1] 271 | meshgrid_x, meshgrid_y = np.meshgrid(meshgrid_x, meshgrid_y) 272 | 273 | yx = np.vstack((meshgrid_y.ravel(), meshgrid_x.ravel())).transpose() 274 | yx = np.tile(yx.reshape((fm_h, fm_w, 1, 2)), (9, 1)) 275 | 276 | hw = np.tile(anchor_hw[i].reshape(1, 1, 9, 2), (fm_h, fm_w, 1, 1)) 277 | box = np.concatenate([yx, hw], 3) # [y,x,h,w] 278 | boxes.append(box.reshape(-1,4)) 279 | 280 | return tf.cast(tf.concat(boxes, 0), tf.float32) 281 | 282 | def encode(self, boxes, labels): 283 | """boxes : yxyx , anchor_boxes : yxhw""" 284 | ious = iou(self.anchor_boxes, boxes) 285 | 286 | max_ids = tf.argmax(ious, axis=1, name="encode_argmax") 287 | max_ious = tf.reduce_max(ious, axis=1) 288 | 289 | boxes = tf.gather(boxes, max_ids) 290 | boxes = change_box_order(boxes, "yxyx2yxhw") 291 | 292 | loc_yx = (boxes[:, :2] - self.anchor_boxes[:, :2]) / self.anchor_boxes[:, 2:] 293 | loc_hw = tf.log(boxes[:, 2:] / self.anchor_boxes[:, 2:]) 294 | 295 | loc_targets = tf.concat([loc_yx, loc_hw], 1) 296 | 
cls_targets = 1 + tf.gather(labels, max_ids) # labels : (0~19) + 1 -> (1~20) 297 | #cls_targets = tf.gather(labels, max_ids) # VOC labels 1~20 298 | 299 | # iou < 0.4 : background(0) / 0.4 < iou < 0.5 : ignore(-1) 300 | cls_targets = tf.where(tf.less(max_ious, 0.5), -tf.ones_like(cls_targets), cls_targets) 301 | cls_targets = tf.where(tf.less(max_ious, 0.4), tf.zeros_like(cls_targets), cls_targets) 302 | 303 | return loc_targets, cls_targets 304 | 305 | def decode(self, loc_preds, cls_preds): 306 | if len(loc_preds.shape.as_list()) == 3: 307 | loc_preds = tf.squeeze(loc_preds, 0) 308 | cls_preds = tf.squeeze(cls_preds, 0) 309 | 310 | if loc_preds.dtype != tf.float32: 311 | loc_preds = tf.cast(loc_preds, tf.float32) 312 | 313 | loc_yx = loc_preds[:, :2] 314 | loc_hw = loc_preds[:, 2:] 315 | 316 | yx = loc_yx * self.anchor_boxes[:, 2:] + self.anchor_boxes[:, :2] 317 | hw = tf.exp(loc_hw) * self.anchor_boxes[:, 2:] 318 | 319 | boxes = tf.concat([yx-hw/2, yx+hw/2], axis=1) # [#anchors,4], yxyx 320 | boxes = tf.clip_by_value(boxes, 0, self.input_size) 321 | 322 | cls_preds = tf.nn.sigmoid(cls_preds) 323 | labels = tf.argmax(cls_preds, axis=1, name="decode_argmax") 324 | score = tf.reduce_max(cls_preds, axis=1) 325 | 326 | ids = tf.where(score > self.cls_thresh) 327 | ids = tf.squeeze(ids, axis=1) 328 | 329 | boxes = tf.gather(boxes, ids) 330 | score = tf.gather(score, ids) 331 | labels = tf.gather(labels, ids) 332 | 333 | keep = tf.image.non_max_suppression(boxes, score, self.max_detect, self.nms_thresh) 334 | 335 | boxes = tf.gather(boxes, keep) 336 | labels = tf.gather(labels, keep) 337 | score = tf.gather(score, keep) 338 | 339 | return boxes, labels, score 340 | 341 | def get_input(self, 342 | is_train=True, 343 | num_gpus=1): 344 | input_features = [] 345 | 346 | InputFeatures = collections.namedtuple('InputFeatures', ('image', 'loc', 'cls')) 347 | input_producer = InputProducer() 348 | for gpu_indx in range(num_gpus): 349 | with tf.device('/gpu:%d' % gpu_indx): 350 | if is_train: 351 | split_name = 'train_14125' 352 | batch_size = FLAGS.batch_size 353 | else: 354 | split_name = 'val_3000' 355 | batch_size = FLAGS.valid_batch_size 356 | 357 | dataset = input_producer.get_split(split_name, FLAGS.train_path) 358 | 359 | provider = slim.dataset_data_provider.DatasetDataProvider( 360 | dataset, 361 | num_readers=FLAGS.num_input_threads, 362 | common_queue_capacity=20 * batch_size, 363 | common_queue_min=10 * batch_size, 364 | shuffle=True) 365 | _images, _bboxes, _labels = provider.get(['image', 'object/bbox', 'object/label']) 366 | 367 | # pre-processing & encode 368 | _images, _bboxes, _labels = self.preprocess_image(_images, _bboxes, _labels, is_train) 369 | 370 | _bboxes, _labels = self.encode(_bboxes, _labels) 371 | 372 | #images, bboxes, labels = tf.train.batch( 373 | # [_images, _bboxes, _labels], 374 | # batch_size=batch_size, 375 | # num_threads=FLAGS.num_input_threads, 376 | # capacity=2 * batch_size) 377 | 378 | images, bboxes, labels = tf.train.shuffle_batch( 379 | [_images, _bboxes, _labels], 380 | batch_size=batch_size, 381 | num_threads=FLAGS.num_input_threads, 382 | capacity=20*batch_size, 383 | min_after_dequeue=10*batch_size) 384 | 385 | input_features.append(InputFeatures(images, bboxes, labels)) 386 | return input_features 387 | -------------------------------------------------------------------------------- /Detector/input_producer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | 4 | slim = 
tf.contrib.slim 5 | 6 | class InputProducer(object): 7 | 8 | def __init__(self, preprocess_image_fn=None, vertical_image=False): 9 | self.vertical_image = vertical_image 10 | self._preprocess_image = preprocess_image_fn if preprocess_image_fn is not None \ 11 | else self._default_preprocess_image_fn 12 | 13 | self.ITEMS_TO_DESCRIPTIONS = { 14 | 'image': 'A color image of varying height and width.', 15 | 'shape': 'Shape of the image', 16 | 'object/bbox': 'A list of bounding boxes, one per each object.', 17 | 'object/label': 'A list of labels, one per each object.', 18 | } 19 | 20 | self.SPLITS_TO_SIZES = { 21 | 'train': 9540, 22 | 'val': 2000 23 | } 24 | #self.SPLITS_TO_SIZES = { 25 | # 'train_2000': 2000, 26 | # 'val_500': 500 27 | #} 28 | 29 | self.FILE_PATTERN = '%s.record' 30 | 31 | def num_classes(self): 32 | return 20 33 | 34 | def get_split(self, split_name, dataset_dir): 35 | """Gets a dataset tuple with instructions for reading Pascal VOC dataset. 36 | Args: 37 | split_name: A train/test split name. 38 | dataset_dir: The base directory of the dataset sources. 39 | file_pattern: The file pattern to use when matching the dataset sources. 40 | It is assumed that the pattern contains a '%s' string so that the split 41 | name can be inserted. 42 | reader: The TensorFlow reader type. 43 | Returns: 44 | A `Dataset` namedtuple. 45 | Raises: 46 | ValueError: if `split_name` is not a valid train/test split. 47 | """ 48 | if split_name not in self.SPLITS_TO_SIZES: 49 | raise ValueError('split name %s was not recognized.' % split_name) 50 | 51 | file_pattern = os.path.join(dataset_dir, self.FILE_PATTERN % split_name) 52 | 53 | reader = tf.TFRecordReader 54 | 55 | # Features in Pascal VOC TFRecords. 56 | keys_to_features = { 57 | 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), 58 | 'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'), 59 | #'image/height': tf.FixedLenFeature([1], tf.int64), 60 | #'image/width': tf.FixedLenFeature([1], tf.int64), 61 | #'image/channels': tf.FixedLenFeature([1], tf.int64), 62 | #'image/shape': tf.FixedLenFeature([3], tf.int64), 63 | 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32), 64 | 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32), 65 | 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32), 66 | 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32), 67 | 'image/object/class/label': tf.VarLenFeature(dtype=tf.int64), 68 | #'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64), 69 | #'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64), 70 | } 71 | items_to_handlers = { 72 | 'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'), 73 | #'shape': slim.tfexample_decoder.Tensor('image/shape'), 74 | 'object/bbox': slim.tfexample_decoder.BoundingBox( 75 | ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'), 76 | 'object/label': slim.tfexample_decoder.Tensor('image/object/class/label'), 77 | #'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'), 78 | #'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'), 79 | } 80 | decoder = slim.tfexample_decoder.TFExampleDecoder( 81 | keys_to_features, items_to_handlers) 82 | 83 | labels_to_names = None 84 | #if has_labels(dataset_dir): 85 | # labels_to_names = read_label_file(dataset_dir) 86 | 87 | return slim.dataset.Dataset( 88 | data_sources=file_pattern, 89 | reader=reader, 90 | decoder=decoder, 91 | num_samples=self.SPLITS_TO_SIZES[split_name], 92 | 
items_to_descriptions=self.ITEMS_TO_DESCRIPTIONS, 93 | num_classes=self.num_classes(), 94 | labels_to_names=labels_to_names) 95 | 96 | def _default_preprocess_image_fn(self, image, is_train=True): 97 | return image 98 | -------------------------------------------------------------------------------- /Detector/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib import learn 3 | 4 | def se_block(bottom, ratio=16): 5 | weight_initializer = tf.contrib.layers.variance_scaling_initializer() 6 | bias_initializer = tf.constant_initializer(value=0.0) 7 | 8 | # Bottom [N,H,W,C] 9 | # Global average pooling 10 | #with tf.variable_scope("se_block"): 11 | 12 | channel = bottom.get_shape()[-1] 13 | se = tf.reduce_mean(bottom, axis=[1,2], keepdims=True) 14 | assert se.get_shape()[1:] == (1,1,channel) 15 | se = tf.layers.dense(se, channel//ratio, activation=tf.nn.relu, 16 | kernel_initializer=weight_initializer, 17 | bias_initializer=bias_initializer) 18 | assert se.get_shape()[1:] == (1,1,channel//ratio) 19 | se = tf.layers.dense(se, channel, activation=tf.nn.sigmoid, 20 | kernel_initializer=weight_initializer, 21 | bias_initializer=bias_initializer) 22 | assert se.get_shape()[1:] == (1,1,channel) 23 | top = bottom * se 24 | 25 | return top 26 | 27 | 28 | def res_block(bottom, filters, training, use_bn, use_se_block, strides=1, downsample=False): 29 | 30 | path_2 = bottom 31 | 32 | # conv 1x1 33 | path_1 = conv_layer(bottom, filters[0], kernel_size=1) 34 | path_1 = norm_layer(path_1, training, use_bn) 35 | path_1 = relu(path_1) # activation? 36 | 37 | # conv 3x3 38 | path_1 = conv_layer(path_1, filters[1], kernel_size=3, strides=strides) 39 | path_1 = norm_layer(path_1, training, use_bn) 40 | path_1 = relu(path_1) 41 | 42 | # conv 1x1 43 | path_1 = conv_layer(path_1, filters[2], kernel_size=1) 44 | path_1 = norm_layer(path_1, training, use_bn) 45 | 46 | if use_se_block: 47 | path_1 = se_block(path_1) 48 | 49 | if downsample: 50 | # shortcut 51 | path_2 = conv_layer(path_2, filters[2], kernel_size=1, strides=strides) 52 | path_2 = norm_layer(path_2, training, use_bn) 53 | 54 | top = path_1 + path_2 55 | top = relu(top) 56 | return top 57 | 58 | 59 | def conv_layer(bottom, filters, kernel_size, name=None, 60 | strides=1, padding='same', use_bias=False, kernel_initializer=None): 61 | """Build a convolutional layer using entry from layer_params)""" 62 | if kernel_initializer is None: 63 | kernel_initializer = tf.contrib.layers.variance_scaling_initializer() 64 | 65 | if strides is not 1: 66 | padding = 'valid' 67 | pad_total = kernel_size - 1 68 | pad_beg = pad_total // 2 69 | pad_end = pad_total - pad_beg 70 | bottom = tf.pad(bottom, [[0, 0], [pad_beg, pad_end], 71 | [pad_beg, pad_end], [0, 0]]) 72 | 73 | bias_initializer = tf.constant_initializer(value=0.0) 74 | 75 | top = tf.layers.conv2d(bottom, 76 | filters=filters, 77 | kernel_size=kernel_size, 78 | strides=strides, 79 | padding=padding, 80 | kernel_initializer=kernel_initializer, 81 | bias_initializer=bias_initializer, 82 | use_bias=use_bias, 83 | name=name) 84 | return top 85 | 86 | 87 | def pool_layer(bottom, pool, stride, name=None, padding='same'): 88 | """Short function to build a pooling layer with less syntax""" 89 | top = tf.layers.max_pooling2d( bottom, pool, stride, 90 | padding=padding, 91 | name=name) 92 | return top 93 | 94 | 95 | def relu(bottom, name=None): 96 | """ Relu actication Function""" 97 | top = tf.nn.relu(bottom, name=name) 98 | 
return top 99 | 100 | def norm_layer(bottom, training, use_bn): 101 | if use_bn: 102 | top = tf.layers.batch_normalization( bottom, axis=3, 103 | training=training) 104 | else: 105 | top = tf.contrib.layers.group_norm(bottom, groups=32, channels_axis=3) 106 | 107 | return top 108 | 109 | 110 | def upsampling(bottom, size, name=None): 111 | """Bilinear Upsampling""" 112 | 113 | out_shape = tf.shape(bottom)[1:3] * tf.constant(size) 114 | top = tf.image.resize_bilinear(bottom, out_shape, align_corners=True, name=name) 115 | return top 116 | 117 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Beomyoung Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RetinaNet_tensorflow 2 | For easier and more readable tensorflow codes 3 | 4 | ## How to use 5 | - For Trainig (recommend to use the default parameters) 6 | ``` 7 | python tfrecord/tfrecord_VOC.py 8 | CUDA_VISIBLE_DEVICES=0,1 python train.py 9 | ``` 10 | - For Testing (recommend to use the default parameters) 11 | ``` 12 | CUDA_VISIBLE_DEVICES=0 python test.py 13 | ``` 14 | 15 | ## Results 16 | 17 | ![screensh](https://github.com/qjadud1994/RetinaNet_tensorflow/blob/master/results.PNG) 18 | 19 | ## Todo list: 20 | - [x] multi-gpu code 21 | - [x] Training visualize using Tensorboard 22 | - [x] validation output image visualization using Tensorboard 23 | - [x] Choose BatchNorm model or GroupNorm model 24 | - [x] Choose Trainable BatchNorm(not working!) 
or Freeze BatchNorm 25 | - [x] (BatchNorm mode) Get Imagenet pre-trained weights from [resnet50.pth](https://download.pytorch.org/models/resnet50-19c8e357.pth) 26 | - [x] (GroupNorm mode) Get Imagenet pre-trained weights from [resnet50_groupnorm32.tar](http://www.cs.unc.edu/~cyfu/resnet50_groupnorm32.tar) 27 | - [x] tf.train.batch -> tf.train.shuffle_batch 28 | - [x] add augmentation ( + random crop) 29 | - [x] use SE-resnet backbone 30 | - [ ] add evaluation (mAP) code 31 | - [ ] change upsample function for 600x600 input 32 | - [ ] Training/Validation Error ( % value) 33 | 34 | 35 | 36 | ## Description 37 | | File | Description | 38 | |----------------|--------------------------------------------------| 39 | |train.py | Train RetinaNet | 40 | |test.py | Inference RetinaNet | 41 | |tfrecord/tfrecord_VOC.py | Make VOC tfrecord | 42 | |Detector/layers.py | layer functions used in RetinaNet | 43 | |Detector/RetinaNet.py | Define RetinaNet | 44 | 45 | ## Environment 46 | 47 | - os : Ubuntu 16.04.4 LTS
48 | - GPU : Tesla P40 (24GB)
49 | - Python : 3.6.6
50 | - Tensorflow : 1.10.0 51 | - CUDA, CUDNN : 9.0, 7.1.3
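## Label / anchor-target convention

`tfrecord/tfrecord_VOC.py` writes 0-based class ids (`int(label_map_dict[class_name]) - 1`), and `RetinaNet.encode` shifts the matched labels to 1~20 so that 0 can stand for background and -1 for "ignore". A minimal NumPy sketch of that assignment rule (the IoU values and labels below are made up for illustration):

```python
import numpy as np

max_ious    = np.array([0.80, 0.45, 0.20])   # best IoU between each anchor and any gt box
cls_targets = np.array([5, 12, 3]) + 1       # matched gt labels, shifted from 0~19 to 1~20

# same thresholds as RetinaNet.encode
cls_targets = np.where(max_ious < 0.5, -1, cls_targets)  # 0.4 <= IoU < 0.5 -> ignore (-1)
cls_targets = np.where(max_ious < 0.4,  0, cls_targets)  # IoU < 0.4        -> background (0)

print(cls_targets)  # [ 6 -1  0]
```

During training, `RetinaNet.get_loss` uses only anchors with target > 0 for the box regression loss and drops the ignored (-1) anchors from the focal loss.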
52 | -------------------------------------------------------------------------------- /results.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qjadud1994/RetinaNet_tensorflow/018837ba8ad9e6b038e60bda3a12ccf639f8ce59/results.PNG -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import cv2, os 5 | from tensorflow.contrib import learn 6 | from PIL import Image, ImageDraw 7 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 8 | 9 | from Detector.RetinaNet import RetinaNet 10 | from utils.bbox import draw_boxes 11 | 12 | FLAGS = tf.app.flags.FLAGS 13 | 14 | tf.logging.set_verbosity(tf.logging.WARN) 15 | tf.app.flags.DEFINE_string('f', '', 'kernel') 16 | #### Input pipeline 17 | tf.app.flags.DEFINE_integer('input_size', 608, 18 | """Input size""") 19 | tf.app.flags.DEFINE_integer('batch_size', 1, 20 | """Train batch size""") 21 | tf.app.flags.DEFINE_integer('num_classes', 20, 22 | """number of classes""") 23 | tf.app.flags.DEFINE_integer('num_gpus', 1, 24 | """The number of gpu""") 25 | tf.app.flags.DEFINE_string('tune_from', 'logs_v2/new_momen2/model.ckpt-82000', 26 | """Path to pre-trained model checkpoint""") 27 | #tf.app.flags.DEFINE_string('tune_from', 'logs_v2/new_momen2/best_models/model-66000', 28 | # """Path to pre-trained model checkpoint""") 29 | 30 | #### Training config 31 | tf.app.flags.DEFINE_boolean('use_bn', True, 32 | """use batchNorm or GroupNorm""") 33 | tf.app.flags.DEFINE_float('cls_thresh', 0.4, 34 | """thresh for class""") 35 | tf.app.flags.DEFINE_float('nms_thresh', 0.3, 36 | """thresh for nms""") 37 | tf.app.flags.DEFINE_integer('max_detect', 300, 38 | """num of max detect (using in nms)""") 39 | 40 | img_dir = "/root/DB/VOC/VOC2012/JPEGImages/" 41 | train_list = open("/root/DB/VOC/VOC2012/ImageSets/Main/train.txt", "r").readlines() 42 | val_list = open("/root/DB/VOC/VOC2012/ImageSets/Main/val.txt", "r").readlines() 43 | 44 | VOC = {1 : "motorbike", 2 : "car", 3 : "person", 4 : "bus", 5 : "bird", 6 : "horse", 7 : "bicycle", 8 : "chair", 9 : "aeroplane", 10 : "diningtable", 11 : "pottedplant", 12 : "cat", 13 : "dog", 14 : "boat", 15 : "sheep", 16 : "sofa", 17 : "cow", 18 : "bottle", 19 : "tvmonitor", 20 : "train"} 45 | 46 | mode = learn.ModeKeys.INFER 47 | 48 | def _get_init_pretrained(sess): 49 | saver_reader = tf.train.Saver(tf.global_variables()) 50 | saver_reader.restore(sess, FLAGS.tune_from) 51 | 52 | with tf.Graph().as_default(): 53 | _image = tf.placeholder(tf.float32, shape=[None, None, 3], name='image') 54 | 55 | with tf.variable_scope('train_tower_0') as scope: 56 | net = RetinaNet("resnet50") 57 | 58 | image = tf.expand_dims(_image, 0) 59 | image = tf.to_float(image) 60 | image /= 255.0 61 | 62 | mean = (0.485, 0.456, 0.406) 63 | var = (0.229, 0.224, 0.225) 64 | 65 | image -= mean 66 | image /= var 67 | 68 | image = tf.image.resize_images(image, (FLAGS.input_size, FLAGS.input_size), 69 | method=tf.image.ResizeMethod.BILINEAR) 70 | 71 | print(mode) 72 | box_head, cls_head = net.get_logits(image, mode) 73 | 74 | decode = net.decode(box_head, cls_head) 75 | 76 | #restore_model = get_init_trained() 77 | init_op = tf.group( tf.global_variables_initializer(), 78 | tf.local_variables_initializer()) 79 | 80 | classes = set() 81 | with tf.Session() as sess: 82 | sess.run(init_op) 83 | 
_get_init_pretrained(sess) 84 | 85 | for n, _img in enumerate(val_list): 86 | _img = _img[:-1] + ".jpg" 87 | ori_img = Image.open(img_dir + _img) 88 | print(ori_img.size) 89 | img = ori_img.copy() 90 | 91 | box, label, score = sess.run(decode, feed_dict={_image : img}) 92 | 93 | label = [VOC[l+1] for l in label] 94 | ori_img = ori_img.resize((608, 608), Image.BILINEAR) 95 | ori_img = draw_boxes(ori_img, box, label, score) 96 | 97 | plt.figure(figsize =(12, 12)) 98 | plt.imshow(ori_img) 99 | plt.show() 100 | if n==20: 101 | break 102 | -------------------------------------------------------------------------------- /tfrecord/tfrecord_VOC.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | r"""Convert VOC format dataset to TFRecord for object_detection. 17 | For example 18 | Hollywood head dataset: 19 | See: http://www.di.ens.fr/willow/research/headdetection/ 20 | Context-aware CNNs for person head detection 21 | HDA pedestrian dataset: 22 | See: http://vislab.isr.ist.utl.pt/hda-dataset/ 23 | Example usage: 24 | ./create_tf_record_pascal_fmt --data_dir=/startdt_data/HollywoodHeads2 \ 25 | --output_dir=models/head_detector 26 | --label_map_path=data/head_label_map.pbtxt 27 | --mode=train 28 | """ 29 | 30 | import hashlib 31 | import io 32 | import os, sys 33 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))+"/..") 34 | from lxml import etree 35 | import PIL.Image 36 | import tensorflow as tf 37 | import tfrecord_utils 38 | 39 | flags = tf.app.flags 40 | flags.DEFINE_string('data_dir', '/root/DB/VOC/VOC2012/', 'Root directory to raw pet dataset, like /startdt_data/HDA_Dataset_V1.3/VOC_fmt_training_fisheye') 41 | flags.DEFINE_string('output_dir', '/root/DB/VOC/VOC2012/tfrecord', 'Path to directory to output TFRecords, like models/hda_cam_person_fisheye') 42 | flags.DEFINE_string('label_map_path', '/root/DB/VOC/VOC2012/voc_labels.xml', 43 | 'Path to label map proto, like model/deepfashion.xml') 44 | flags.DEFINE_string('mode', 'train', 'generate train or val output: train/val') 45 | FLAGS = flags.FLAGS 46 | 47 | 48 | def dict_to_tf_example(data, 49 | label_map_dict, 50 | image_subdirectory, 51 | ignore_difficult_instances=False): 52 | """Convert XML derived dict to tf.Example proto. 53 | Notice that this function normalizes the bounding box coordinates provided 54 | by the raw data. 55 | Args: 56 | data: dict holding PASCAL XML fields for a single image (obtained by 57 | running dataset_util.recursive_parse_xml_to_dict) 58 | label_map_dict: A map from string label names to integers ids. 59 | image_subdirectory: String specifying subdirectory within the 60 | Pascal dataset (here only head available) directory holding the actual image data. 
61 | ignore_difficult_instances: Whether to skip difficult instances in the 62 | dataset (default: False). 63 | Returns: 64 | example: The converted tf.Example. 65 | Raises: 66 | ValueError: if the image pointed to by data['filename'] is not a valid JPEG 67 | """ 68 | img_path = os.path.splitext(os.path.join(image_subdirectory, data['filename']))[0] + ".jpg" 69 | with tf.gfile.GFile(img_path, 'rb') as fid: 70 | encoded_jpg = fid.read() 71 | 72 | encoded_jpg_io = io.BytesIO(encoded_jpg) 73 | image = PIL.Image.open(encoded_jpg_io) 74 | if image.format != 'JPEG': 75 | raise ValueError('Image format not JPEG') 76 | if image.mode != 'RGB': 77 | image = image.convert('RGB') 78 | # generate hash key for image 79 | key = hashlib.sha256(encoded_jpg).hexdigest() 80 | 81 | width = int(data['size']['width']) 82 | height = int(data['size']['height']) 83 | 84 | xmin = [] 85 | ymin = [] 86 | xmax = [] 87 | ymax = [] 88 | classes = [] 89 | classes_text = [] 90 | difficult_obj = [] 91 | for obj in data['object']: 92 | difficult = bool(int(obj['difficult'])) 93 | if ignore_difficult_instances and difficult: 94 | continue 95 | 96 | difficult_obj.append(int(difficult)) 97 | 98 | xmin.append(float(obj['bndbox']['xmin']) / width) 99 | ymin.append(float(obj['bndbox']['ymin']) / height) 100 | xmax.append(float(obj['bndbox']['xmax']) / width) 101 | ymax.append(float(obj['bndbox']['ymax']) / height) 102 | class_name = obj['name'] 103 | classes_text.append(class_name.encode('utf8')) 104 | classes.append(int(label_map_dict[class_name])-1) 105 | 106 | example = tf.train.Example(features=tf.train.Features(feature={ 107 | 'image/height': tfrecord_utils.int64_feature(height), 108 | 'image/width': tfrecord_utils.int64_feature(width), 109 | 'image/filename': tfrecord_utils.bytes_feature( 110 | data['filename'].encode('utf8')), 111 | 'image/source_id': tfrecord_utils.bytes_feature( 112 | data['filename'].encode('utf8')), 113 | 'image/key/sha256': tfrecord_utils.bytes_feature(key.encode('utf8')), 114 | 'image/encoded': tfrecord_utils.bytes_feature(encoded_jpg), 115 | 'image/format': tfrecord_utils.bytes_feature('jpeg'.encode('utf8')), 116 | 'image/object/bbox/xmin': tfrecord_utils.float_list_feature(xmin), 117 | 'image/object/bbox/xmax': tfrecord_utils.float_list_feature(xmax), 118 | 'image/object/bbox/ymin': tfrecord_utils.float_list_feature(ymin), 119 | 'image/object/bbox/ymax': tfrecord_utils.float_list_feature(ymax), 120 | 'image/object/class/text': tfrecord_utils.bytes_list_feature(classes_text), 121 | 'image/object/class/label': tfrecord_utils.int64_list_feature(classes), 122 | 'image/object/difficult': tfrecord_utils.int64_list_feature(difficult_obj), 123 | })) 124 | return example 125 | 126 | 127 | def create_tf_record(output_filename, 128 | label_map_dict, 129 | annotations_dir, 130 | image_dir, 131 | examples): 132 | """Creates a TFRecord file from examples. 133 | Args: 134 | output_filename: Path to where output file is saved. 135 | label_map_dict: The label map dictionary. 136 | annotations_dir: Directory where annotation files are stored. 137 | image_dir: Directory where image files are stored. 138 | examples: Examples to parse and save to tf record. 
139 | """ 140 | writer = tf.python_io.TFRecordWriter(output_filename) 141 | for idx, example in enumerate(examples): 142 | if idx % 100 == 0: 143 | print ('On image {} of {}'.format(idx, len(examples)), end='\r') 144 | path = os.path.join(annotations_dir, example + '.xml') 145 | print ("processing...", example, end='\r') 146 | if not os.path.exists(path): 147 | print ('Could not find {}, ignoring example.'.format(path)) 148 | continue 149 | with tf.gfile.GFile(path, 'r') as fid: 150 | #try: 151 | xml_str = fid.read() 152 | xml = etree.fromstring(xml_str) 153 | data = tfrecord_utils.recursive_parse_xml_to_dict(xml)['annotation'] 154 | tf_example = dict_to_tf_example(data, label_map_dict, image_dir) 155 | writer.write(tf_example.SerializeToString()) 156 | #except: 157 | # print ("Fail to open image: ", example) 158 | 159 | writer.close() 160 | 161 | # TODO: Add test for pet/PASCAL main files. 162 | def main(_): 163 | data_dir = FLAGS.data_dir 164 | mode = FLAGS.mode 165 | assert mode in ["train", "val"] 166 | print ("Generate data for model {}!".format(mode)) 167 | label_map_dict = tfrecord_utils.get_label_map_dict(FLAGS.label_map_path) 168 | 169 | image_dir = os.path.join(data_dir, 'JPEGImages') 170 | annotations_dir = os.path.join(data_dir, 'Annotations') 171 | 172 | # Test images are not included in the downloaded data set, so we shall perform 173 | # our own split. 174 | # random.seed(42) 175 | # random.shuffle(examples_list) 176 | # num_examples = len(examples_list) 177 | # num_train = int(num_examples) 178 | # train_examples = examples_list[:num_train] 179 | if not os.path.exists(FLAGS.output_dir): 180 | os.makedirs(FLAGS.output_dir) 181 | if mode == 'train': 182 | examples_path = os.path.join(data_dir, 'ImageSets/Main/train_2000.txt') 183 | examples_list = tfrecord_utils.read_examples_list(examples_path) 184 | print ('{} training examples.', len(examples_list)) 185 | train_output_path = os.path.join(FLAGS.output_dir, 'train_2000.record') 186 | create_tf_record(train_output_path, label_map_dict, annotations_dir, 187 | image_dir, examples_list) 188 | elif mode == 'val': 189 | examples_path = os.path.join(data_dir, 'ImageSets/Main/val_500.txt') 190 | examples_list = tfrecord_utils.read_examples_list(examples_path) 191 | print ('{} validation examples.', len(examples_list)) 192 | val_output_path = os.path.join(FLAGS.output_dir, 'val_500.record') 193 | create_tf_record(val_output_path, label_map_dict, annotations_dir, 194 | image_dir, examples_list) 195 | 196 | if __name__ == '__main__': 197 | tf.app.run() 198 | -------------------------------------------------------------------------------- /tfrecord/tfrecord_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Utility functions for creating TFRecord data sets.""" 17 | 18 | import tensorflow as tf 19 | from lxml import etree 20 | 21 | def int64_feature(value): 22 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 23 | 24 | 25 | def int64_list_feature(value): 26 | return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) 27 | 28 | 29 | def bytes_feature(value): 30 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 31 | 32 | 33 | def bytes_list_feature(value): 34 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) 35 | 36 | 37 | def float_list_feature(value): 38 | return tf.train.Feature(float_list=tf.train.FloatList(value=value)) 39 | 40 | 41 | def read_examples_list(path): 42 | """Read list of training or validation examples. 43 | The file is assumed to contain a single example per line where the first 44 | token in the line is an identifier that allows us to find the image and 45 | annotation xml for that example. 46 | For example, the line: 47 | xyz 3 48 | would allow us to find files xyz.jpg and xyz.xml (the 3 would be ignored). 49 | Args: 50 | path: absolute path to examples list file. 51 | Returns: 52 | list of example identifiers (strings). 53 | """ 54 | with tf.gfile.GFile(path) as fid: 55 | lines = fid.readlines() 56 | return [line.strip().split(' ')[0] for line in lines] 57 | 58 | 59 | def recursive_parse_xml_to_dict(xml): 60 | """Recursively parses XML contents to python dict. 61 | We assume that `object` tags are the only ones that can appear 62 | multiple times at the same level of a tree. 63 | Args: 64 | xml: xml tree obtained by parsing XML file contents using lxml.etree 65 | Returns: 66 | Python dictionary holding XML contents. 
67 | """ 68 | if not xml: 69 | return {xml.tag: xml.text} 70 | result = {} 71 | for child in xml: 72 | child_result = recursive_parse_xml_to_dict(child) 73 | if child.tag != 'object': 74 | result[child.tag] = child_result[child.tag] 75 | else: 76 | if child.tag not in result: 77 | result[child.tag] = [] 78 | result[child.tag].append(child_result[child.tag]) 79 | return {xml.tag: result} 80 | 81 | 82 | def get_label_map_dict(label_map_path): 83 | """ 84 | Read in dataset category name vs id mapping 85 | Args: 86 | xml file path which containing category name and ip information 87 | returns: 88 | Dict containing name to id mapping 89 | """ 90 | tree = etree.parse(open(label_map_path, "r")) 91 | name_id_mapping = {} 92 | for node in tree.xpath("category"): 93 | cate_name = node.findtext("name") 94 | cate_id = node.findtext("id") 95 | name_id_mapping[cate_name] = cate_id 96 | return name_id_mapping -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os, sys 3 | import numpy as np 4 | import tensorflow as tf 5 | import collections 6 | from pprint import pprint 7 | from tensorflow.contrib import learn 8 | 9 | from Detector.RetinaNet import RetinaNet 10 | from utils.bbox import draw_bboxes 11 | slim = tf.contrib.slim 12 | FLAGS = tf.app.flags.FLAGS 13 | 14 | #### Input pipeline 15 | tf.app.flags.DEFINE_string('backbone', "se-resnet50", 16 | """select RetinaNet backbone""") 17 | tf.app.flags.DEFINE_integer('input_size', 608, 18 | """Input size""") 19 | tf.app.flags.DEFINE_integer('batch_size', 8, 20 | """Train batch size""") 21 | tf.app.flags.DEFINE_float('learning_rate', 1e-3, 22 | """Learninig rate""") 23 | tf.app.flags.DEFINE_integer('num_input_threads', 2, 24 | """Number of readers for input data""") 25 | tf.app.flags.DEFINE_integer('num_classes', 20, 26 | """number of classes""") 27 | 28 | #### Train dataset 29 | tf.app.flags.DEFINE_string('train_path', '/root/DB/VOC/VOC2012/tfrecord/', 30 | """Base directory for training data""") 31 | 32 | ### Validation dataset (during training) 33 | tf.app.flags.DEFINE_string('valid_dataset','VOC', 34 | """Validation dataset name""") 35 | tf.app.flags.DEFINE_integer('valid_device', 0, 36 | """Device for validation""") 37 | tf.app.flags.DEFINE_integer('valid_batch_size', 8, 38 | """Validation batch size""") 39 | tf.app.flags.DEFINE_boolean('use_validation', True, 40 | """Whether use validation or not""") 41 | tf.app.flags.DEFINE_integer('valid_steps', 300, 42 | """Validation steps""") 43 | 44 | #### Output Path 45 | tf.app.flags.DEFINE_string('output', 'logs_se/new_momen1', 46 | """Directory for event logs and checkpoints""") 47 | #### Training config 48 | tf.app.flags.DEFINE_boolean('use_bn', True, 49 | """use batchNorm or GroupNorm""") 50 | tf.app.flags.DEFINE_boolean('bn_freeze', True, 51 | """Freeze batchNorm or not""") 52 | tf.app.flags.DEFINE_float('cls_thresh', 0.5, 53 | """thresh for class""") 54 | tf.app.flags.DEFINE_float('nms_thresh', 0.5, 55 | """thresh for nms""") 56 | tf.app.flags.DEFINE_integer('max_detect', 300, 57 | """num of max detect (using in nms)""") 58 | tf.app.flags.DEFINE_string('tune_from', '', 59 | """Path to pre-trained model checkpoint""") 60 | tf.app.flags.DEFINE_string('tune_scope', '', 61 | """Variable scope for training""") 62 | tf.app.flags.DEFINE_integer('max_num_steps', 2**21, 63 | """Number of optimization steps to run""") 64 | tf.app.flags.DEFINE_boolean('verbose', 
False, 65 | """Print log in tensorboard""") 66 | tf.app.flags.DEFINE_boolean('use_profile', False, 67 | """Whether use Tensorflow Profiling""") 68 | tf.app.flags.DEFINE_boolean('use_debug', False, 69 | """Whether use TFDBG or not""") 70 | tf.app.flags.DEFINE_integer('save_steps', 1000, 71 | """Save steps""") 72 | tf.app.flags.DEFINE_integer('summary_steps', 100, 73 | """Save steps""") 74 | tf.app.flags.DEFINE_float('moving_average_decay', 0.9999, 75 | """Moving Average dacay factor""") 76 | tf.app.flags.DEFINE_float('weight_decay', 1e-4, 77 | """weight dacay factor""") 78 | tf.app.flags.DEFINE_float('momentum', 0.9, 79 | """momentum factor""") 80 | 81 | 82 | mode = learn.ModeKeys.TRAIN 83 | 84 | TowerResult = collections.namedtuple('TowerResult', ('tvars', 85 | 'loc_loss', 'cls_loss', 86 | 'grads', 'extra_update_ops', 87 | 'optimizer')) 88 | 89 | ValidTowerResult = collections.namedtuple('ValidTowerResult', ('loc_loss', 'cls_loss')) 90 | 91 | def _get_session(monitored_sess): 92 | session = monitored_sess 93 | while type(session).__name__ != 'Session': 94 | session = session._sess 95 | return session 96 | 97 | 98 | def _get_init_pretrained(): 99 | """Return lambda for reading pretrained initial model""" 100 | 101 | if not FLAGS.tune_from: 102 | return None 103 | saver_reader = tf.train.Saver(tf.global_variables()) 104 | model_path = FLAGS.tune_from 105 | 106 | def init_fn(scaffold, sess): return saver_reader.restore(sess, model_path) 107 | return init_fn 108 | 109 | 110 | def _average_gradients(tower_grads): 111 | average_grads = [] 112 | for grads_and_vars in zip(*tower_grads): 113 | grads = tf.stack([g for g, _ in grads_and_vars]) 114 | grad = tf.reduce_mean(grads, 0) 115 | v = grads_and_vars[0][1] 116 | grad_and_var = (grad, v) 117 | average_grads.append(grad_and_var) 118 | return average_grads 119 | 120 | 121 | def allreduce_grads(all_grads, average=True): 122 | from tensorflow.contrib import nccl 123 | nr_tower = len(all_grads) 124 | if nr_tower == 1: 125 | return all_grads 126 | new_all_grads = [] # N x K 127 | for grads_and_vars in zip(*all_grads): 128 | grads = [g for g, _ in grads_and_vars] 129 | _vars = [v for _, v in grads_and_vars] 130 | summed = nccl.all_sum(grads) 131 | grads_for_devices = [] # K 132 | for g in summed: 133 | with tf.device(g.device): 134 | # tensorflow/benchmarks didn't average gradients 135 | if average: 136 | g = tf.multiply(g, 1.0 / nr_tower, name='allreduce_avg') 137 | grads_for_devices.append(g) 138 | new_all_grads.append(zip(grads_for_devices, _vars)) 139 | 140 | # transpose to K x N 141 | ret = list(zip(*new_all_grads)) 142 | return ret 143 | 144 | def _get_post_init_ops(): 145 | """ 146 | Copy values of variables on GPU 0 to other GPUs. 
147 | """ 148 | # literally all variables, because it's better to sync optimizer-internal variables as well 149 | all_vars = tf.global_variables() + tf.local_variables() 150 | var_by_name = dict([(v.name, v) for v in all_vars]) 151 | post_init_ops = [] 152 | for v in all_vars: 153 | if not v.name.find('tower') >= 0: 154 | continue 155 | if v.name.startswith('train_tower_0'): 156 | # no need for copy to tower0 157 | continue 158 | # in this trainer, the master name doesn't have the towerx/ prefix 159 | split_name = v.name.split('/') 160 | prefix = split_name[0] 161 | realname = '/'.join(split_name[1:]) 162 | if prefix in realname: 163 | # logger.warning("variable {} has its prefix {} appears multiple times in its name!".format(v.name, prefix)) 164 | pass 165 | copy_from = var_by_name.get(v.name.replace(prefix, 'train_tower_0')) 166 | if copy_from is not None: 167 | post_init_ops.append(v.assign(copy_from.read_value())) 168 | else: 169 | # logger.warning("Cannot find {} in the graph!".format(realname)) 170 | pass 171 | # logger.info("'sync_variables_from_main_tower' includes {} operations.".format(len(post_init_ops))) 172 | return tf.group(*post_init_ops, name='sync_variables_from_main_tower') 173 | 174 | def load_pytorch_weight(use_bn, use_se_block): 175 | from torch import load 176 | 177 | if use_bn: 178 | if use_se_block: 179 | pt_load = load("weights/se_resnet50-ce0d4300.pth") 180 | else: 181 | pt_load = load("weights/resnet50.pth") 182 | else: 183 | pt_load = load("weights/resnet50_groupnorm32.tar")['state_dict'] 184 | reordered_weights = {} 185 | pre_train_ops = [] 186 | 187 | for key, value in pt_load.items(): 188 | try: 189 | reordered_weights[key] = value.data.cpu().numpy() 190 | except: 191 | reordered_weights[key] = value.cpu().numpy() 192 | 193 | weight_names = list(reordered_weights) 194 | 195 | tf_variables = [v for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="train_tower_0/resnet_model")] 196 | 197 | if use_bn: # BatchNorm 198 | bn_variables = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="train_tower_0/resnet_model") if 199 | "moving_" in v.name] 200 | tf_counter = 0 201 | tf_bn_counter = 0 202 | 203 | for name in weight_names: 204 | if not use_se_block and "fc" in name: # last fc layer (resnet) 205 | continue 206 | if use_se_block and "last_linear" in name: # last fc layer(se-resnet) 207 | continue 208 | 209 | elif len(reordered_weights[name].shape) == 4: 210 | if "se_module" in name: #se_block 211 | pt_assign = np.squeeze(reordered_weights[name]) 212 | tf_assign = tf_variables[tf_counter] 213 | 214 | pre_train_ops.append(tf_assign.assign(np.transpose(pt_assign))) 215 | tf_counter += 1 216 | else: #conv 217 | weight_var = reordered_weights[name] 218 | tf_weight = tf_variables[tf_counter] 219 | 220 | pre_train_ops.append(tf_weight.assign(np.transpose(weight_var, (2, 3, 1, 0)))) 221 | tf_counter += 1 222 | 223 | elif "running_" in name: #bn mean, var 224 | pt_assign = reordered_weights[name] 225 | tf_assign = bn_variables[tf_bn_counter] 226 | 227 | pre_train_ops.append(tf_assign.assign(pt_assign)) 228 | tf_bn_counter += 1 229 | 230 | else: #bn gamma, beta 231 | pt_assign = reordered_weights[name] 232 | tf_assign = tf_variables[tf_counter] 233 | 234 | pre_train_ops.append(tf_assign.assign(pt_assign)) 235 | tf_counter += 1 236 | 237 | else: #GroupNorm 238 | conv_variables = [v for v in tf_variables if "conv" in v.name] 239 | #gamma_variables = [v for v in tf_variables if "gamma" in v.name] 240 | #beta_variables = [v for v in tf_variables 
if "beta" in v.name] 241 | 242 | tf_conv_counter = 0 243 | tf_gamma_counter = 0 244 | tf_beta_counter = 0 245 | 246 | for name in weight_names: 247 | if "fc" in name: 248 | continue 249 | 250 | elif len(reordered_weights[name].shape) == 4: #conv 251 | weight_var = reordered_weights[name] 252 | tf_weight = conv_variables[tf_conv_counter] 253 | 254 | pre_train_ops.append(tf_weight.assign(np.transpose(weight_var, (2, 3, 1, 0)))) 255 | tf_conv_counter += 1 256 | 257 | return tf.group(*pre_train_ops, name='load_resnet_pretrain') 258 | 259 | 260 | def _single_tower(net, tower_indx, input_feature, learning_rate=None, name='train'): 261 | _mode = mode if name is 'train' else learn.ModeKeys.INFER 262 | 263 | with tf.device('/gpu:%d' % tower_indx): 264 | with tf.variable_scope('{}_tower_{}'.format(name, tower_indx)) as scope: 265 | #optimizer = tf.train.AdamOptimizer(learning_rate) 266 | 267 | logits = net.get_logits(input_feature.image, _mode) 268 | 269 | loc_loss, cls_loss, tvars, extra_update_ops = net.get_loss(logits, [input_feature.loc, input_feature.cls]) 270 | 271 | # Freeze Batch Normalization 272 | if FLAGS.bn_freeze: 273 | tvars = [t for t in tvars if "batch_normalization" not in t.name] 274 | 275 | #tf.get_variable_scope().reuse_variables() 276 | total_loss = loc_loss + cls_loss 277 | 278 | # Add weight decay to the loss. 279 | l2_loss = FLAGS.weight_decay * tf.add_n( 280 | [tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tvars]) # if loss_filter_fn(v.name)]) 281 | total_loss += l2_loss 282 | 283 | if name is 'train': 284 | #optimizer = tf.train.AdamOptimizer(learning_rate) 285 | optimizer = tf.train.MomentumOptimizer(learning_rate, FLAGS.momentum) 286 | grads = optimizer.compute_gradients(total_loss, tvars, colocate_gradients_with_ops=True) 287 | else: 288 | optimizer, grads = None, None 289 | #tf.summary.image("input_image", input_feature.image) 290 | 291 | #if FLAGS.verbose: 292 | # for var in tf.trainable_variables(): 293 | # tf.summary.histogram(var.op.name, var) 294 | 295 | # TODO: Detection output visualize 296 | 297 | if name is 'valid': 298 | summary_images = [] 299 | for i in range(3): 300 | pred_boxes, _, _ = net.decode(logits[0][i], logits[1][i]) 301 | 302 | pred_boxes /= FLAGS.input_size 303 | pred_boxes = tf.clip_by_value(pred_boxes, 0.0, 1.0) 304 | 305 | pred_img = tf.image.draw_bounding_boxes(tf.expand_dims(input_feature.image[i], 0), 306 | tf.expand_dims(pred_boxes, 0)) 307 | summary_images.append(pred_img[0]) 308 | 309 | summary_images = tf.stack(summary_images) 310 | tf.summary.image("pred_img", summary_images) 311 | 312 | return TowerResult(tvars, loc_loss, cls_loss, grads, extra_update_ops, optimizer) 313 | 314 | 315 | def main(argv=None): 316 | 317 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 318 | available_gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',') 319 | num_gpus = len(available_gpus) 320 | print("num_gpus : ", num_gpus, available_gpus) 321 | 322 | with tf.Graph().as_default(): 323 | 324 | # Get Network class and Optimizer 325 | global_step = tf.train.get_or_create_global_step() 326 | 327 | # Learning rate decay 328 | boundaries = [60000, 80000] 329 | values = [FLAGS.learning_rate / pow(10, i) for i in range(3)] 330 | learning_rate = tf.train.piecewise_constant(global_step, boundaries, values) 331 | tf.summary.scalar('learning_rate', learning_rate) 332 | 333 | optimizers = [] 334 | net = RetinaNet(FLAGS.backbone) 335 | 336 | # Multi gpu training code (Define graph) 337 | tower_grads = [] 338 | tower_extra_update_ops = [] 339 | #tower_train_errs = [] 
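# Per-tower (per-GPU) losses collected below are averaged across towers further down, while the per-tower gradients are combined with NCCL all-reduce in allreduce_grads() before being applied on each device.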
340 | tower_loc_losses = [] 341 | tower_cls_losses = [] 342 | input_features = net.get_input(is_train=True, 343 | num_gpus=num_gpus) 344 | 345 | for gpu_indx in range(num_gpus): 346 | tower_output = _single_tower(net, gpu_indx, input_features[gpu_indx], learning_rate) 347 | tower_grads.append([x for x in tower_output.grads if x[0] is not None]) 348 | tower_extra_update_ops.append(tower_output.extra_update_ops) 349 | #tower_train_errs.append(tower_output.error) 350 | tower_loc_losses.append(tower_output.loc_loss) 351 | tower_cls_losses.append(tower_output.cls_loss) 352 | optimizers.append(tower_output.optimizer) 353 | 354 | if FLAGS.use_validation: 355 | valid_input_feature = net.get_input(is_train=False, num_gpus=1) 356 | 357 | # single gpu validation 358 | valid_tower_output = _single_tower(net, FLAGS.valid_device, valid_input_feature[0], 359 | name='valid') 360 | tf.summary.scalar("valid_loc_losses", valid_tower_output.loc_loss) 361 | tf.summary.scalar("valid_cls_losses", valid_tower_output.cls_loss) 362 | 363 | 364 | # Merge results 365 | loc_losses = tf.reduce_mean(tower_loc_losses) 366 | cls_losses = tf.reduce_mean(tower_cls_losses) 367 | grads = allreduce_grads(tower_grads) 368 | train_ops = [] 369 | 370 | tf.summary.scalar("train_loc_losses", loc_losses) 371 | tf.summary.scalar("train_cls_losses", cls_losses) 372 | 373 | # Track the moving averages of all trainable variables. 374 | variable_averages = tf.train.ExponentialMovingAverage(FLAGS.moving_average_decay, global_step) 375 | variables_averages_op = variable_averages.apply(tf.trainable_variables()) 376 | train_ops.append(variables_averages_op) 377 | 378 | # Apply the gradients 379 | for idx, grad_and_vars in enumerate(grads): 380 | with tf.name_scope('apply_gradients'), tf.device(tf.DeviceSpec(device_type="GPU", device_index=idx)): 381 | # apply_gradients may create variables. 
Make them LOCAL_VARIABLES 382 | from tensorpack.graph_builder.utils import override_to_local_variable 383 | with override_to_local_variable(enable=idx > 0): 384 | train_ops.append(optimizers[idx].apply_gradients(grad_and_vars, name='apply_grad_{}'.format(idx), 385 | global_step=(global_step if idx==0 else None))) 386 | 387 | with tf.control_dependencies(tower_extra_update_ops[-1]): 388 | train_op = tf.group(*train_ops, name='train_op') 389 | 390 | # Summary 391 | summaries = tf.get_collection(tf.GraphKeys.SUMMARIES) 392 | summary_op = tf.summary.merge([s for s in summaries if 'valid_' not in s.name]) 393 | 394 | if FLAGS.use_validation: 395 | valid_summary_op = tf.summary.merge([s for s in summaries if 'valid_' in s.name]) 396 | valid_summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.output, 397 | FLAGS.valid_dataset)) 398 | ''' 399 | # Print network structure 400 | if not os.path.exists(FLAGS.output): 401 | os.makedirs(os.path.join(FLAGS.output,'best_models'), exist_ok=True) 402 | param_stats = tf.profiler.profile(tf.get_default_graph()) 403 | sys.stdout.write('total_params: %d\n' % param_stats.total_parameters) 404 | 405 | # Print configuration 406 | pprint(FLAGS.flag_values_dict()) 407 | 408 | train_info = open(os.path.join(FLAGS.output, 'train_info.txt'),'w') 409 | train_info.write('total_params: %d\n' % param_stats.total_parameters) 410 | train_info.write(str(FLAGS.flag_values_dict())) 411 | train_info.close() 412 | ''' 413 | 414 | # Define config, init_op, scaffold 415 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 416 | init_op = tf.group(tf.global_variables_initializer(), 417 | tf.local_variables_initializer()) 418 | pretrain_op = load_pytorch_weight(FLAGS.use_bn, net.use_se_block) 419 | sync_op = _get_post_init_ops() 420 | 421 | # only save global variables 422 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=10) 423 | scaffold = tf.train.Scaffold(saver=saver, 424 | init_op=init_op, 425 | summary_op=summary_op, 426 | init_fn=_get_init_pretrained()) 427 | valid_saver = tf.train.Saver(tf.global_variables(), max_to_keep=10) 428 | best_valid_loss = 1e9 429 | 430 | # Define several hooks 431 | hooks = [] 432 | if FLAGS.use_profile: 433 | profiler_hook = tf.train.ProfilerHook(save_steps=FLAGS.save_steps, 434 | output_dir=FLAGS.output) 435 | hooks.append(profiler_hook) 436 | 437 | if FLAGS.use_debug: 438 | from tensorflow.python import debug as tf_debug 439 | # CLI Debugger 440 | # cli_debug_hook = tf_debug.LocalCLIDebugHook() 441 | # hooks.append(cli_debug_hook) 442 | 443 | # Tensorboard Debugger 444 | tfb_debug_hook = tf_debug.TensorBoardDebugHook("127.0.0.1:9900") 445 | #tfb_debug_hook = tf_debug.TensorBoardDebugHook("a476cc765f91:6007") 446 | hooks.append(tfb_debug_hook) 447 | hooks = None if len(hooks)==0 else hooks 448 | 449 | 450 | print("---------- session start") 451 | with tf.train.MonitoredTrainingSession(checkpoint_dir=FLAGS.output, 452 | scaffold=scaffold, 453 | hooks=hooks, 454 | config=session_config, 455 | save_checkpoint_steps=FLAGS.save_steps, 456 | save_checkpoint_secs=None, 457 | save_summaries_steps=FLAGS.summary_steps, 458 | save_summaries_secs=None,) as sess: 459 | print("---------- open MonitoredTrainingSession") 460 | #sess.graph._unsafe_unfinalize() 461 | #net.load_pytorch_weight(sess) 462 | _step = sess.run(global_step) 463 | 464 | print("---------- run pretrain op") 465 | sess.run(pretrain_op) 466 | 467 | print("---------- run sync op") 468 | sess.run(sync_op) 469 | 470 | while _step < 
FLAGS.max_num_steps: 471 | if sess.should_stop(): 472 | break 473 | 474 | # Training 475 | [step_loc_loss, step_cls_loss,_ ,_step] = sess.run( 476 | [loc_losses, cls_losses, train_op, global_step]) 477 | 478 | print('STEP : %d\tTRAIN_TOTAL_LOSS : %.8f\tTRAIN_LOC_LOSS : %.8f\tTRAIN_CLS_LOSS : %.5f' 479 | % (_step, step_loc_loss + step_cls_loss, step_loc_loss, step_cls_loss), end='\r') 480 | 481 | #assert not np.isnan(loc_losses + cls_losses), 'Model diverged with loss = NaN' 482 | 483 | if _step % 50 == 0: 484 | print('STEP : %d\tTRAIN_TOTAL_LOSS : %.8f\tTRAIN_LOC_LOSS : %.8f\tTRAIN_CLS_LOSS : %.5f' 485 | % (_step, step_loc_loss + step_cls_loss, step_loc_loss, step_cls_loss)) 486 | 487 | 488 | # Periodic synchronization 489 | if _step % 1000 == 0: 490 | sess.run(sync_op) 491 | 492 | # Print Error (train/valid) 493 | if FLAGS.use_validation and _step > 0 and \ 494 | _step % FLAGS.valid_steps == 0: 495 | print('STEP : %d\tTRAIN_TOTAL_LOSS : %.8f\tTRAIN_LOC_LOSS : %.8f\tTRAIN_CLS_LOSS : %.5f' 496 | % (_step, step_loc_loss + step_cls_loss, step_loc_loss, step_cls_loss)) 497 | 498 | # Train Err / TODO: more search for Detection error 499 | ''' 500 | cls_errors, loc_errors = [], [] 501 | for gpu_indx in range(FLAGS.num_gpus): 502 | label_error, sequence_error = sess.run(tower_train_errs[gpu_indx]) 503 | label_errors.append(label_error) 504 | sequence_errors.append(sequence_error) 505 | train_label_error = np.mean(label_errors) 506 | train_sequence_error = np.mean(sequence_errors) 507 | ''' 508 | # Validation Err 509 | [valid_step_loc_loss, valid_step_cls_loss, valid_summary] = sess.run([valid_tower_output.loc_loss, 510 | valid_tower_output.cls_loss, 511 | valid_summary_op]) 512 | valid_step_loss = valid_step_loc_loss + valid_step_cls_loss 513 | if valid_step_loss < best_valid_loss: 514 | best_valid_loss = valid_step_loss 515 | best_model_dir = os.path.join(FLAGS.output, 'best_models') 516 | valid_saver.save(_get_session(sess), os.path.join(best_model_dir,'model'), global_step=_step) 517 | if valid_summary_writer is not None: valid_summary_writer.add_summary(valid_summary, _step) 518 | #print('STEP : %d\tTRAIN_LOSS : %f\tVALID_LOSS : %f' % (_step, step_loss, valid_step_loss)) 519 | #print('TRAIN_LABEL_ERR : %f\tTRAIN_SEQ_ERR : %f' % (label_error, sequence_error)) 520 | #print('VALID_LABEL_ERR : %f\tVALID_SEQ_ERR : %f' % (valid_label_error, valid_sequence_error)) 521 | print('STEP : %d\tVALID_TOTAL_LOSS : %.8f\tVALID_LOC_LOSS : %.8f\tVALID_CLS_LOSS : %.5f' 522 | % (_step, valid_step_loss, valid_step_loc_loss, valid_step_cls_loss)) 523 | print('='*70) 524 | 525 | 526 | if __name__ == '__main__': 527 | tf.app.run() 528 | -------------------------------------------------------------------------------- /utils/bbox.py: -------------------------------------------------------------------------------- 1 | '''Some helper functions for PyTorch.''' 2 | import cv2 3 | import numpy as np 4 | import tensorflow as tf 5 | from PIL import Image, ImageDraw, ImageFont 6 | 7 | def get_mean_and_std(dataset, max_load=10000): 8 | """Compute the mean and std value of dataset.""" 9 | # dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2) 10 | mean = torch.zeros(3) 11 | std = torch.zeros(3) 12 | print('==> Computing mean and std..') 13 | N = min(max_load, len(dataset)) 14 | for i in range(N): 15 | print(i) 16 | im,_,_ = dataset.load(1) 17 | for j in range(3): 18 | mean[j] += im[:,j,:,:].mean() 19 | std[j] += im[:,j,:,:].std() 20 | mean.div_(N) 21 | std.div_(N) 22 | return mean, std 23 
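# get_mean_and_std above leans on torch (torch.zeros, Tensor.div_) even though
# this module never imports torch (only cv2, numpy, tensorflow and PIL are
# imported at the top), and it assumes a dataset.load() interface. A NumPy-only
# sketch of the same per-channel statistics, for a hypothetical iterable of
# HWC uint8 images:
import numpy as np

def channel_mean_and_std(images, max_load=10000):
    """Average per-channel mean/std over up to max_load images."""
    means, stds = [], []
    for i, im in enumerate(images):
        if i >= max_load:
            break
        pixels = im.reshape(-1, im.shape[-1]).astype(np.float32) / 255.0
        means.append(pixels.mean(axis=0))
        stds.append(pixels.std(axis=0))
    return np.mean(means, axis=0), np.mean(stds, axis=0)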
| 24 | def change_box_order(boxes, order): 25 | '''Change box order between (xmin,ymin,xmax,ymax) and (xcenter,ycenter,width,height). 26 | Args: 27 | boxes: (tensor) bounding boxes, sized [num anchors, 4]. 28 | Returns: 29 | (tensor) converted bounding boxes, sized [num anchor, 4]. 30 | ''' 31 | 32 | if order is 'yxyx2yxhw': 33 | y_min, x_min, y_max, x_max = tf.split(value=boxes, num_or_size_splits=4, axis=1) 34 | x = (x_min + x_max) / 2 35 | y = (y_min + y_max) / 2 36 | w = x_max - x_min 37 | h = y_max - y_min 38 | new_boxes = tf.concat([y,x,h,w], axis=1) 39 | 40 | elif order is 'yxhw2yxyx': 41 | y, x, h, w = tf.split(value=boxes, num_or_size_splits=4, axis=1) 42 | x_min = x - w/2 43 | x_max = x + w/2 44 | y_min = y - h/2 45 | y_max = y + h/2 46 | new_boxes = tf.concat([y_min, x_min, y_max, x_max], axis=1) 47 | 48 | elif order is 'xyxy2yxyx': 49 | x_min, y_min, x_max, y_max = tf.split(value=boxes, num_or_size_splits=4, axis=1) 50 | new_boxes = tf.concat([y_min, x_min, y_max, x_max], axis=1) 51 | 52 | elif order is 'yxyx2xyxy': 53 | y_min, x_min, y_max, x_max = tf.split(value=boxes, num_or_size_splits=4, axis=1) 54 | new_boxes = tf.concat([x_min, y_min, x_max, y_max], axis=1) 55 | 56 | return new_boxes 57 | 58 | 59 | def box_iou(box1, box2, order='xyxy'): 60 | '''Compute the intersection over union of two set of boxes. 61 | The default box order is (xmin, ymin, xmax, ymax). 62 | Args: 63 | box1: (tensor) bounding boxes, sized [N,4]. 64 | box2: (tensor) bounding boxes, sized [M,4]. 65 | order: (str) box order, either 'xyxy' or 'xywh'. 66 | Return: 67 | (tensor) iou, sized [N,M]. 68 | Reference: 69 | https://github.com/chainer/chainercv/blob/master/chainercv/utils/bbox/bbox_iou.py 70 | ''' 71 | box1 = change_box_order(box1, "xywh2xyxy") 72 | 73 | lt = tf.reduce_max([box1[:, :2], box2[:, :2]]) # [N,M,2] 74 | rb = tf.reduce_max([box1[:, 2:], box2[:, 2:]]) # [N,M,2] 75 | print(lt, rb) 76 | 77 | wh = tf.clip_by_value(rb-lt+1, 0, float('nan')) 78 | print(wh) 79 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 80 | 81 | area1 = (box1[:, 2]-box1[:, 0]+1) * (box1[:, 3]-box1[:, 1]+1) # [N,] 82 | area2 = (box2[:, 2]-box2[:, 0]+1) * (box2[:, 3]-box2[:, 1]+1) # [M,] 83 | iou = inter / (area1[:, None] + area2 - inter) 84 | return iou 85 | 86 | 87 | def draw_bboxes(image, boxes, labels): 88 | boxes = np.array(boxes, dtype=np.int32) 89 | for box, label in zip(boxes, labels): 90 | ymin, xmin, ymax, xmax = box 91 | image = cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0,255,0), 3) 92 | #image = cv2.putText(image, str(label), (box[0]+15, box[1]), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255), 1) 93 | return image 94 | 95 | def draw_boxes(img, bboxes, classes, scores): 96 | if len(bboxes) == 0: 97 | return img 98 | 99 | #height, width, _ = img.shape 100 | width, height = img.size 101 | #image = Image.fromarray(img) 102 | image = img 103 | font = ImageFont.truetype( 104 | font='/root/FiraMono-Medium.otf', 105 | size=np.floor(3e-2 * image.size[1] + 0.4).astype('int32')) 106 | 107 | thickness = (image.size[0] + image.size[1]) // 300 108 | draw = ImageDraw.Draw(image) 109 | 110 | for box, category, score in zip(bboxes, classes, scores): 111 | y1, x1, y2, x2 = [int(i) for i in box] 112 | 113 | p1 = (x1, y1) 114 | p2 = (x2, y2) 115 | 116 | label = '{} {:.1f}% '.format(category, score * 100) 117 | label_size = draw.textsize(label) 118 | text_origin = np.array([p1[0], p1[1] - label_size[1]]) 119 | 120 | color = np.array([0, 255, 0]) 121 | for i in range(thickness): 122 | draw.rectangle( 123 | [p1[0] + i, p1[1] + i, 
p2[0] - i, p2[1] - i], 124 | outline=tuple(color)) 125 | 126 | draw.rectangle( 127 | [tuple(text_origin), 128 | tuple(text_origin + label_size)], 129 | fill=tuple(color)) 130 | 131 | draw.text( 132 | tuple(text_origin), 133 | label, fill=(0, 0, 0), 134 | font=font) 135 | 136 | del draw 137 | return np.array(image) 138 | 139 | def area(boxlist, scope=None): 140 | """Computes area of boxes. 141 | Args: 142 | boxlist: BoxList holding N boxes following order [ymin, xmin, ymax, xmax] 143 | scope: name scope. 144 | Returns: 145 | a tensor with shape [N] representing box areas. 146 | """ 147 | with tf.name_scope(scope, 'Area'): 148 | y_min, x_min, y_max, x_max = tf.split( 149 | value=boxlist, num_or_size_splits=4, axis=1) 150 | return tf.squeeze((y_max - y_min) * (x_max - x_min), [1]) 151 | 152 | def intersection(boxlist1, boxlist2, scope=None): 153 | """Compute pairwise intersection areas between boxes. 154 | Args: 155 | boxlist1: BoxList holding N boxes 156 | boxlist2: BoxList holding M boxes 157 | scope: name scope. 158 | Returns: 159 | a tensor with shape [N, M] representing pairwise intersections 160 | """ 161 | 162 | with tf.name_scope(scope, 'Intersection'): 163 | y_min1, x_min1, y_max1, x_max1 = tf.split( 164 | value=boxlist1, num_or_size_splits=4, axis=1) 165 | 166 | y_min2, x_min2, y_max2, x_max2 = tf.split( 167 | value=boxlist2, num_or_size_splits=4, axis=1) 168 | 169 | all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2)) 170 | all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2)) 171 | intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin) 172 | all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2)) 173 | all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2)) 174 | intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin) 175 | return intersect_heights * intersect_widths 176 | 177 | def iou(boxlist1, boxlist2, scope=None): 178 | """Computes pairwise intersection-over-union between box collections. 179 | Args: 180 | boxlist1: BoxList holding N boxes 181 | boxlist2: BoxList holding M boxes 182 | scope: name scope. 183 | Returns: 184 | a tensor with shape [N, M] representing pairwise iou scores. 185 | """ 186 | boxlist1 = change_box_order(boxlist1, "yxhw2yxyx") 187 | 188 | with tf.name_scope(scope, 'IOU'): 189 | intersections = intersection(boxlist1, boxlist2) 190 | areas1 = area(boxlist1) 191 | areas2 = area(boxlist2) 192 | unions = ( 193 | tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections) 194 | return tf.where( 195 | tf.equal(intersections, 0.0), 196 | tf.zeros_like(intersections), tf.truediv(intersections, unions)) 197 | 198 | def bboxes_jaccard(bbox_ref, bboxes, name=None): 199 | """Compute jaccard score between a reference box and a collection 200 | of bounding boxes. 201 | Args: 202 | bbox_ref: (N, 4) or (4,) Tensor with reference bounding box(es). 203 | bboxes: (N, 4) Tensor, collection of bounding boxes. 204 | Return: 205 | (N,) Tensor with Jaccard scores. 206 | """ 207 | with tf.name_scope(name, 'bboxes_jaccard'): 208 | # Should be more efficient to first transpose. 209 | bboxes = tf.transpose(bboxes) 210 | bbox_ref = tf.transpose(bbox_ref) 211 | # Intersection bbox and volume. 212 | int_ymin = tf.maximum(bboxes[0], bbox_ref[0]) 213 | int_xmin = tf.maximum(bboxes[1], bbox_ref[1]) 214 | int_ymax = tf.minimum(bboxes[2], bbox_ref[2]) 215 | int_xmax = tf.minimum(bboxes[3], bbox_ref[3]) 216 | h = tf.maximum(int_ymax - int_ymin, 0.) 217 | w = tf.maximum(int_xmax - int_xmin, 0.) 218 | # Volumes. 
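# Usage note for the pairwise-overlap helpers above: iou() first converts its
# first argument from (ycenter, xcenter, h, w) to corner form, so anchor boxes
# can be passed in center form while ground-truth boxes stay as corners.
# box_iou further up appears to request a "xywh2xyxy" order that
# change_box_order never handles (and change_box_order compares order strings
# with `is`, where `==` would be the robust check), so iou(), intersection()
# and area() are the dependable path for pairwise overlaps. A minimal check:
anchors_yxhw = tf.constant([[0.5, 0.5, 1.0, 1.0]])  # one anchor, center form
gt_yxyx = tf.constant([[0.0, 0.0, 0.5, 0.5]])       # one ground-truth box, corner form
pairwise = iou(anchors_yxhw, gt_yxyx)               # shape [1, 1]; evaluates to 0.25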
219 | # Volumes. 220 | inter_vol = h * w 221 | bboxes_vol = (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1]) 222 | #jaccard = tfe_math.safe_divide(inter_vol, union_vol, 'jaccard') 223 | #return jaccard 224 | return tf.where( 225 | tf.greater(bboxes_vol, 0), 226 | tf.divide(inter_vol, bboxes_vol), 227 | tf.zeros_like(inter_vol), 228 | name='jaccard') 229 | ''' 230 | _, term_width = os.popen('stty size', 'r').read().split() 231 | term_width = int(term_width) 232 | TOTAL_BAR_LENGTH = 86. 233 | last_time = time.time() 234 | begin_time = last_time 235 | def progress_bar(current, total, msg=None): 236 | global last_time, begin_time 237 | if current == 0: 238 | begin_time = time.time() # Reset for new bar. 239 | 240 | cur_len = int(TOTAL_BAR_LENGTH*current/total) 241 | rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1 242 | 243 | sys.stdout.write(' [') 244 | for i in range(cur_len): 245 | sys.stdout.write('=') 246 | sys.stdout.write('>') 247 | for i in range(rest_len): 248 | sys.stdout.write('.') 249 | sys.stdout.write(']') 250 | 251 | cur_time = time.time() 252 | step_time = cur_time - last_time 253 | last_time = cur_time 254 | tot_time = cur_time - begin_time 255 | 256 | L = [] 257 | L.append(' Step: %s' % format_time(step_time)) 258 | L.append(' | Tot: %s' % format_time(tot_time)) 259 | if msg: 260 | L.append(' | ' + msg) 261 | 262 | msg = ''.join(L) 263 | sys.stdout.write(msg) 264 | for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3): 265 | sys.stdout.write(' ') 266 | 267 | # Go back to the center of the bar. 268 | for i in range(term_width-int(TOTAL_BAR_LENGTH/2)): 269 | sys.stdout.write('\b') 270 | sys.stdout.write(' %d/%d ' % (current+1, total)) 271 | 272 | if current < total-1: 273 | sys.stdout.write('\r') 274 | else: 275 | sys.stdout.write('\n') 276 | sys.stdout.flush() 277 | 278 | def format_time(seconds): 279 | days = int(seconds / 3600/24) 280 | seconds = seconds - days*3600*24 281 | hours = int(seconds / 3600) 282 | seconds = seconds - hours*3600 283 | minutes = int(seconds / 60) 284 | seconds = seconds - minutes*60 285 | secondsf = int(seconds) 286 | seconds = seconds - secondsf 287 | millis = int(seconds*1000) 288 | 289 | f = '' 290 | i = 1 291 | if days > 0: 292 | f += str(days) + 'D' 293 | i += 1 294 | if hours > 0 and i <= 2: 295 | f += str(hours) + 'h' 296 | i += 1 297 | if minutes > 0 and i <= 2: 298 | f += str(minutes) + 'm' 299 | i += 1 300 | if secondsf > 0 and i <= 2: 301 | f += str(secondsf) + 's' 302 | i += 1 303 | if millis > 0 and i <= 2: 304 | f += str(millis) + 'ms' 305 | i += 1 306 | if f == '': 307 | f = '0ms' 308 | return f 309 | ''' 310 | -------------------------------------------------------------------------------- /utils/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Preprocess images and bounding boxes for detection. 17 | We perform two sets of operations in preprocessing stage: 18 | (a) operations that are applied to both training and testing data, 19 | (b) operations that are applied only to training data for the purpose of 20 | data augmentation. 21 | A preprocessing function receives a set of inputs, 22 | e.g. an image and bounding boxes, 23 | performs an operation on them, and returns them. 24 | Some examples are: randomly cropping the image, randomly mirroring the image, 25 | randomly changing the brightness, contrast, hue and 26 | randomly jittering the bounding boxes. 27 | The preprocess function receives a tensor_dict which is a dictionary that maps 28 | different field names to their tensors. For example, 29 | tensor_dict[fields.InputDataFields.image] holds the image tensor. 30 | The image is a rank 4 tensor: [1, height, width, channels] with 31 | dtype=tf.float32. The groundtruth_boxes is a rank 2 tensor: [N, 4] where 32 | in each row there is a box with [ymin xmin ymax xmax]. 33 | Boxes are in normalized coordinates meaning 34 | their coordinate values range in [0, 1] 35 | Important Note: In tensor_dict, images is a rank 4 tensor, but preprocessing 36 | functions receive a rank 3 tensor for processing the image. Thus, inside the 37 | preprocess function we squeeze the image to become a rank 3 tensor and then 38 | we pass it to the functions. At the end of the preprocess we expand the image 39 | back to rank 4. 40 | """ 41 | 42 | import tensorflow as tf 43 | 44 | def tf_summary_image(image, boxes, name='image'): 45 | """Add image with bounding boxes to summary. 46 | """ 47 | image = tf.expand_dims(image, 0) 48 | boxes = tf.expand_dims(boxes, 0) 49 | image_with_box = tf.image.draw_bounding_boxes(image, boxes) 50 | tf.summary.image(name, image_with_box) 51 | 52 | def normalize_image(image, mean=(0.485, 0.456, 0.406), var=(0.229, 0.224, 0.225)): 53 | """Normalizes pixel values in the image. 54 | Moves the pixel values from the current [original_minval, original_maxval] 55 | range to a the [target_minval, target_maxval] range. 56 | Args: 57 | image: rank 3 float32 tensor containing 1 58 | image -> [height, width, channels]. 59 | Returns: 60 | image: image which is the same shape as input image. 61 | """ 62 | with tf.name_scope('NormalizeImage', values=[image]): 63 | image = tf.to_float(image) 64 | image /= 255.0 65 | 66 | image -= mean 67 | image /= var 68 | 69 | return image 70 | 71 | 72 | def resize_image_and_boxes(image, boxes, input_size, 73 | method=tf.image.ResizeMethod.BILINEAR): 74 | with tf.name_scope('ResizeImage', values=[image, input_size, method]): 75 | image_resize = tf.image.resize_images(image, [input_size, input_size], method=method) 76 | boxes_resize = boxes * input_size 77 | 78 | return image_resize, boxes_resize 79 | 80 | 81 | def flip_boxes_horizontally(boxes): 82 | """Left-right flip the boxes. 83 | Args: 84 | boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. 85 | Boxes are in normalized form meaning their coordinates vary 86 | between [0, 1]. 87 | Each row is in the form of [ymin, xmin, ymax, xmax]. 88 | Returns: 89 | Horizontally flipped boxes. 90 | """ 91 | # Flip boxes horizontally. 
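# A numeric note on the helpers above: normalize_image scales pixels to [0, 1]
# and then applies the usual ImageNet per-channel statistics (its docstring
# still describes a minval/maxval remapping, but the body is mean/std
# standardization). For a red-channel value of 128:
#   128 / 255.0             -> about 0.502
#   (0.502 - 0.485) / 0.229 -> about 0.074
# resize_image_and_boxes then multiplies boxes by input_size, which presumes
# the incoming boxes are normalized to [0, 1], so the result is in absolute
# pixel coordinates of the resized image.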
92 | ymin, xmin, ymax, xmax = tf.split(value=boxes, num_or_size_splits=4, axis=1) 93 | flipped_xmin = tf.subtract(1.0, xmax) 94 | flipped_xmax = tf.subtract(1.0, xmin) 95 | flipped_boxes = tf.concat([ymin, flipped_xmin, ymax, flipped_xmax], 1) 96 | return flipped_boxes 97 | 98 | 99 | def flip_boxes_vertically(boxes): 100 | """Up-down flip the boxes 101 | Args: 102 | boxes: rank 2 float32 tensor containing bounding boxes -> [N, 4]. 103 | Boxes are in normalized form meaning their coordinates vary 104 | between [0, 1] 105 | Each row is in the form of [ymin, xmin, ymax, xmax] 106 | Returns: 107 | Vertically flipped boxes 108 | """ 109 | # Flip boxes vertically 110 | ymin, xmin, ymax, xmax = tf.split(value=boxes, num_or_size_splits=4, axis=1) 111 | flipped_ymin = tf.subtract(1.0, ymax) 112 | flipped_ymax = tf.subtract(1.0, ymin) 113 | flipped_boxes = tf.concat([flipped_ymin, xmin, flipped_ymax, xmax], axis=1) 114 | return flipped_boxes 115 | 116 | 117 | def random_horizontal_flip(image, boxes, seed=None): 118 | """Randomly decides whether to horizontally mirror the image and detections or not. 119 | The probability of flipping the image is 50%. 120 | Args: 121 | image: rank 3 float32 tensor with shape [height, width, channels]. 122 | boxes: (optional) rank 2 float32 tensor with shape [N, 4] 123 | containing the bounding boxes. 124 | Boxes are in normalized form meaning their coordinates vary 125 | between [0, 1]. 126 | Each row is in the form of [ymin, xmin, ymax, xmax]. 127 | seed: random seed 128 | Returns: 129 | image: image which is the same shape as input image. 130 | If boxes, masks, keypoints, and keypoint_flip_permutation is not None, 131 | the function also returns the following tensors. 132 | boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. 133 | Boxes are in normalized form meaning their coordinates vary 134 | between [0, 1]. 135 | Raises: 136 | ValueError: if keypoints are provided but keypoint_flip_permutation is not. 137 | """ 138 | def _flip_image(image): 139 | # flip image 140 | image_flipped = tf.image.flip_left_right(image) 141 | return image_flipped 142 | 143 | with tf.name_scope('RandomHorizontalFlip', values=[image, boxes]): 144 | result = [] 145 | # random variable defining whether to do flip or not 146 | do_a_flip_random = tf.random_uniform([], seed=seed) 147 | # flip only if there are bounding boxes in image! 148 | do_a_flip_random = tf.logical_and( 149 | tf.greater(tf.size(boxes), 0), tf.greater(do_a_flip_random, 0.5)) 150 | 151 | # flip image 152 | image = tf.cond(do_a_flip_random, lambda: _flip_image(image), lambda: image) 153 | result.append(image) 154 | 155 | # flip boxes 156 | if boxes is not None: 157 | boxes = tf.cond( 158 | do_a_flip_random, lambda: flip_boxes_horizontally(boxes), lambda: boxes) 159 | result.append(boxes) 160 | 161 | return tuple(result) 162 | 163 | 164 | def random_vertical_flip(image, boxes, seed=None): 165 | """Randomly decides whether to vertically mirror the image and detections or not. 166 | The probability of flipping the image is 50%. 167 | Args: 168 | image: rank 3 float32 tensor with shape [height, width, channels]. 169 | boxes: (optional) rank 2 float32 tensor with shape [N, 4] 170 | containing the bounding boxes. 171 | Boxes are in normalized form meaning their coordinates vary 172 | between [0, 1]. 173 | Each row is in the form of [ymin, xmin, ymax, xmax]. 174 | seed: random seed 175 | Returns: 176 | image: image which is the same shape as input image. 
177 | If boxes, masks, keypoints, and keypoint_flip_permutation is not None, 178 | the function also returns the following tensors. 179 | boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. 180 | Boxes are in normalized form meaning their coordinates vary 181 | between [0, 1]. 182 | Raises: 183 | ValueError: if keypoints are provided but keypoint_flip_permutation is not. 184 | """ 185 | def _flip_image(image): 186 | # flip image 187 | image_flipped = tf.image.flip_up_down(image) 188 | return image_flipped 189 | 190 | with tf.name_scope('RandomVerticalFlip', values=[image, boxes]): 191 | result = [] 192 | # random variable defining whether to do flip or not 193 | do_a_flip_random = tf.random_uniform([], seed=seed) 194 | # flip only if there are bounding boxes in image! 195 | do_a_flip_random = tf.logical_and( 196 | tf.greater(tf.size(boxes), 0), tf.greater(do_a_flip_random, 0.5)) 197 | 198 | # flip image 199 | image = tf.cond(do_a_flip_random, lambda: _flip_image(image), lambda: image) 200 | result.append(image) 201 | 202 | # flip boxes 203 | if boxes is not None: 204 | boxes = tf.cond( 205 | do_a_flip_random, lambda: flip_boxes_vertically(boxes), lambda: boxes) 206 | result.append(boxes) 207 | 208 | return tuple(result) 209 | 210 | def random_pixel_value_scale(image, minval=0.9, maxval=1.1, seed=None): 211 | """Scales each value in the pixels of the image. 212 | This function scales each pixel independent of the other ones. 213 | For each value in image tensor, draws a random number between 214 | minval and maxval and multiples the values with them. 215 | Args: 216 | image: rank 3 float32 tensor contains 1 image -> [height, width, channels] 217 | with pixel values varying between [0, 1]. 218 | minval: lower ratio of scaling pixel values. 219 | maxval: upper ratio of scaling pixel values. 220 | seed: random seed. 221 | Returns: 222 | image: image which is the same shape as input image. 223 | """ 224 | with tf.name_scope('RandomPixelValueScale', values=[image]): 225 | color_coef = tf.random_uniform( 226 | tf.shape(image), 227 | minval=minval, 228 | maxval=maxval, 229 | dtype=tf.float32, 230 | seed=seed) 231 | image = tf.multiply(image, color_coef) 232 | image = tf.clip_by_value(image, 0.0, 1.0) 233 | 234 | return image 235 | 236 | def random_image_scale(image, 237 | masks=None, 238 | min_scale_ratio=0.5, 239 | max_scale_ratio=2.0, 240 | seed=None): 241 | """Scales the image size. 242 | Args: 243 | image: rank 3 float32 tensor contains 1 image -> [height, width, channels]. 244 | masks: (optional) rank 3 float32 tensor containing masks with 245 | size [height, width, num_masks]. The value is set to None if there are no 246 | masks. 247 | min_scale_ratio: minimum scaling ratio. 248 | max_scale_ratio: maximum scaling ratio. 249 | seed: random seed. 250 | Returns: 251 | image: image which is the same rank as input image. 252 | masks: If masks is not none, resized masks which are the same rank as input 253 | masks will be returned. 
254 | """ 255 | with tf.name_scope('RandomImageScale', values=[image]): 256 | result = [] 257 | image_shape = tf.shape(image) 258 | image_height = image_shape[0] 259 | image_width = image_shape[1] 260 | size_coef = tf.random_uniform([], 261 | minval=min_scale_ratio, 262 | maxval=max_scale_ratio, 263 | dtype=tf.float32, seed=seed) 264 | image_newysize = tf.to_int32( 265 | tf.multiply(tf.to_float(image_height), size_coef)) 266 | image_newxsize = tf.to_int32( 267 | tf.multiply(tf.to_float(image_width), size_coef)) 268 | image = tf.image.resize_images( 269 | image, [image_newysize, image_newxsize], align_corners=True) 270 | result.append(image) 271 | if masks: 272 | masks = tf.image.resize_nearest_neighbor( 273 | masks, [image_newysize, image_newxsize], align_corners=True) 274 | result.append(masks) 275 | return tuple(result) 276 | 277 | 278 | def random_adjust_brightness(image, max_delta=32. / 255.): 279 | """Randomly adjusts brightness. 280 | Makes sure the output image is still between 0 and 1. 281 | Args: 282 | image: rank 3 float32 tensor contains 1 image -> [height, width, channels] 283 | with pixel values varying between [0, 1]. 284 | max_delta: how much to change the brightness. A value between [0, 1). 285 | Returns: 286 | image: image which is the same shape as input image. 287 | boxes: boxes which is the same shape as input boxes. 288 | """ 289 | def _random_adjust_brightness(image, max_delta): 290 | with tf.name_scope('RandomAdjustBrightness', values=[image]): 291 | image = tf.image.random_brightness(image, max_delta) 292 | image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=1.0) 293 | return image 294 | 295 | do_random = tf.greater(tf.random_uniform([]), 0.35) 296 | image = tf.cond(do_random, lambda: _random_adjust_brightness(image, max_delta), lambda: image) 297 | return image 298 | 299 | def random_adjust_contrast(image, min_delta=0.5, max_delta=1.25): 300 | """Randomly adjusts contrast. 301 | Makes sure the output image is still between 0 and 1. 302 | Args: 303 | image: rank 3 float32 tensor contains 1 image -> [height, width, channels] 304 | with pixel values varying between [0, 1]. 305 | min_delta: see max_delta. 306 | max_delta: how much to change the contrast. Contrast will change with a 307 | value between min_delta and max_delta. This value will be 308 | multiplied to the current contrast of the image. 309 | Returns: 310 | image: image which is the same shape as input image. 311 | """ 312 | def _random_adjust_contrast(image, min_delta, max_delta): 313 | with tf.name_scope('RandomAdjustContrast', values=[image]): 314 | image = tf.image.random_contrast(image, min_delta, max_delta) 315 | image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=1.0) 316 | return image 317 | 318 | do_random = tf.greater(tf.random_uniform([]), 0.35) 319 | image = tf.cond(do_random, lambda: _random_adjust_contrast(image, min_delta, max_delta), lambda: image) 320 | return image 321 | 322 | def random_adjust_hue(image, max_delta=0.02): 323 | """Randomly adjusts hue. 324 | Makes sure the output image is still between 0 and 1. 325 | Args: 326 | image: rank 3 float32 tensor contains 1 image -> [height, width, channels] 327 | with pixel values varying between [0, 1]. 328 | max_delta: change hue randomly with a value between 0 and max_delta. 329 | Returns: 330 | image: image which is the same shape as input image. 
331 | """ 332 | def _random_adjust_hue(image, max_delta): 333 | with tf.name_scope('RandomAdjustHue', values=[image]): 334 | image = tf.image.random_hue(image, max_delta) 335 | image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=1.0) 336 | return image 337 | 338 | do_random = tf.greater(tf.random_uniform([]), 0.35) 339 | image = tf.cond(do_random, lambda: _random_adjust_hue(image, max_delta), lambda: image) 340 | return image 341 | 342 | 343 | def random_adjust_saturation(image, min_delta=0.5, max_delta=1.25): 344 | """Randomly adjusts saturation. 345 | Makes sure the output image is still between 0 and 1. 346 | Args: 347 | image: rank 3 float32 tensor contains 1 image -> [height, width, channels] 348 | with pixel values varying between [0, 1]. 349 | min_delta: see max_delta. 350 | max_delta: how much to change the saturation. Saturation will change with a 351 | value between min_delta and max_delta. This value will be 352 | multiplied to the current saturation of the image. 353 | Returns: 354 | image: image which is the same shape as input image. 355 | """ 356 | def _random_adjust_saturation(image, min_delta, max_delta): 357 | with tf.name_scope('RandomAdjustSaturation', values=[image]): 358 | image = tf.image.random_saturation(image, min_delta, max_delta) 359 | image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=1.0) 360 | return image 361 | 362 | do_random = tf.greater(tf.random_uniform([]), 0.35) 363 | image = tf.cond(do_random, lambda: _random_adjust_saturation(image, min_delta, max_delta), lambda: image) 364 | return image 365 | 366 | 367 | def random_distort_color(image, color_ordering=0): 368 | """Randomly distorts color. 369 | Randomly distorts color using a combination of brightness, hue, contrast 370 | and saturation changes. Makes sure the output image is still between 0 and 1. 371 | Args: 372 | image: rank 3 float32 tensor contains 1 image -> [height, width, channels] 373 | with pixel values varying between [0, 1]. 374 | color_ordering: Python int, a type of distortion (valid values: 0, 1). 375 | Returns: 376 | image: image which is the same shape as input image. 377 | Raises: 378 | ValueError: if color_ordering is not in {0, 1}. 379 | """ 380 | with tf.name_scope('RandomDistortColor', values=[image]): 381 | if color_ordering == 0: 382 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 383 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 384 | image = tf.image.random_hue(image, max_delta=0.2) 385 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 386 | elif color_ordering == 1: 387 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 388 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 389 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 390 | image = tf.image.random_hue(image, max_delta=0.2) 391 | else: 392 | raise ValueError('color_ordering must be in {0, 1}') 393 | 394 | # The random_* ops do not necessarily clamp. 395 | image = tf.clip_by_value(image, 0.0, 1.0) 396 | return image 397 | 398 | 399 | def random_jitter_boxes(boxes, ratio=0.05, seed=None): 400 | """Randomly jitter boxes in image. 401 | Args: 402 | boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. 403 | Boxes are in normalized form meaning their coordinates vary 404 | between [0, 1]. 405 | Each row is in the form of [ymin, xmin, ymax, xmax]. 406 | ratio: The ratio of the box width and height that the corners can jitter. 
407 | For example if the width is 100 pixels and ratio is 0.05, 408 | the corners can jitter up to 5 pixels in the x direction. 409 | seed: random seed. 410 | Returns: 411 | boxes: boxes which is the same shape as input boxes. 412 | """ 413 | def random_jitter_box(box, ratio, seed): 414 | """Randomly jitter box. 415 | Args: 416 | box: bounding box [1, 1, 4]. 417 | ratio: max ratio between jittered box and original box, 418 | a number between [0, 0.5]. 419 | seed: random seed. 420 | Returns: 421 | jittered_box: jittered box. 422 | """ 423 | rand_numbers = tf.random_uniform( 424 | [1, 1, 4], minval=-ratio, maxval=ratio, dtype=tf.float32, seed=seed) 425 | box_width = tf.subtract(box[0, 0, 3], box[0, 0, 1]) 426 | box_height = tf.subtract(box[0, 0, 2], box[0, 0, 0]) 427 | hw_coefs = tf.stack([box_height, box_width, box_height, box_width]) 428 | hw_rand_coefs = tf.multiply(hw_coefs, rand_numbers) 429 | jittered_box = tf.add(box, hw_rand_coefs) 430 | jittered_box = tf.clip_by_value(jittered_box, 0.0, 1.0) 431 | return jittered_box 432 | 433 | with tf.name_scope('RandomJitterBoxes', values=[boxes]): 434 | # boxes are [N, 4]. Lets first make them [N, 1, 1, 4] 435 | boxes_shape = tf.shape(boxes) 436 | boxes = tf.expand_dims(boxes, 1) 437 | boxes = tf.expand_dims(boxes, 2) 438 | 439 | distorted_boxes = tf.map_fn( 440 | lambda x: random_jitter_box(x, ratio, seed), boxes, dtype=tf.float32) 441 | 442 | distorted_boxes = tf.reshape(distorted_boxes, boxes_shape) 443 | 444 | return distorted_boxes 445 | 446 | 447 | ## Random Crop 448 | 449 | def bboxes_resize(bbox_ref, bboxes, name=None): 450 | """Resize bounding boxes based on a reference bounding box, 451 | assuming that the latter is [0, 0, 1, 1] after transform. Useful for 452 | updating a collection of boxes after cropping an image. 453 | """ 454 | # Bboxes is dictionary. 455 | if isinstance(bboxes, dict): 456 | with tf.name_scope(name, 'bboxes_resize_dict'): 457 | d_bboxes = {} 458 | for c in bboxes.keys(): 459 | d_bboxes[c] = bboxes_resize(bbox_ref, bboxes[c]) 460 | return d_bboxes 461 | 462 | # Tensors inputs. 463 | with tf.name_scope(name, 'bboxes_resize'): 464 | # Translate. 465 | v = tf.stack([bbox_ref[0], bbox_ref[1], bbox_ref[0], bbox_ref[1]]) 466 | bboxes = bboxes - v 467 | # Scale. 468 | s = tf.stack([bbox_ref[2] - bbox_ref[0], 469 | bbox_ref[3] - bbox_ref[1], 470 | bbox_ref[2] - bbox_ref[0], 471 | bbox_ref[3] - bbox_ref[1]]) 472 | bboxes = bboxes / s 473 | return bboxes 474 | 475 | 476 | def bboxes_intersection(bbox_ref, bboxes, name=None): 477 | """Compute relative intersection between a reference box and a 478 | collection of bounding boxes. Namely, compute the quotient between 479 | intersection area and box area. 480 | Args: 481 | bbox_ref: (N, 4) or (4,) Tensor with reference bounding box(es). 482 | bboxes: (N, 4) Tensor, collection of bounding boxes. 483 | Return: 484 | (N,) Tensor with relative intersection. 485 | """ 486 | with tf.name_scope(name, 'bboxes_intersection'): 487 | # Should be more efficient to first transpose. 488 | bboxes = tf.transpose(bboxes) 489 | bbox_ref = tf.transpose(bbox_ref) 490 | # Intersection bbox and volume. 491 | int_ymin = tf.maximum(bboxes[0], bbox_ref[0]) 492 | int_xmin = tf.maximum(bboxes[1], bbox_ref[1]) 493 | int_ymax = tf.minimum(bboxes[2], bbox_ref[2]) 494 | int_xmax = tf.minimum(bboxes[3], bbox_ref[3]) 495 | h = tf.maximum(int_ymax - int_ymin, 0.) 496 | w = tf.maximum(int_xmax - int_xmin, 0.) 497 | # Volumes. 
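# Worked example for bboxes_resize above: if the sampled crop window (bbox_ref)
# is [0.25, 0.25, 0.75, 0.75] and a ground-truth box is [0.25, 0.25, 0.5, 0.5],
# the box is first translated by the crop origin and then divided by the crop
# height/width, giving its coordinates in the cropped image's frame:
#   translated = [0.25 - 0.25, 0.25 - 0.25, 0.5 - 0.25, 0.5 - 0.25] = [0.0, 0.0, 0.25, 0.25]
#   rescaled   = translated / [0.5, 0.5, 0.5, 0.5]                  = [0.0, 0.0, 0.5, 0.5]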
498 | inter_vol = h * w 499 | bboxes_vol = (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1]) 500 | #scores = tfe_math.safe_divide(inter_vol, bboxes_vol, 'intersection') 501 | scores = inter_vol / bboxes_vol 502 | return scores 503 | 504 | def bboxes_filter_overlap(labels, bboxes, threshold=0.3, 505 | scope=None): 506 | """Filter out bounding boxes based on overlap with reference 507 | box [0, 0, 1, 1]. 508 | Return: 509 | labels, bboxes: Filtered elements. 510 | """ 511 | with tf.name_scope(scope, 'bboxes_filter', [labels, bboxes]): 512 | scores = bboxes_intersection(tf.constant([0, 0, 1, 1], bboxes.dtype), 513 | bboxes) 514 | mask = scores > threshold 515 | labels = tf.boolean_mask(labels, mask) 516 | bboxes = tf.boolean_mask(bboxes, mask) 517 | return labels, bboxes 518 | 519 | def distorted_bounding_box_crop(image, 520 | bboxes, 521 | labels, 522 | min_object_covered=0.05, 523 | aspect_ratio_range=(0.8, 1.2), 524 | area_range=(0.1, 1.0), 525 | max_attempts=200, 526 | scope=None): 527 | """Generates cropped_image using a one of the bboxes randomly distorted. 528 | See `tf.image.sample_distorted_bounding_box` for more documentation. 529 | Args: 530 | image: 3-D Tensor of image (it will be converted to floats in [0, 1]). 531 | bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] 532 | where each coordinate is [0, 1) and the coordinates are arranged 533 | as [ymin, xmin, ymax, xmax]. If num_boxes is 0 then it would use the whole 534 | image. 535 | min_object_covered: An optional `float`. Defaults to `0.1`. The cropped 536 | area of the image must contain at least this fraction of any bounding box 537 | supplied. 538 | aspect_ratio_range: An optional list of `floats`. The cropped area of the 539 | image must have an aspect ratio = width / height within this range. 540 | area_range: An optional list of `floats`. The cropped area of the image 541 | must contain a fraction of the supplied image within in this range. 542 | max_attempts: An optional `int`. Number of attempts at generating a cropped 543 | region of the image of the specified constraints. After `max_attempts` 544 | failures, return the entire image. 545 | scope: Optional scope for name_scope. 546 | Returns: 547 | A tuple, a 3-D Tensor cropped_image and the distorted bbox 548 | """ 549 | bboxes = tf.clip_by_value(bboxes, 0.0, 1.0) 550 | 551 | with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bboxes]): 552 | # Each bounding box has shape [1, num_boxes, box coords] and 553 | # the coordinates are ordered [ymin, xmin, ymax, xmax]. 554 | bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box( 555 | tf.shape(image), 556 | bounding_boxes=tf.expand_dims(bboxes, 0), 557 | min_object_covered=min_object_covered, 558 | aspect_ratio_range=aspect_ratio_range, 559 | area_range=area_range, 560 | max_attempts=max_attempts, 561 | use_image_if_no_bounding_boxes=True) 562 | distort_bbox = distort_bbox[0, 0] 563 | 564 | # Crop the image to the specified bounding box. 565 | cropped_image = tf.slice(image, bbox_begin, bbox_size) 566 | # Restore the shape since the dynamic slice loses 3rd dimension. 567 | cropped_image.set_shape([None, None, 3]) 568 | 569 | # Update bounding boxes: resize and filter out. 570 | cropped_bboxes = bboxes_resize(distort_bbox, bboxes) 571 | cropped_labels, cropped_bboxes = bboxes_filter_overlap(labels, cropped_bboxes) 572 | 573 | no_box = tf.equal(tf.shape(cropped_bboxes)[0], 0) # If there is no box in the image, it returns the original image. 
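# bboxes_filter_overlap above keeps a box only if at least 30% of its area
# (the default threshold of 0.3) lies inside the crop window [0, 0, 1, 1].
# For example, a remapped box [-0.5, 0.0, 0.5, 1.0] has half of its area inside
# the window (intersection 0.5 / box area 1.0 = 0.5), so it survives, while
# [-0.8, 0.0, 0.2, 1.0] retains only 20% and is dropped; when every box is
# dropped, the no_box branch below falls back to the uncropped inputs.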
574 | image, bboxes, labels = tf.cond(no_box, lambda:(image, bboxes, labels), lambda:(cropped_image, cropped_bboxes, cropped_labels)) 575 | 576 | return image, bboxes, labels 577 | -------------------------------------------------------------------------------- /weights/readme.md: -------------------------------------------------------------------------------- 1 | ### ImageNet pre-trained weights (PyTorch) 2 | 3 | - resnet50-bn.pth : https://download.pytorch.org/models/resnet50-19c8e357.pth 4 | - resnet101-bn.pth : https://download.pytorch.org/models/resnet101-5d3b4d8f.pth 5 | - resnet152-bn.pth : https://download.pytorch.org/models/resnet152-b121ed2d.pth 6 | 7 | - resnet50-gn(32).pth : http://www.cs.unc.edu/~cyfu/resnet50_groupnorm32.tar 8 | - resnet50-gn(16).pth : http://www.cs.unc.edu/~cyfu/resnet50_groupnorm16.tar 9 | 10 | - se-resnet50-bn : https://data.lip6.fr/cadene/pretrainedmodels/se_resnet50-ce0d4300.pth 11 | - se-resnet101-bn : https://data.lip6.fr/cadene/pretrainedmodels/se_resnet101-7e38fcc6.pth 12 | - se-resnet152-bn : https://data.lip6.fr/cadene/pretrainedmodels/se_resnet152-d17c99b7.pth 13 | --------------------------------------------------------------------------------
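# The .pth files listed above are PyTorch state dicts. A rough sketch of how
# such a checkpoint can be read into NumPy arrays and rearranged into
# TensorFlow's layout, mirroring the (2, 3, 1, 0) transpose used by
# load_pytorch_weight in train.py (the file name below is only an example):
import numpy as np
import torch

def pth_to_numpy(path="weights/resnet50-bn.pth"):
    """Load a PyTorch checkpoint and convert conv kernels to TF's HWIO layout."""
    state_dict = torch.load(path, map_location="cpu")
    converted = {}
    for name, tensor in state_dict.items():
        array = tensor.numpy()
        if array.ndim == 4:  # conv kernel: OIHW (PyTorch) -> HWIO (TensorFlow)
            array = np.transpose(array, (2, 3, 1, 0))
        converted[name] = array
    return converted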