├── tracking ├── __init__.py ├── util.py ├── argmax_tracker.py ├── three_stage_tracker.py └── do_tracking.py ├── .gitignore ├── davis2017_fast_val_ids.txt ├── LICENSE ├── main_trax.py ├── hard_example_utils.py ├── utils ├── box_ops.py ├── generate_anchors.py └── np_box_ops.py ├── viz.py ├── model_mrcnn.py ├── README.md ├── vot_helper.py ├── common.py ├── model_rpn.py ├── model_box.py ├── basemodel.py ├── model_fpn.py ├── model_cascade.py ├── eval_utils.py ├── config.py ├── model_frcnn.py └── dataset.py /tracking/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | train_log 3 | tracking_data 4 | -------------------------------------------------------------------------------- /davis2017_fast_val_ids.txt: -------------------------------------------------------------------------------- 1 | bike-packing__2 2 | bmx-trees__1 3 | bmx-trees__2 4 | dogs-jump__1 5 | dogs-jump__2 6 | gold-fish__1 7 | gold-fish__2 8 | gold-fish__3 9 | gold-fish__4 10 | gold-fish__5 11 | india__1 12 | india__2 13 | india__3 14 | judo__2 15 | kite-surf__1 16 | kite-surf__2 17 | lab-coat__1 18 | lab-coat__2 19 | loading__2 20 | loading__3 21 | motocross-jump__1 22 | paragliding-launch__1 23 | paragliding-launch__2 24 | paragliding-launch__3 25 | pigs__2 26 | shooting__1 27 | shooting__3 28 | soapbox__2 29 | soapbox__3 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Visual Computing Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /main_trax.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from tracking.three_stage_tracker import ThreeStageTracker 4 | import sys 5 | import cv2 6 | import PIL 7 | import numpy as np 8 | import vot_helper 9 | 10 | 11 | class SiamRCNN: 12 | def __init__(self, image, region): 13 | sp = __file__.split("/") 14 | ckpt = "/".join(sp[:-1]) + "/train_log/hard_mining3/model-1360500" 15 | self._tracker = ThreeStageTracker(model="checkpoint:" + ckpt) 16 | x, y, w, h = region 17 | box = np.array([x, y, w, h]) 18 | self._tracker.init(image, box) 19 | 20 | def track(self, image): 21 | new_box, score = self._tracker.update(image, use_confidences=True) 22 | x, y, w, h = new_box 23 | print(new_box, score) 24 | rect = vot_helper.Rectangle(x, y, w, h) 25 | return rect, score 26 | 27 | 28 | handle = vot_helper.VOT("rectangle") 29 | selection = handle.region() 30 | imagefile = handle.frame() 31 | if not imagefile: 32 | sys.exit(0) 33 | 34 | image = np.array(PIL.Image.open(imagefile)) 35 | tracker = SiamRCNN(image, selection) 36 | while True: 37 | imagefile = handle.frame() 38 | if not imagefile: 39 | break 40 | image = np.array(PIL.Image.open(imagefile)) 41 | region, confidence = tracker.track(image) 42 | handle.report(region, confidence) 43 | -------------------------------------------------------------------------------- /hard_example_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def subsample_nns(query_seq, nns, names, n_seqs_to_sample, remove_query=True): 5 | random.shuffle(nns) 6 | 7 | nn_names = [names[n] for n in nns] 8 | nn_seqs = [x.split("/")[-2] for x in nn_names] 9 | 10 | seq_to_nns = {} 11 | for nn, nn_seq in zip(nns, nn_seqs): 12 | if nn_seq not in seq_to_nns: 13 | seq_to_nns[nn_seq] = [] 14 | seq_to_nns[nn_seq].append(nn) 15 | 16 | #seq_to_n = sorted([(k, len(v)) for k, v in seq_to_nns.items()], key=lambda x: x[1], reverse=True) 17 | #n_total = sum(x[1] for x in seq_to_n) 18 | #for seq, n_in_seqs in seq_to_n: 19 | # pct = n_in_seqs * 100 / n_total 20 | # if pct > 1.0: 21 | # print(seq, pct, "%") 22 | #print("n_seqs in nns", len(seq_to_nns)) 23 | 24 | sampled_nns = [] 25 | sample_seqs = set(seq_to_nns.keys()) 26 | if remove_query: 27 | sample_seqs.remove(query_seq) 28 | sample_seqs = list(sample_seqs) 29 | random.shuffle(sample_seqs) 30 | sample_seqs = sample_seqs[:n_seqs_to_sample] 31 | 32 | # get 1 per sequence 33 | for seq in sample_seqs: 34 | seq_nns = seq_to_nns[seq] 35 | nn = random.choice(seq_nns) 36 | sampled_nns.append(nn) 37 | return sampled_nns 38 | 39 | 40 | def subsample_nns_old(name, nns, names, n_seqs_to_sample): 41 | random.shuffle(nns) 42 | 43 | nn_names = [names[n] for n in nns] 44 | nn_seqs = [x.split("/")[-2] for x in nn_names] 45 | seq = name.split("/")[-2] 46 | 47 | sampled_nns = [] 48 | sampled_seqs = set() 49 | sampled_seqs.add(seq) 50 | # get 1 per sequence 51 | for nn, seq in zip(nns, nn_seqs): 52 | if seq not in sampled_seqs: 53 | sampled_seqs.add(seq) 54 | sampled_nns.append(nn) 55 | sampled_nns = sampled_nns[:n_seqs_to_sample] 56 | return sampled_nns 57 | -------------------------------------------------------------------------------- /utils/box_ops.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: box_ops.py 3 | 4 | import tensorflow as tf 5 | 6 | from 
tensorpack.tfutils.scope_utils import under_name_scope 7 | 8 | 9 | """ 10 | This file is modified from 11 | https://github.com/tensorflow/models/blob/master/object_detection/core/box_list_ops.py 12 | """ 13 | 14 | 15 | @under_name_scope() 16 | def area(boxes): 17 | """ 18 | Args: 19 | boxes: nx4 floatbox 20 | 21 | Returns: 22 | n 23 | """ 24 | x_min, y_min, x_max, y_max = tf.split(boxes, 4, axis=1) 25 | return tf.squeeze((y_max - y_min) * (x_max - x_min), [1]) 26 | 27 | 28 | @under_name_scope() 29 | def pairwise_intersection(boxlist1, boxlist2): 30 | """Compute pairwise intersection areas between boxes. 31 | 32 | Args: 33 | boxlist1: Nx4 floatbox 34 | boxlist2: Mx4 35 | 36 | Returns: 37 | a tensor with shape [N, M] representing pairwise intersections 38 | """ 39 | x_min1, y_min1, x_max1, y_max1 = tf.split(boxlist1, 4, axis=1) 40 | x_min2, y_min2, x_max2, y_max2 = tf.split(boxlist2, 4, axis=1) 41 | all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2)) 42 | all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2)) 43 | intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin) 44 | all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2)) 45 | all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2)) 46 | intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin) 47 | return intersect_heights * intersect_widths 48 | 49 | 50 | @under_name_scope() 51 | def pairwise_iou(boxlist1, boxlist2): 52 | """Computes pairwise intersection-over-union between box collections. 53 | 54 | Args: 55 | boxlist1: Nx4 floatbox 56 | boxlist2: Mx4 57 | 58 | Returns: 59 | a tensor with shape [N, M] representing pairwise iou scores. 60 | """ 61 | intersections = pairwise_intersection(boxlist1, boxlist2) 62 | areas1 = area(boxlist1) 63 | areas2 = area(boxlist2) 64 | unions = ( 65 | tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections) 66 | return tf.where( 67 | tf.equal(intersections, 0.0), 68 | tf.zeros_like(intersections), tf.truediv(intersections, unions)) 69 | -------------------------------------------------------------------------------- /viz.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: viz.py 3 | 4 | import numpy as np 5 | from six.moves import zip 6 | 7 | from tensorpack.utils import viz 8 | from tensorpack.utils.palette import PALETTE_RGB 9 | 10 | from config import config as cfg 11 | from utils.np_box_ops import iou as np_iou 12 | 13 | 14 | def draw_annotation(img, boxes, klass, is_crowd=None): 15 | """Will not modify img""" 16 | labels = [] 17 | assert len(boxes) == len(klass) 18 | if is_crowd is not None: 19 | assert len(boxes) == len(is_crowd) 20 | for cls, crd in zip(klass, is_crowd): 21 | clsname = cfg.DATA.CLASS_NAMES[cls] 22 | if crd == 1: 23 | clsname += ';Crowd' 24 | labels.append(clsname) 25 | else: 26 | for cls in klass: 27 | labels.append(cfg.DATA.CLASS_NAMES[cls]) 28 | img = viz.draw_boxes(img, boxes, labels) 29 | return img 30 | 31 | 32 | def draw_proposal_recall(img, proposals, proposal_scores, gt_boxes): 33 | """ 34 | Draw top3 proposals for each gt. 
35 | Args: 36 | proposals: NPx4 37 | proposal_scores: NP 38 | gt_boxes: NG 39 | """ 40 | box_ious = np_iou(gt_boxes, proposals) # ng x np 41 | box_ious_argsort = np.argsort(-box_ious, axis=1) 42 | good_proposals_ind = box_ious_argsort[:, :3] # for each gt, find 3 best proposals 43 | good_proposals_ind = np.unique(good_proposals_ind.ravel()) 44 | 45 | proposals = proposals[good_proposals_ind, :] 46 | tags = list(map(str, proposal_scores[good_proposals_ind])) 47 | img = viz.draw_boxes(img, proposals, tags) 48 | return img, good_proposals_ind 49 | 50 | 51 | def draw_predictions(img, boxes, scores): 52 | """ 53 | Args: 54 | boxes: kx4 55 | scores: kxC 56 | """ 57 | if len(boxes) == 0: 58 | return img 59 | labels = scores.argmax(axis=1) 60 | scores = scores.max(axis=1) 61 | tags = ["{},{:.2f}".format(cfg.DATA.CLASS_NAMES[lb], score) for lb, score in zip(labels, scores)] 62 | return viz.draw_boxes(img, boxes, tags) 63 | 64 | 65 | def draw_final_outputs(img, results): 66 | """ 67 | Args: 68 | results: [DetectionResult] 69 | """ 70 | if len(results) == 0: 71 | return img 72 | 73 | tags = [] 74 | for r in results: 75 | tags.append( 76 | "{},{:.2f}".format(cfg.DATA.CLASS_NAMES[r.class_id], r.score)) 77 | boxes = np.asarray([r.box for r in results]) 78 | ret = viz.draw_boxes(img, boxes, tags) 79 | 80 | for r in results: 81 | if r.mask is not None: 82 | ret = draw_mask(ret, r.mask) 83 | return ret 84 | 85 | 86 | def draw_mask(im, mask, alpha=0.5, color=None): 87 | """ 88 | Overlay a mask on top of the image. 89 | 90 | Args: 91 | im: a 3-channel uint8 image in BGR 92 | mask: a binary 1-channel image of the same size 93 | color: if None, will choose automatically 94 | """ 95 | if color is None: 96 | color = PALETTE_RGB[np.random.choice(len(PALETTE_RGB))][::-1] 97 | im = np.where(np.repeat((mask > 0)[:, :, None], 3, axis=2), 98 | im * (1 - alpha) + color * alpha, im) 99 | im = im.astype('uint8') 100 | return im 101 | -------------------------------------------------------------------------------- /utils/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py 2 | 3 | # -------------------------------------------------------- 4 | # Faster R-CNN 5 | # Copyright (c) 2015 Microsoft 6 | # Licensed under The MIT License [see LICENSE for details] 7 | # Written by Ross Girshick and Sean Bell 8 | # -------------------------------------------------------- 9 | 10 | import numpy as np 11 | from six.moves import range 12 | 13 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 14 | # 15 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 16 | # >> anchors 17 | # 18 | # anchors = 19 | # 20 | # -83 -39 100 56 21 | # -175 -87 192 104 22 | # -359 -183 376 200 23 | # -55 -55 72 72 24 | # -119 -119 136 136 25 | # -247 -247 264 264 26 | # -35 -79 52 96 27 | # -79 -167 96 184 28 | # -167 -343 184 360 29 | 30 | # array([[ -83., -39., 100., 56.], 31 | # [-175., -87., 192., 104.], 32 | # [-359., -183., 376., 200.], 33 | # [ -55., -55., 72., 72.], 34 | # [-119., -119., 136., 136.], 35 | # [-247., -247., 264., 264.], 36 | # [ -35., -79., 52., 96.], 37 | # [ -79., -167., 96., 184.], 38 | # [-167., -343., 184., 360.]]) 39 | 40 | 41 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 42 | scales=2**np.arange(3, 6)): 43 | """ 44 | Generate anchor (reference) windows by enumerating aspect ratios X 45 | scales wrt a reference (0, 0, 15, 15) 
window. 46 | """ 47 | 48 | base_anchor = np.array([1, 1, base_size, base_size], dtype='float32') - 1 49 | ratio_anchors = _ratio_enum(base_anchor, ratios) 50 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 51 | for i in range(ratio_anchors.shape[0])]) 52 | return anchors 53 | 54 | 55 | def _whctrs(anchor): 56 | """ 57 | Return width, height, x center, and y center for an anchor (window). 58 | """ 59 | 60 | w = anchor[2] - anchor[0] + 1 61 | h = anchor[3] - anchor[1] + 1 62 | x_ctr = anchor[0] + 0.5 * (w - 1) 63 | y_ctr = anchor[1] + 0.5 * (h - 1) 64 | return w, h, x_ctr, y_ctr 65 | 66 | 67 | def _mkanchors(ws, hs, x_ctr, y_ctr): 68 | """ 69 | Given a vector of widths (ws) and heights (hs) around a center 70 | (x_ctr, y_ctr), output a set of anchors (windows). 71 | """ 72 | 73 | ws = ws[:, np.newaxis] 74 | hs = hs[:, np.newaxis] 75 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 76 | y_ctr - 0.5 * (hs - 1), 77 | x_ctr + 0.5 * (ws - 1), 78 | y_ctr + 0.5 * (hs - 1))) 79 | return anchors 80 | 81 | 82 | def _ratio_enum(anchor, ratios): 83 | """ 84 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 85 | """ 86 | 87 | w, h, x_ctr, y_ctr = _whctrs(anchor) 88 | size = w * h 89 | size_ratios = size / ratios 90 | ws = np.round(np.sqrt(size_ratios)) 91 | hs = np.round(ws * ratios) 92 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 93 | return anchors 94 | 95 | 96 | def _scale_enum(anchor, scales): 97 | """ 98 | Enumerate a set of anchors for each scale wrt an anchor. 99 | """ 100 | 101 | w, h, x_ctr, y_ctr = _whctrs(anchor) 102 | ws = w * scales 103 | hs = h * scales 104 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 105 | return anchors 106 | -------------------------------------------------------------------------------- /model_mrcnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | from tensorpack.models import Conv2D, Conv2DTranspose, layer_register 6 | from tensorpack.tfutils.argscope import argscope 7 | from tensorpack.tfutils.common import get_tf_version_tuple 8 | from tensorpack.tfutils.scope_utils import under_name_scope 9 | from tensorpack.tfutils.summary import add_moving_summary 10 | 11 | from basemodel import GroupNorm 12 | from config import config as cfg 13 | 14 | 15 | @under_name_scope() 16 | def maskrcnn_loss(mask_logits, fg_labels, fg_target_masks): 17 | """ 18 | Args: 19 | mask_logits: #fg x #category xhxw 20 | fg_labels: #fg, in 1~#class, int64 21 | fg_target_masks: #fgxhxw, float32 22 | """ 23 | num_fg = tf.size(fg_labels, out_type=tf.int64) 24 | indices = tf.stack([tf.range(num_fg), fg_labels - 1], axis=1) # #fgx2 25 | mask_logits = tf.gather_nd(mask_logits, indices) # #fgxhxw 26 | mask_probs = tf.sigmoid(mask_logits) 27 | 28 | # add some training visualizations to tensorboard 29 | with tf.name_scope('mask_viz'): 30 | viz = tf.concat([fg_target_masks, mask_probs], axis=1) 31 | viz = tf.expand_dims(viz, 3) 32 | viz = tf.cast(viz * 255, tf.uint8, name='viz') 33 | tf.summary.image('mask_truth|pred', viz, max_outputs=10) 34 | 35 | loss = tf.nn.sigmoid_cross_entropy_with_logits( 36 | labels=fg_target_masks, logits=mask_logits) 37 | loss = tf.reduce_mean(loss, name='maskrcnn_loss') 38 | 39 | pred_label = mask_probs > 0.5 40 | truth_label = fg_target_masks > 0.5 41 | accuracy = tf.reduce_mean( 42 | tf.cast(tf.equal(pred_label, truth_label), tf.float32), 43 | name='accuracy') 44 | pos_accuracy = tf.logical_and( 45 | tf.equal(pred_label, truth_label), 46 | 
tf.equal(truth_label, True)) 47 | pos_accuracy = tf.reduce_mean(tf.cast(pos_accuracy, tf.float32), name='pos_accuracy') 48 | fg_pixel_ratio = tf.reduce_mean(tf.cast(truth_label, tf.float32), name='fg_pixel_ratio') 49 | 50 | add_moving_summary(loss, accuracy, fg_pixel_ratio, pos_accuracy) 51 | return loss 52 | 53 | 54 | @layer_register(log_shape=True) 55 | def maskrcnn_upXconv_head(feature, num_category, num_convs, norm=None): 56 | """ 57 | Args: 58 | feature (NxCx s x s): size is 7 in C4 models and 14 in FPN models. 59 | num_category(int): 60 | num_convs (int): number of convolution layers 61 | norm (str or None): either None or 'GN' 62 | 63 | Returns: 64 | mask_logits (N x num_category x 2s x 2s): 65 | """ 66 | assert norm in [None, 'GN'], norm 67 | l = feature 68 | with argscope([Conv2D, Conv2DTranspose], data_format='channels_first', 69 | kernel_initializer=tf.variance_scaling_initializer( 70 | scale=2.0, mode='fan_out', 71 | distribution='untruncated_normal' if get_tf_version_tuple() >= (1, 12) else 'normal')): 72 | # c2's MSRAFill is fan_out 73 | for k in range(num_convs): 74 | l = Conv2D('fcn{}'.format(k), l, cfg.MRCNN.HEAD_DIM, 3, activation=tf.nn.relu) 75 | if norm is not None: 76 | l = GroupNorm('gn{}'.format(k), l) 77 | l = Conv2DTranspose('deconv', l, cfg.MRCNN.HEAD_DIM, 2, strides=2, activation=tf.nn.relu) 78 | l = Conv2D('conv', l, num_category, 1) 79 | return l 80 | 81 | 82 | def maskrcnn_up4conv_head(*args, **kwargs): 83 | return maskrcnn_upXconv_head(*args, num_convs=4, **kwargs) 84 | 85 | 86 | def maskrcnn_up4conv_gn_head(*args, **kwargs): 87 | return maskrcnn_upXconv_head(*args, num_convs=4, norm='GN', **kwargs) 88 | -------------------------------------------------------------------------------- /utils/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, 4] numpy arrays representing bounding boxes. 17 | 18 | Example box operations that are supported: 19 | * Areas: compute bounding box areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | import numpy as np 23 | 24 | 25 | def area(boxes): 26 | """Computes area of boxes. 27 | 28 | Args: 29 | boxes: Numpy array with shape [N, 4] holding N boxes 30 | 31 | Returns: 32 | a numpy array with shape [N*1] representing box areas 33 | """ 34 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 35 | 36 | 37 | def intersection(boxes1, boxes2): 38 | """Compute pairwise intersection areas between boxes. 
39 | 40 | Args: 41 | boxes1: a numpy array with shape [N, 4] holding N boxes 42 | boxes2: a numpy array with shape [M, 4] holding M boxes 43 | 44 | Returns: 45 | a numpy array with shape [N*M] representing pairwise intersection area 46 | """ 47 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 48 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 49 | 50 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 51 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 52 | intersect_heights = np.maximum( 53 | np.zeros(all_pairs_max_ymin.shape, dtype='f4'), 54 | all_pairs_min_ymax - all_pairs_max_ymin) 55 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 56 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 57 | intersect_widths = np.maximum( 58 | np.zeros(all_pairs_max_xmin.shape, dtype='f4'), 59 | all_pairs_min_xmax - all_pairs_max_xmin) 60 | return intersect_heights * intersect_widths 61 | 62 | 63 | def iou(boxes1, boxes2): 64 | """Computes pairwise intersection-over-union between box collections. 65 | 66 | Args: 67 | boxes1: a numpy array with shape [N, 4] holding N boxes. 68 | boxes2: a numpy array with shape [M, 4] holding M boxes. 69 | 70 | Returns: 71 | a numpy array with shape [N, M] representing pairwise iou scores. 72 | """ 73 | intersect = intersection(boxes1, boxes2) 74 | area1 = area(boxes1) 75 | area2 = area(boxes2) 76 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 77 | area2, axis=0) - intersect 78 | return intersect / union 79 | 80 | 81 | def ioa(boxes1, boxes2): 82 | """Computes pairwise intersection-over-area between box collections. 83 | 84 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 85 | their intersection area over box2's area. Note that ioa is not symmetric, 86 | that is, IOA(box1, box2) != IOA(box2, box1). 87 | 88 | Args: 89 | boxes1: a numpy array with shape [N, 4] holding N boxes. 90 | boxes2: a numpy array with shape [M, 4] holding N boxes. 91 | 92 | Returns: 93 | a numpy array with shape [N, M] representing pairwise ioa scores. 94 | """ 95 | intersect = intersection(boxes1, boxes2) 96 | inv_areas = np.expand_dims(1.0 / area(boxes2), axis=0) 97 | return intersect * inv_areas 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Siam R-CNN: Visual Tracking by Re-Detection 2 | ### [Paul Voigtlaender](https://www.vision.rwth-aachen.de/person/197/), [Jonathon Luiten](https://www.vision.rwth-aachen.de/person/216/), [Philip H.S. Torr](https://www.robots.ox.ac.uk/~tvg/), [Bastian Leibe](https://www.vision.rwth-aachen.de/) 3 | The corresponding project page can be found here: https://www.vision.rwth-aachen.de/page/siamrcnn 4 | 5 | This software is written in Python3 and powered by TensorFlow 1. 
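Programmatically the tracker is driven through `ThreeStageTracker` (see `main_trax.py` and `tracking/three_stage_tracker.py`). Below is a minimal sketch of that flow, assuming the `hard_mining3` checkpoint from the download step further down; the frame paths and box values are placeholders:
```
import numpy as np
import PIL.Image
from tracking.three_stage_tracker import ThreeStageTracker

# Placeholder frame list and first-frame bounding box in (x, y, w, h) format.
frame_paths = ["frames/00000.jpg", "frames/00001.jpg"]
init_box = np.array([100.0, 150.0, 80.0, 60.0])

tracker = ThreeStageTracker(model="checkpoint:train_log/hard_mining3/model-1360500")
tracker.init(np.array(PIL.Image.open(frame_paths[0])), init_box)

for path in frame_paths[1:]:
    frame = np.array(PIL.Image.open(path))
    # update() returns the new (x, y, w, h) box and a confidence score.
    box, score = tracker.update(frame, use_confidences=True)
    print(path, box, score)
```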
6 | 7 | We borrow a lot of code from TensorPack's Faster R-CNN example: https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN 8 | 9 | ## Installation 10 | 11 | ### Download necessary libraries 12 | Here we will put all external libraries and this repository into /home/${USERNAME}/vision and use 13 | pip to install common libraries 14 | ``` 15 | mkdir /home/${USERNAME}/vision 16 | cd /home/${USERNAME}/vision 17 | 18 | git clone https://github.com/VisualComputingInstitute/SiamR-CNN.git 19 | git clone https://github.com/pvoigtlaender/got10k-toolkit.git 20 | git clone https://github.com/tensorpack/tensorpack.git 21 | 22 | cd tensorpack 23 | git checkout d24a9230d50b1dea1712a4c2765a11876f1e193c 24 | cd .. 25 | 26 | pip3 install cython 27 | pip3 install tensorflow-gpu==1.15 28 | pip3 install wget shapely msgpack msgpack_numpy tabulate xmltodict pycocotools opencv-python tqdm zmq annoy 29 | ``` 30 | ### Add libraries to your PYTHONPATH 31 | ``` 32 | export PYTHONPATH=${PYTHONPATH}:/home/${USERNAME}/vision/got10k-toolkit/:/home/${USERNAME}/vision/tensorpack/ 33 | ``` 34 | 35 | ### Make Folder for models and logs and download pre-trained model 36 | ``` 37 | cd SiamR-CNN/ 38 | mkdir train_log 39 | cd train_log 40 | wget --no-check-certificate -r -nH --cut-dirs=2 --no-parent --reject="index.html*" https://omnomnom.vision.rwth-aachen.de/data/siamrcnn/hard_mining3/ 41 | cd .. 42 | ``` 43 | ## Evaluation 44 | For evaluation, first set the path to the dataset on which you want to evaluate in tracking/do_tracking.py, e.g. 45 | ``` 46 | OTB_2015_ROOT_DIR = '/data/otb2015/' 47 | ``` 48 | 49 | Then run tracking/do_tracking.py and specify the dataset you want to evaluate on using the main function for this dataset using e.g. --main main_otb 50 | 51 | ``` 52 | python3 tracking/do_tracking.py --main main_otb 53 | ``` 54 | 55 | The result will then be written to tracking_data/results/ 56 | 57 | ## Training 58 | Download the pre-trained Mask R-CNN model from http://models.tensorpack.com/FasterRCNN/COCO-MaskRCNN-R101FPN9xGNCasAugScratch.npz 59 | 60 | Now change the paths to the training datasets in config.py, e.g. 61 | ``` 62 | _C.DATA.IMAGENET_VID_ROOT = "/globalwork/data/ILSVRC_VID/ILSVRC/" 63 | ``` 64 | there you can also enable and disable different datasets, e.g. 65 | ``` 66 | _C.DATA.IMAGENET_VID = True 67 | ``` 68 | 69 | To run the main training (without hard example mining): 70 | ``` 71 | python3 train.py --load /path/to/COCO-R101FPN-MaskRCNN-ScratchGN.npz 72 | ``` 73 | 74 | ## Hints about the code 75 | In the code, we sometimes use the terminology "ThreeStageTracker" or three stages. This refers to the Tracklet Dynamic Programming Algorithm (TDPA). 76 | 77 | In order to make the code more readable, we removed some parts before publishing. If there's an important feature which you are missing, please write us an email at voigtlaender@vision.rwth-aachen.de 78 | 79 | In the current version of the code, the functions to pre-compute the features for hard example mining are not available, but we can share the pre-computed data on request. 80 | 81 | ## References 82 | If you find this code useful, please cite 83 | ``` 84 | Siam R-CNN: Visual Tracking by Re-Detection 85 | Paul Voigtlaender, Jonathon Luiten, Philip H.S. Torr, Bastian Leibe. 86 | IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020. 
87 | ``` 88 | -------------------------------------------------------------------------------- /tracking/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import PIL.Image 3 | 4 | from examples.FasterRCNN.common import clip_boxes 5 | 6 | 7 | def xyxy_to_cxcywh_np(boxes_xyxy): 8 | wh = boxes_xyxy[:, 2:] - boxes_xyxy[:, :2] 9 | c = boxes_xyxy[:, :2] + wh / 2 10 | boxes_cwh = np.concatenate((c, wh), axis=1) 11 | return boxes_cwh 12 | 13 | 14 | def cxcywh_to_xyxy_np(boxes_cxcywh): 15 | boxes_xyxy = boxes_cxcywh.copy() 16 | boxes_xyxy[:, :2] -= 0.5 * boxes_xyxy[:, 2:] 17 | boxes_xyxy[:, 2:] += boxes_xyxy[:, :2] 18 | return boxes_xyxy 19 | 20 | 21 | def resize_and_clip_boxes(img, resized_img, boxes): 22 | scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] * resized_img.shape[1] / img.shape[1]) 23 | orig_shape = img.shape[:2] 24 | boxes = boxes / scale 25 | boxes = clip_boxes(boxes, orig_shape) 26 | return boxes 27 | 28 | 29 | # adapted from https://github.com/matterport/Mask_RCNN/blob/master/mrcnn/visualize.py 30 | def generate_colors(): 31 | """ 32 | Generate random colors. 33 | To get visually distinct colors, generate them in HSV space then 34 | convert to RGB. 35 | """ 36 | N = 30 37 | brightness = 0.7 38 | hsv = [(i / N, 1, brightness) for i in range(N)] 39 | import colorsys 40 | colors = list(map(lambda c: colorsys.hsv_to_rgb(*c), hsv)) 41 | perm = [15, 13, 25, 12, 19, 8, 22, 24, 29, 17, 28, 20, 2, 27, 11, 26, 21, 4, 3, 18, 9, 5, 14, 1, 16, 0, 23, 7, 6, 10] 42 | colors = [colors[idx] for idx in perm] 43 | return colors 44 | 45 | 46 | def postproc_seq_name_otb(seq_name): 47 | if seq_name == "Human4": 48 | seq_name_postproc = "Human4-2" 49 | elif seq_name == "Skating2_1": 50 | seq_name_postproc = "Skating2-1" 51 | elif seq_name == "Skating2_2": 52 | seq_name_postproc = "Skating2-2" 53 | elif seq_name == "Jogging_1": 54 | seq_name_postproc = "Jogging-1" 55 | elif seq_name == "Jogging_2": 56 | seq_name_postproc = "Jogging-2" 57 | else: 58 | seq_name_postproc = seq_name 59 | return seq_name_postproc 60 | 61 | 62 | def read_gt_otb(gt_file): 63 | boxes = [] 64 | with open(gt_file) as f: 65 | for l in f: 66 | l = l.strip() 67 | assert "," in l 68 | sp = l.split(",") 69 | x1, y1, w, h = [float(x) for x in sp] 70 | x2 = x1 + w 71 | y2 = y1 + h 72 | box = [x1, y1, x2, y2] 73 | boxes.append(box) 74 | boxes = np.array(boxes) 75 | return boxes 76 | 77 | 78 | pascal_colormap = [ 79 | 0, 0, 0, 80 | 0.5020, 0, 0, 81 | 0, 0.5020, 0, 82 | 0.5020, 0.5020, 0, 83 | 0, 0, 0.5020, 84 | 0.5020, 0, 0.5020, 85 | 0, 0.5020, 0.5020, 86 | 0.5020, 0.5020, 0.5020, 87 | 0.2510, 0, 0, 88 | 0.7529, 0, 0, 89 | 0.2510, 0.5020, 0, 90 | 0.7529, 0.5020, 0, 91 | 0.2510, 0, 0.5020, 92 | 0.7529, 0, 0.5020, 93 | 0.2510, 0.5020, 0.5020, 94 | 0.7529, 0.5020, 0.5020, 95 | 0, 0.2510, 0, 96 | 0.5020, 0.2510, 0, 97 | 0, 0.7529, 0, 98 | 0.5020, 0.7529, 0, 99 | 0, 0.2510, 0.5020, 100 | 0.5020, 0.2510, 0.5020, 101 | 0, 0.7529, 0.5020, 102 | 0.5020, 0.7529, 0.5020, 103 | 0.2510, 0.2510, 0] 104 | 105 | 106 | def save_segmentation_with_colormap(filename, img): 107 | """Saves a segmentation with the pascal colormap as expected for DAVIS eval. 108 | Args: 109 | filename: Where to store the segmentation. 110 | img: A numpy array of the segmentation to be saved. 111 | """ 112 | if img.shape[-1] == 1: 113 | img = img[..., 0] 114 | 115 | # Save with colormap. 
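# A palette ('P'-mode) PIL image is built from the PASCAL colormap scaled to 0-255;
# quantizing the label image against it writes an indexed PNG in the format the DAVIS evaluation tooling expects.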
116 | colormap = (np.array(pascal_colormap) * 255).round().astype('uint8') 117 | colormap_image = PIL.Image.new('P', (16, 16)) 118 | colormap_image.putpalette(colormap) 119 | pil_image = PIL.Image.fromarray(img.astype('uint8')) 120 | pil_image_with_colormap = pil_image.quantize(palette=colormap_image) 121 | pil_image_with_colormap.save(filename) 122 | 123 | -------------------------------------------------------------------------------- /vot_helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | \file vot.py 3 | 4 | @brief Python utility functions for VOT integration 5 | 6 | @author Luka Cehovin, Alessio Dore 7 | 8 | @date 2016 9 | 10 | """ 11 | 12 | import sys 13 | import copy 14 | import collections 15 | import numpy as np 16 | 17 | try: 18 | import trax 19 | except ImportError: 20 | raise Exception('TraX support not found. Please add trax module to Python path.') 21 | 22 | Rectangle = collections.namedtuple('Rectangle', ['x', 'y', 'width', 'height']) 23 | Point = collections.namedtuple('Point', ['x', 'y']) 24 | Polygon = collections.namedtuple('Polygon', ['points']) 25 | 26 | class VOT(object): 27 | """ Base class for Python VOT integration """ 28 | def __init__(self, region_format, channels=None): 29 | """ Constructor 30 | 31 | Args: 32 | region_format: Region format options 33 | """ 34 | assert(region_format in [trax.Region.RECTANGLE, trax.Region.POLYGON, trax.Region.MASK]) 35 | 36 | if channels is None: 37 | channels = ['color'] 38 | elif channels == 'rgbd': 39 | channels = ['color', 'depth'] 40 | elif channels == 'rgbt': 41 | channels = ['color', 'ir'] 42 | elif channels == 'ir': 43 | channels = ['ir'] 44 | else: 45 | raise Exception('Illegal configuration {}.'.format(channels)) 46 | 47 | self._trax = trax.Server([region_format], [trax.Image.PATH], channels, customMetadata=dict(vot="python")) 48 | 49 | request = self._trax.wait() 50 | assert(request.type == 'initialize') 51 | if isinstance(request.region, trax.Polygon): 52 | self._region = Polygon([Point(x[0], x[1]) for x in request.region]) 53 | elif isinstance(request.region, trax.Mask): 54 | self._region = request.region.array(True) 55 | else: 56 | self._region = Rectangle(*request.region.bounds()) 57 | self._image = [x.path() for k, x in request.image.items()] 58 | if len(self._image) == 1: 59 | self._image = self._image[0] 60 | 61 | self._trax.status(request.region) 62 | 63 | def region(self): 64 | """ 65 | Send configuration message to the client and receive the initialization 66 | region and the path of the first image 67 | 68 | Returns: 69 | initialization region 70 | """ 71 | 72 | return self._region 73 | 74 | def report(self, region, confidence = None): 75 | """ 76 | Report the tracking results to the client 77 | 78 | Arguments: 79 | region: region for the frame 80 | """ 81 | assert(isinstance(region, (Rectangle, Polygon, np.ndarray))) 82 | if isinstance(region, Polygon): 83 | tregion = trax.Polygon.create([(x.x, x.y) for x in region.points]) 84 | elif isinstance(region, np.ndarray): 85 | tregion = trax.Mask.create(region) 86 | else: 87 | tregion = trax.Rectangle.create(region.x, region.y, region.width, region.height) 88 | properties = {} 89 | if not confidence is None: 90 | properties['confidence'] = confidence 91 | self._trax.status(tregion, properties) 92 | 93 | def frame(self): 94 | """ 95 | Get a frame (image path) from client 96 | 97 | Returns: 98 | absolute path of the image 99 | """ 100 | if hasattr(self, "_image"): 101 | image = self._image 102 | del self._image 103 
| return image 104 | 105 | request = self._trax.wait() 106 | 107 | if request.type == 'frame': 108 | image = [x.path() for k, x in request.image.items()] 109 | if len(image) == 1: 110 | return image[0] 111 | return image 112 | else: 113 | return None 114 | 115 | 116 | def quit(self): 117 | if hasattr(self, '_trax'): 118 | self._trax.quit() 119 | 120 | def __del__(self): 121 | self.quit() 122 | 123 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: common.py 3 | 4 | import numpy as np 5 | import cv2 6 | 7 | from tensorpack.dataflow import RNGDataFlow 8 | from tensorpack.dataflow.imgaug import transform 9 | 10 | 11 | class DataFromListOfDict(RNGDataFlow): 12 | def __init__(self, lst, keys, shuffle=False): 13 | self._lst = lst 14 | self._keys = keys 15 | self._shuffle = shuffle 16 | self._size = len(lst) 17 | 18 | def __len__(self): 19 | return self._size 20 | 21 | def __iter__(self): 22 | if self._shuffle: 23 | self.rng.shuffle(self._lst) 24 | for dic in self._lst: 25 | dp = [dic[k] for k in self._keys] 26 | yield dp 27 | 28 | 29 | class CustomResize(transform.TransformAugmentorBase): 30 | """ 31 | Try resizing the shortest edge to a certain number 32 | while avoiding the longest edge to exceed max_size. 33 | """ 34 | 35 | def __init__(self, short_edge_length, max_size, interp=cv2.INTER_LINEAR): 36 | """ 37 | Args: 38 | short_edge_length ([int, int]): a [min, max] interval from which to sample the 39 | shortest edge length. 40 | max_size (int): maximum allowed longest edge length. 41 | """ 42 | super(CustomResize, self).__init__() 43 | if isinstance(short_edge_length, int): 44 | short_edge_length = (short_edge_length, short_edge_length) 45 | self._init(locals()) 46 | 47 | def _get_augment_params(self, img): 48 | h, w = img.shape[:2] 49 | size = self.rng.randint( 50 | self.short_edge_length[0], self.short_edge_length[1] + 1) 51 | scale = size * 1.0 / min(h, w) 52 | if h < w: 53 | newh, neww = size, scale * w 54 | else: 55 | newh, neww = scale * h, size 56 | if max(newh, neww) > self.max_size: 57 | scale = self.max_size * 1.0 / max(newh, neww) 58 | newh = newh * scale 59 | neww = neww * scale 60 | neww = int(neww + 0.5) 61 | newh = int(newh + 0.5) 62 | return transform.ResizeTransform(h, w, newh, neww, self.interp) 63 | 64 | 65 | def box_to_point8(boxes): 66 | """ 67 | Args: 68 | boxes: nx4 69 | 70 | Returns: 71 | (nx4)x2 72 | """ 73 | b = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]] 74 | b = b.reshape((-1, 2)) 75 | return b 76 | 77 | 78 | def point8_to_box(points): 79 | """ 80 | Args: 81 | points: (nx4)x2 82 | Returns: 83 | nx4 boxes (x1y1x2y2) 84 | """ 85 | p = points.reshape((-1, 4, 2)) 86 | minxy = p.min(axis=1) # nx2 87 | maxxy = p.max(axis=1) # nx2 88 | return np.concatenate((minxy, maxxy), axis=1) 89 | 90 | 91 | def segmentation_to_mask(polys, height, width): 92 | """ 93 | Convert polygons to binary masks. 94 | 95 | Args: 96 | polys: a list of nx2 float array. Each array contains many (x, y) coordinates. 97 | 98 | Returns: 99 | a binary matrix of (height, width) 100 | """ 101 | polys = [p.flatten().tolist() for p in polys] 102 | assert len(polys) > 0, "Polygons are empty!" 
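# pycocotools encodes each polygon as a COCO run-length mask (RLE), merges them into a single RLE,
# and decodes that back into a dense (height, width) binary mask.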
103 | 104 | import pycocotools.mask as cocomask 105 | rles = cocomask.frPyObjects(polys, height, width) 106 | rle = cocomask.merge(rles) 107 | return cocomask.decode(rle) 108 | 109 | 110 | def clip_boxes(boxes, shape): 111 | """ 112 | Args: 113 | boxes: (...)x4, float 114 | shape: h, w 115 | """ 116 | orig_shape = boxes.shape 117 | boxes = boxes.reshape([-1, 4]) 118 | h, w = shape 119 | boxes[:, [0, 1]] = np.maximum(boxes[:, [0, 1]], 0) 120 | boxes[:, 2] = np.minimum(boxes[:, 2], w) 121 | boxes[:, 3] = np.minimum(boxes[:, 3], h) 122 | return boxes.reshape(orig_shape) 123 | 124 | 125 | def filter_boxes_inside_shape(boxes, shape): 126 | """ 127 | Args: 128 | boxes: (nx4), float 129 | shape: (h, w) 130 | 131 | Returns: 132 | indices: (k, ) 133 | selection: (kx4) 134 | """ 135 | assert boxes.ndim == 2, boxes.shape 136 | assert len(shape) == 2, shape 137 | h, w = shape 138 | indices = np.where( 139 | (boxes[:, 0] >= 0) & 140 | (boxes[:, 1] >= 0) & 141 | (boxes[:, 2] <= w) & 142 | (boxes[:, 3] <= h))[0] 143 | return indices, boxes[indices, :] 144 | 145 | 146 | try: 147 | import pycocotools.mask as cocomask 148 | 149 | # Much faster than utils/np_box_ops 150 | def np_iou(A, B): 151 | def to_xywh(box): 152 | box = box.copy() 153 | box[:, 2] -= box[:, 0] 154 | box[:, 3] -= box[:, 1] 155 | return box 156 | 157 | ret = cocomask.iou( 158 | to_xywh(A), to_xywh(B), 159 | np.zeros((len(B),), dtype=np.bool)) 160 | # can accelerate even more, if using float32 161 | return ret.astype('float32') 162 | 163 | except ImportError: 164 | from utils.np_box_ops import iou as np_iou # noqa 165 | -------------------------------------------------------------------------------- /model_rpn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | from tensorpack.models import Conv2D, layer_register 6 | from tensorpack.tfutils.argscope import argscope 7 | from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope, under_name_scope 8 | from tensorpack.tfutils.summary import add_moving_summary 9 | 10 | from config import config as cfg 11 | from model_box import clip_boxes 12 | 13 | 14 | @layer_register(log_shape=True) 15 | @auto_reuse_variable_scope 16 | def rpn_head(featuremap, channel, num_anchors): 17 | """ 18 | Returns: 19 | label_logits: fHxfWxNA 20 | box_logits: fHxfWxNAx4 21 | """ 22 | with argscope(Conv2D, data_format='channels_first', 23 | kernel_initializer=tf.random_normal_initializer(stddev=0.01)): 24 | hidden = Conv2D('conv0', featuremap, channel, 3, activation=tf.nn.relu) 25 | 26 | label_logits = Conv2D('class', hidden, num_anchors, 1) 27 | box_logits = Conv2D('box', hidden, 4 * num_anchors, 1) 28 | # 1, NA(*4), im/16, im/16 (NCHW) 29 | 30 | label_logits = tf.transpose(label_logits, [0, 2, 3, 1]) # 1xfHxfWxNA 31 | label_logits = tf.squeeze(label_logits, 0) # fHxfWxNA 32 | 33 | shp = tf.shape(box_logits) # 1x(NAx4)xfHxfW 34 | box_logits = tf.transpose(box_logits, [0, 2, 3, 1]) # 1xfHxfWx(NAx4) 35 | box_logits = tf.reshape(box_logits, tf.stack([shp[2], shp[3], num_anchors, 4])) # fHxfWxNAx4 36 | return label_logits, box_logits 37 | 38 | 39 | @under_name_scope() 40 | def rpn_losses(anchor_labels, anchor_boxes, label_logits, box_logits): 41 | """ 42 | Args: 43 | anchor_labels: fHxfWxNA 44 | anchor_boxes: fHxfWxNAx4, encoded 45 | label_logits: fHxfWxNA 46 | box_logits: fHxfWxNAx4 47 | 48 | Returns: 49 | label_loss, box_loss 50 | """ 51 | with tf.device('/cpu:0'): 52 | valid_mask = 
tf.stop_gradient(tf.not_equal(anchor_labels, -1)) 53 | pos_mask = tf.stop_gradient(tf.equal(anchor_labels, 1)) 54 | nr_valid = tf.stop_gradient(tf.count_nonzero(valid_mask, dtype=tf.int32), name='num_valid_anchor') 55 | nr_pos = tf.identity(tf.count_nonzero(pos_mask, dtype=tf.int32), name='num_pos_anchor') 56 | # nr_pos is guaranteed >0 in C4. But in FPN. even nr_valid could be 0. 57 | 58 | valid_anchor_labels = tf.boolean_mask(anchor_labels, valid_mask) 59 | valid_label_logits = tf.boolean_mask(label_logits, valid_mask) 60 | 61 | with tf.name_scope('label_metrics'): 62 | valid_label_prob = tf.nn.sigmoid(valid_label_logits) 63 | summaries = [] 64 | with tf.device('/cpu:0'): 65 | for th in [0.5, 0.2, 0.1]: 66 | valid_prediction = tf.cast(valid_label_prob > th, tf.int32) 67 | nr_pos_prediction = tf.reduce_sum(valid_prediction, name='num_pos_prediction') 68 | pos_prediction_corr = tf.count_nonzero( 69 | tf.logical_and( 70 | valid_label_prob > th, 71 | tf.equal(valid_prediction, valid_anchor_labels)), 72 | dtype=tf.int32) 73 | placeholder = 0.5 # A small value will make summaries appear lower. 74 | recall = tf.cast(tf.truediv(pos_prediction_corr, nr_pos), tf.float32) 75 | recall = tf.where(tf.equal(nr_pos, 0), placeholder, recall, name='recall_th{}'.format(th)) 76 | precision = tf.cast(tf.truediv(pos_prediction_corr, nr_pos_prediction), tf.float32) 77 | precision = tf.where(tf.equal(nr_pos_prediction, 0), 78 | placeholder, precision, name='precision_th{}'.format(th)) 79 | summaries.extend([precision, recall]) 80 | add_moving_summary(*summaries) 81 | 82 | # Per-level loss summaries in FPN may appear lower due to the use of a small placeholder. 83 | # But the total RPN loss will be fine. TODO make the summary op smarter 84 | placeholder = 0. 85 | label_loss = tf.nn.sigmoid_cross_entropy_with_logits( 86 | labels=tf.cast(valid_anchor_labels, tf.float32), logits=valid_label_logits) 87 | label_loss = tf.reduce_sum(label_loss) * (1. / cfg.RPN.BATCH_PER_IM) 88 | label_loss = tf.where(tf.equal(nr_valid, 0), placeholder, label_loss, name='label_loss') 89 | 90 | pos_anchor_boxes = tf.boolean_mask(anchor_boxes, pos_mask) 91 | pos_box_logits = tf.boolean_mask(box_logits, pos_mask) 92 | delta = 1.0 / 9 93 | box_loss = tf.losses.huber_loss( 94 | pos_anchor_boxes, pos_box_logits, delta=delta, 95 | reduction=tf.losses.Reduction.SUM) / delta 96 | box_loss = box_loss * (1. / cfg.RPN.BATCH_PER_IM) 97 | box_loss = tf.where(tf.equal(nr_pos, 0), placeholder, box_loss, name='box_loss') 98 | 99 | add_moving_summary(label_loss, box_loss, nr_valid, nr_pos) 100 | return [label_loss, box_loss] 101 | 102 | 103 | @under_name_scope() 104 | def generate_rpn_proposals(boxes, scores, img_shape, 105 | pre_nms_topk, post_nms_topk=None): 106 | """ 107 | Sample RPN proposals by the following steps: 108 | 1. Pick top k1 by scores 109 | 2. NMS them 110 | 3. Pick top k2 by scores. Default k2 == k1, i.e. does not filter the NMS output. 111 | 112 | Args: 113 | boxes: nx4 float dtype, the proposal boxes. Decoded to floatbox already 114 | scores: n float, the logits 115 | img_shape: [h, w] 116 | pre_nms_topk, post_nms_topk (int): See above. 
117 | 118 | Returns: 119 | boxes: kx4 float 120 | scores: k logits 121 | """ 122 | assert boxes.shape.ndims == 2, boxes.shape 123 | if post_nms_topk is None: 124 | post_nms_topk = pre_nms_topk 125 | 126 | topk = tf.minimum(pre_nms_topk, tf.size(scores)) 127 | topk_scores, topk_indices = tf.nn.top_k(scores, k=topk, sorted=False) 128 | topk_boxes = tf.gather(boxes, topk_indices) 129 | topk_boxes = clip_boxes(topk_boxes, img_shape) 130 | 131 | topk_boxes_x1y1x2y2 = tf.reshape(topk_boxes, (-1, 2, 2)) 132 | topk_boxes_x1y1, topk_boxes_x2y2 = tf.split(topk_boxes_x1y1x2y2, 2, axis=1) 133 | # nx1x2 each 134 | wbhb = tf.squeeze(topk_boxes_x2y2 - topk_boxes_x1y1, axis=1) 135 | valid = tf.reduce_all(wbhb > cfg.RPN.MIN_SIZE, axis=1) # n, 136 | topk_valid_boxes_x1y1x2y2 = tf.boolean_mask(topk_boxes_x1y1x2y2, valid) 137 | topk_valid_scores = tf.boolean_mask(topk_scores, valid) 138 | 139 | # TODO not needed 140 | topk_valid_boxes_y1x1y2x2 = tf.reshape( 141 | tf.reverse(topk_valid_boxes_x1y1x2y2, axis=[2]), 142 | (-1, 4), name='nms_input_boxes') 143 | nms_indices = tf.image.non_max_suppression( 144 | topk_valid_boxes_y1x1y2x2, 145 | topk_valid_scores, 146 | max_output_size=post_nms_topk, 147 | iou_threshold=cfg.RPN.PROPOSAL_NMS_THRESH) 148 | 149 | topk_valid_boxes = tf.reshape(topk_valid_boxes_x1y1x2y2, (-1, 4)) 150 | proposal_boxes = tf.gather(topk_valid_boxes, nms_indices) 151 | proposal_scores = tf.gather(topk_valid_scores, nms_indices) 152 | tf.sigmoid(proposal_scores, name='probs') # for visualization 153 | return tf.stop_gradient(proposal_boxes, name='boxes'), tf.stop_gradient(proposal_scores, name='scores') 154 | -------------------------------------------------------------------------------- /model_box.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: model_box.py 3 | 4 | import numpy as np 5 | from collections import namedtuple 6 | import tensorflow as tf 7 | 8 | from tensorpack.tfutils.scope_utils import under_name_scope 9 | 10 | from config import config 11 | 12 | 13 | @under_name_scope() 14 | def clip_boxes(boxes, window, name=None): 15 | """ 16 | Args: 17 | boxes: nx4, xyxy 18 | window: [h, w] 19 | """ 20 | boxes = tf.maximum(boxes, 0.0) 21 | m = tf.tile(tf.reverse(window, [0]), [2]) # (4,) 22 | boxes = tf.minimum(boxes, tf.cast(m, tf.float32), name=name) 23 | return boxes 24 | 25 | 26 | @under_name_scope() 27 | def decode_bbox_target(box_predictions, anchors): 28 | """ 29 | Args: 30 | box_predictions: (..., 4), logits 31 | anchors: (..., 4), floatbox. Must have the same shape 32 | 33 | Returns: 34 | box_decoded: (..., 4), float32. With the same shape. 35 | """ 36 | orig_shape = tf.shape(anchors) 37 | box_pred_txtytwth = tf.reshape(box_predictions, (-1, 2, 2)) 38 | box_pred_txty, box_pred_twth = tf.split(box_pred_txtytwth, 2, axis=1) 39 | # each is (...)x1x2 40 | anchors_x1y1x2y2 = tf.reshape(anchors, (-1, 2, 2)) 41 | anchors_x1y1, anchors_x2y2 = tf.split(anchors_x1y1x2y2, 2, axis=1) 42 | 43 | waha = anchors_x2y2 - anchors_x1y1 44 | xaya = (anchors_x2y2 + anchors_x1y1) * 0.5 45 | 46 | clip = np.log(config.PREPROC.MAX_SIZE / 16.) 
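# The predicted log width/height is clamped to log(MAX_SIZE / 16) before exponentiating,
# so a spuriously large regression output cannot decode into an absurdly large box.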
47 | wbhb = tf.exp(tf.minimum(box_pred_twth, clip)) * waha 48 | xbyb = box_pred_txty * waha + xaya 49 | x1y1 = xbyb - wbhb * 0.5 50 | x2y2 = xbyb + wbhb * 0.5 # (...)x1x2 51 | out = tf.concat([x1y1, x2y2], axis=-2) 52 | return tf.reshape(out, orig_shape) 53 | 54 | 55 | @under_name_scope() 56 | def encode_bbox_target(boxes, anchors): 57 | """ 58 | Args: 59 | boxes: (..., 4), float32 60 | anchors: (..., 4), float32 61 | 62 | Returns: 63 | box_encoded: (..., 4), float32 with the same shape. 64 | """ 65 | anchors_x1y1x2y2 = tf.reshape(anchors, (-1, 2, 2)) 66 | anchors_x1y1, anchors_x2y2 = tf.split(anchors_x1y1x2y2, 2, axis=1) 67 | waha = anchors_x2y2 - anchors_x1y1 68 | xaya = (anchors_x2y2 + anchors_x1y1) * 0.5 69 | 70 | boxes_x1y1x2y2 = tf.reshape(boxes, (-1, 2, 2)) 71 | boxes_x1y1, boxes_x2y2 = tf.split(boxes_x1y1x2y2, 2, axis=1) 72 | wbhb = boxes_x2y2 - boxes_x1y1 73 | xbyb = (boxes_x2y2 + boxes_x1y1) * 0.5 74 | 75 | # Note that here not all boxes are valid. Some may be zero 76 | txty = (xbyb - xaya) / waha 77 | twth = tf.log(wbhb / waha) # may contain -inf for invalid boxes 78 | encoded = tf.concat([txty, twth], axis=1) # (-1x2x2) 79 | return tf.reshape(encoded, tf.shape(boxes)) 80 | 81 | 82 | @under_name_scope() 83 | def crop_and_resize(image, boxes, box_ind, crop_size, pad_border=True): 84 | """ 85 | Aligned version of tf.image.crop_and_resize, following our definition of floating point boxes. 86 | 87 | Args: 88 | image: NCHW 89 | boxes: nx4, x1y1x2y2 90 | box_ind: (n,) 91 | crop_size (int): 92 | Returns: 93 | n,C,size,size 94 | """ 95 | assert isinstance(crop_size, int), crop_size 96 | boxes = tf.stop_gradient(boxes) 97 | 98 | # TF's crop_and_resize produces zeros on border 99 | if pad_border: 100 | # this can be quite slow 101 | image = tf.pad(image, [[0, 0], [0, 0], [1, 1], [1, 1]], mode='SYMMETRIC') 102 | boxes = boxes + 1 103 | 104 | @under_name_scope() 105 | def transform_fpcoor_for_tf(boxes, image_shape, crop_shape): 106 | """ 107 | The way tf.image.crop_and_resize works (with normalized box): 108 | Initial point (the value of output[0]): x0_box * (W_img - 1) 109 | Spacing: w_box * (W_img - 1) / (W_crop - 1) 110 | Use the above grid to bilinear sample. 111 | 112 | However, what we want is (with fpcoor box): 113 | Spacing: w_box / W_crop 114 | Initial point: x0_box + spacing/2 - 0.5 115 | (-0.5 because bilinear sample (in my definition) assumes floating point coordinate 116 | (0.0, 0.0) is the same as pixel value (0, 0)) 117 | 118 | This function transform fpcoor boxes to a format to be used by tf.image.crop_and_resize 119 | 120 | Returns: 121 | y1x1y2x2 122 | """ 123 | x0, y0, x1, y1 = tf.split(boxes, 4, axis=1) 124 | 125 | spacing_w = (x1 - x0) / tf.cast(crop_shape[1], tf.float32) 126 | spacing_h = (y1 - y0) / tf.cast(crop_shape[0], tf.float32) 127 | 128 | imshape = [tf.cast(image_shape[0] - 1, tf.float32), tf.cast(image_shape[1] - 1, tf.float32)] 129 | nx0 = (x0 + spacing_w / 2 - 0.5) / imshape[1] 130 | ny0 = (y0 + spacing_h / 2 - 0.5) / imshape[0] 131 | 132 | nw = spacing_w * tf.cast(crop_shape[1] - 1, tf.float32) / imshape[1] 133 | nh = spacing_h * tf.cast(crop_shape[0] - 1, tf.float32) / imshape[0] 134 | 135 | return tf.concat([ny0, nx0, ny0 + nh, nx0 + nw], axis=1) 136 | 137 | # Expand bbox to a minium size of 1 138 | # boxes_x1y1, boxes_x2y2 = tf.split(boxes, 2, axis=1) 139 | # boxes_wh = boxes_x2y2 - boxes_x1y1 140 | # boxes_center = tf.reshape((boxes_x2y2 + boxes_x1y1) * 0.5, [-1, 2]) 141 | # boxes_newwh = tf.maximum(boxes_wh, 1.) 
142 | # boxes_x1y1new = boxes_center - boxes_newwh * 0.5 143 | # boxes_x2y2new = boxes_center + boxes_newwh * 0.5 144 | # boxes = tf.concat([boxes_x1y1new, boxes_x2y2new], axis=1) 145 | 146 | image_shape = tf.shape(image)[2:] 147 | boxes = transform_fpcoor_for_tf(boxes, image_shape, [crop_size, crop_size]) 148 | image = tf.transpose(image, [0, 2, 3, 1]) # nhwc 149 | ret = tf.image.crop_and_resize( 150 | image, boxes, tf.cast(box_ind, tf.int32), 151 | crop_size=[crop_size, crop_size]) 152 | ret = tf.transpose(ret, [0, 3, 1, 2]) # ncss 153 | return ret 154 | 155 | 156 | @under_name_scope() 157 | def roi_align(featuremap, boxes, resolution): 158 | """ 159 | Args: 160 | featuremap: 1xCxHxW 161 | boxes: Nx4 floatbox 162 | resolution: output spatial resolution 163 | 164 | Returns: 165 | NxCx res x res 166 | """ 167 | # sample 4 locations per roi bin 168 | ret = crop_and_resize( 169 | featuremap, boxes, 170 | tf.zeros([tf.shape(boxes)[0]], dtype=tf.int32), 171 | resolution * 2) 172 | ret = tf.nn.avg_pool(ret, [1, 1, 2, 2], [1, 1, 2, 2], padding='SAME', data_format='NCHW') 173 | return ret 174 | 175 | 176 | class RPNAnchors(namedtuple('_RPNAnchors', ['boxes', 'gt_labels', 'gt_boxes'])): 177 | """ 178 | boxes (FS x FS x NA x 4): The anchor boxes. 179 | gt_labels (FS x FS x NA): 180 | gt_boxes (FS x FS x NA x 4): Groundtruth boxes corresponding to each anchor. 181 | """ 182 | def encoded_gt_boxes(self): 183 | return encode_bbox_target(self.gt_boxes, self.boxes) 184 | 185 | def decode_logits(self, logits): 186 | return decode_bbox_target(logits, self.boxes) 187 | 188 | @under_name_scope() 189 | def narrow_to(self, featuremap): 190 | """ 191 | Slice anchors to the spatial size of this featuremap. 192 | """ 193 | shape2d = tf.shape(featuremap)[2:] # h,w 194 | slice3d = tf.concat([shape2d, [-1]], axis=0) 195 | slice4d = tf.concat([shape2d, [-1, -1]], axis=0) 196 | boxes = tf.slice(self.boxes, [0, 0, 0, 0], slice4d) 197 | gt_labels = tf.slice(self.gt_labels, [0, 0, 0], slice3d) 198 | gt_boxes = tf.slice(self.gt_boxes, [0, 0, 0, 0], slice4d) 199 | return RPNAnchors(boxes, gt_labels, gt_boxes) 200 | 201 | 202 | if __name__ == '__main__': 203 | """ 204 | Demonstrate what's wrong with tf.image.crop_and_resize: 205 | """ 206 | import tensorflow.contrib.eager as tfe 207 | tfe.enable_eager_execution() 208 | 209 | # want to crop 2x2 out of a 5x5 image, and resize to 4x4 210 | image = np.arange(25).astype('float32').reshape(5, 5) 211 | boxes = np.asarray([[1, 1, 3, 3]], dtype='float32') 212 | target = 4 213 | 214 | print(crop_and_resize( 215 | image[None, None, :, :], boxes, [0], target)[0][0]) 216 | """ 217 | Expected values: 218 | 4.5 5 5.5 6 219 | 7 7.5 8 8.5 220 | 9.5 10 10.5 11 221 | 12 12.5 13 13.5 222 | 223 | You cannot easily get the above results with tf.image.crop_and_resize. 
224 | Try out yourself here: 225 | """ 226 | print(tf.image.crop_and_resize( 227 | image[None, :, :, None], 228 | np.asarray([[1, 1, 2, 2]]) / 4.0, [0], [target, target])[0][:, :, 0]) 229 | -------------------------------------------------------------------------------- /tracking/argmax_tracker.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import random 3 | import numpy as np 4 | from got10k.trackers import Tracker 5 | from config import config as cfg, finalize_configs 6 | from tensorpack import PredictConfig, get_model_loader, OfflinePredictor, logger 7 | 8 | from train import ResNetFPNModel 9 | from common import CustomResize, box_to_point8, point8_to_box 10 | 11 | 12 | class PrecomputingReferenceTracker(Tracker): 13 | def __init__(self, name, need_network=True, need_img=True, model="best"): 14 | super().__init__(name=name, is_deterministic=True) 15 | self._resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE) 16 | self._prev_box = None 17 | self._ff_gt_feats = None 18 | self._need_network = need_network 19 | self._need_img = need_img 20 | self._rotated_bbox = None 21 | 22 | if need_network: 23 | logger.set_logger_dir("/tmp/test_log_/" + str(random.randint(0, 10000)), 'd') 24 | if model == "best": 25 | load = "train_log/hard_mining3/model-1360500" 26 | elif model == "nohardexamples": 27 | load = "train_log/condrcnn_all_2gpu_lrreduce2/model-1200500" 28 | elif model == "newrpn": 29 | load = "train_log/newrpn1/model" 30 | elif model =="resnet50_nohardexamples": 31 | load = "train_log/condrcnn_all_resnet50/model-1200500" 32 | cfg.BACKBONE.RESNET_NUM_BLOCKS = [3, 4, 6, 3] 33 | elif model =="resnet50": 34 | load = "train_log/hard_mining3_resnet50/model-1360500" 35 | cfg.BACKBONE.RESNET_NUM_BLOCKS = [3, 4, 6, 3] 36 | elif model == "gotonly": 37 | load = "train_log/hard_mining3_onlygot/model-1361000" 38 | elif model.startswith("checkpoint:"): 39 | load = model.replace("checkpoint:", "") 40 | else: 41 | assert False, ("unknown model", model) 42 | from dataset import DetectionDataset 43 | # init tensorpack model 44 | # cfg.freeze(False) 45 | DetectionDataset() # initialize the config with information from our dataset 46 | 47 | cfg.EXTRACT_GT_FEATURES = True 48 | cfg.MODE_TRACK = False 49 | extract_model = ResNetFPNModel() 50 | extract_ff_feats_cfg = PredictConfig( 51 | model=extract_model, 52 | session_init=get_model_loader(load), 53 | input_names=['image', 'roi_boxes'], 54 | output_names=['rpn/feature']) 55 | finalize_configs(is_training=False) 56 | self._extract_func = OfflinePredictor(extract_ff_feats_cfg) 57 | 58 | cfg.EXTRACT_GT_FEATURES = False 59 | cfg.MODE_TRACK = True 60 | cfg.USE_PRECOMPUTED_REF_FEATURES = True 61 | self._pred_func = self._make_pred_func(load) 62 | 63 | def _resize_image_together_with_boxes(self, img, *list_of_box_or_boxes): 64 | resized_img, params = self._resizer.augment_return_params(img) 65 | res_boxes = [] 66 | for box_or_boxes in list_of_box_or_boxes: 67 | expand = len(box_or_boxes.shape) == 1 68 | if expand: 69 | boxes = box_or_boxes[np.newaxis] 70 | else: 71 | boxes = box_or_boxes 72 | points = box_to_point8(boxes) 73 | points = self._resizer.augment_coords(points, params) 74 | resized_boxes = point8_to_box(points) 75 | if expand: 76 | resized_boxes = np.squeeze(resized_boxes, axis=0) 77 | res_boxes.append(resized_boxes) 78 | if len(res_boxes) == 1: 79 | res_boxes = res_boxes[0] 80 | return resized_img, res_boxes 81 | 82 | def _make_pred_func(self, load): 83 | from train import 
ResNetFPNTrackModel 84 | pred_model = ResNetFPNTrackModel() 85 | predcfg = PredictConfig( 86 | model=pred_model, 87 | session_init=get_model_loader(load), 88 | input_names=pred_model.get_inference_tensor_names()[0], 89 | output_names=pred_model.get_inference_tensor_names()[1]) 90 | return OfflinePredictor(predcfg) 91 | 92 | def init(self, image, box): 93 | ref_img = np.array(image)[..., ::-1] 94 | if ref_img is None: 95 | raise ValueError("failed to load img" + image.filename) 96 | box[2] += box[0] 97 | box[3] += box[1] 98 | ref_bbox = box 99 | self._prev_box = box 100 | if self._need_network: 101 | resized_ref_img, resized_ref_box = self._resize_image_together_with_boxes(ref_img, ref_bbox) 102 | feats, = self._extract_func(resized_ref_img, resized_ref_box[np.newaxis]) 103 | self._ff_gt_feats = feats[0] 104 | 105 | def update(self, image, use_confidences=False): 106 | if self._need_img: 107 | target_img = np.array(image)[..., ::-1] 108 | if target_img is None: 109 | raise ValueError("failed to load img" + str(target_img)) 110 | else: 111 | target_img = None 112 | 113 | new_box, score = self._update(target_img) 114 | if new_box is not None: 115 | self._prev_box = new_box 116 | 117 | ret_box = self._prev_box.copy() 118 | ret_box[2] -= ret_box[0] 119 | ret_box[3] -= ret_box[1] 120 | if self._rotated_bbox is not None: 121 | ret_box = self._rotated_bbox 122 | if use_confidences: 123 | return ret_box, score 124 | else: 125 | return ret_box 126 | 127 | 128 | class ArgmaxTracker(PrecomputingReferenceTracker): 129 | def __init__(self): 130 | super().__init__("ArgmaxTracker") 131 | 132 | def _update(self, img): 133 | from eval import predict_image_track_with_precomputed_ref_features 134 | results = predict_image_track_with_precomputed_ref_features(img, self._ff_gt_feats, self._pred_func) 135 | det_boxes = np.array([r.box for r in results]) 136 | det_scores = np.array([r.score for r in results]) 137 | if len(det_boxes) > 0: 138 | return det_boxes[0], det_scores[0] 139 | else: 140 | return None, None 141 | 142 | 143 | # just there to test the precomputing on against 144 | # not intended to be used anymore 145 | class NonPrecomputingArgmaxTracker(Tracker): 146 | def __init__(self): 147 | super().__init__(name='ArgmaxTracker', is_deterministic=True) 148 | self._ref_img = None 149 | self._ref_bbox = None 150 | self._prev_box = None 151 | model = self._init_model() 152 | load = "train_log/condrcnn_onlygot/model-460000" 153 | predcfg = PredictConfig( 154 | model=model, 155 | session_init=get_model_loader(load), 156 | input_names=model.get_inference_tensor_names()[0], 157 | output_names=model.get_inference_tensor_names()[1]) 158 | self._pred_func = OfflinePredictor(predcfg) 159 | 160 | def _init_model(self): 161 | logger.set_logger_dir("/tmp/test_log/", 'd') 162 | from dataset import DetectionDataset 163 | from train import ResNetFPNTrackModel 164 | # init tensorpack model 165 | cfg.freeze(False) 166 | model = ResNetFPNTrackModel() 167 | DetectionDataset() # initialize the config with information from our dataset 168 | finalize_configs(is_training=False) 169 | return model 170 | 171 | def init(self, image, box): 172 | self._ref_img = cv2.imread(image.filename, cv2.IMREAD_COLOR) 173 | if self._ref_img is None: 174 | raise ValueError("failed to load img" + str(self._ref_img)) 175 | box[2] += box[0] 176 | box[3] += box[1] 177 | self._ref_bbox = box 178 | self._prev_box = box 179 | 180 | def update(self, image): 181 | target_img = cv2.imread(image.filename, cv2.IMREAD_COLOR) 182 | # assert target_img is not 
None 183 | if target_img is None: 184 | raise ValueError("failed to load img" + str(target_img)) 185 | from eval import predict_image_track 186 | results = predict_image_track(target_img, self._ref_img, self._ref_bbox, self._pred_func) 187 | det_boxes = np.array([r.box for r in results]) 188 | det_scores = np.array([r.score for r in results]) 189 | if len(det_boxes) > 0: 190 | self._prev_box = det_boxes[0] 191 | 192 | ret_box = self._prev_box.copy() 193 | ret_box[2] -= ret_box[0] 194 | ret_box[3] -= ret_box[1] 195 | return ret_box 196 | -------------------------------------------------------------------------------- /basemodel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: basemodel.py 3 | 4 | import numpy as np 5 | from contextlib import ExitStack, contextmanager 6 | import tensorflow as tf 7 | 8 | from tensorpack.models import BatchNorm, Conv2D, MaxPooling, layer_register 9 | from tensorpack.tfutils import argscope 10 | from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope 11 | from tensorpack.tfutils.varreplace import custom_getter_scope, freeze_variables 12 | 13 | from config import config as cfg 14 | 15 | 16 | @layer_register(log_shape=True) 17 | def GroupNorm(x, group=32, gamma_initializer=tf.constant_initializer(1.)): 18 | shape = x.get_shape().as_list() 19 | ndims = len(shape) 20 | assert ndims == 4, shape 21 | chan = shape[1] 22 | assert chan % group == 0, chan 23 | group_size = chan // group 24 | 25 | orig_shape = tf.shape(x) 26 | h, w = orig_shape[2], orig_shape[3] 27 | 28 | x = tf.reshape(x, tf.stack([-1, group, group_size, h, w])) 29 | 30 | mean, var = tf.nn.moments(x, [2, 3, 4], keep_dims=True) 31 | 32 | new_shape = [1, group, group_size, 1, 1] 33 | 34 | beta = tf.get_variable('beta', [chan], initializer=tf.constant_initializer()) 35 | beta = tf.reshape(beta, new_shape) 36 | 37 | gamma = tf.get_variable('gamma', [chan], initializer=gamma_initializer) 38 | gamma = tf.reshape(gamma, new_shape) 39 | 40 | out = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-5, name='output') 41 | return tf.reshape(out, orig_shape, name='output') 42 | 43 | 44 | def freeze_affine_getter(getter, *args, **kwargs): 45 | # custom getter to freeze affine params inside bn 46 | name = args[0] if len(args) else kwargs.get('name') 47 | if name.endswith('/gamma') or name.endswith('/beta'): 48 | kwargs['trainable'] = False 49 | ret = getter(*args, **kwargs) 50 | tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, ret) 51 | else: 52 | ret = getter(*args, **kwargs) 53 | return ret 54 | 55 | 56 | def maybe_reverse_pad(topleft, bottomright): 57 | if cfg.BACKBONE.TF_PAD_MODE: 58 | return [topleft, bottomright] 59 | return [bottomright, topleft] 60 | 61 | 62 | @contextmanager 63 | def backbone_scope(freeze): 64 | """ 65 | Args: 66 | freeze (bool): whether to freeze all the variables under the scope 67 | """ 68 | def nonlin(x): 69 | x = get_norm()(x) 70 | return tf.nn.relu(x) 71 | 72 | with argscope([Conv2D, MaxPooling, BatchNorm], data_format='channels_first'), \ 73 | argscope(Conv2D, use_bias=False, activation=nonlin, 74 | kernel_initializer=tf.variance_scaling_initializer( 75 | scale=2.0, mode='fan_out')), \ 76 | ExitStack() as stack: 77 | if cfg.BACKBONE.NORM in ['FreezeBN', 'SyncBN']: 78 | if freeze or cfg.BACKBONE.NORM == 'FreezeBN': 79 | stack.enter_context(argscope(BatchNorm, training=False)) 80 | else: 81 | stack.enter_context(argscope( 82 | BatchNorm, sync_statistics='nccl' if cfg.TRAINER == 'replicated' 
else 'horovod')) 83 | 84 | if freeze: 85 | stack.enter_context(freeze_variables(stop_gradient=False, skip_collection=True)) 86 | else: 87 | # the layers are not completely freezed, but we may want to only freeze the affine 88 | if cfg.BACKBONE.FREEZE_AFFINE: 89 | stack.enter_context(custom_getter_scope(freeze_affine_getter)) 90 | yield 91 | 92 | 93 | def image_preprocess(image, bgr=True): 94 | with tf.name_scope('image_preprocess'): 95 | if image.dtype.base_dtype != tf.float32: 96 | image = tf.cast(image, tf.float32) 97 | 98 | mean = cfg.PREPROC.PIXEL_MEAN 99 | std = np.asarray(cfg.PREPROC.PIXEL_STD) 100 | if bgr: 101 | mean = mean[::-1] 102 | std = std[::-1] 103 | image_mean = tf.constant(mean, dtype=tf.float32) 104 | image_invstd = tf.constant(1.0 / std, dtype=tf.float32) 105 | image = (image - image_mean) * image_invstd 106 | return image 107 | 108 | 109 | def get_norm(zero_init=False): 110 | if cfg.BACKBONE.NORM == 'None': 111 | return lambda x: x 112 | if cfg.BACKBONE.NORM == 'GN': 113 | Norm = GroupNorm 114 | layer_name = 'gn' 115 | else: 116 | Norm = BatchNorm 117 | layer_name = 'bn' 118 | return lambda x: Norm(layer_name, x, gamma_initializer=tf.zeros_initializer() if zero_init else None) 119 | 120 | 121 | def resnet_shortcut(l, n_out, stride, activation=tf.identity): 122 | n_in = l.shape[1] 123 | if n_in != n_out: # change dimension when channel is not the same 124 | # TF's SAME mode output ceil(x/stride), which is NOT what we want when x is odd and stride is 2 125 | # In FPN mode, the images are pre-padded already. 126 | if not cfg.MODE_FPN and stride == 2: 127 | l = l[:, :, :-1, :-1] 128 | return Conv2D('convshortcut', l, n_out, 1, 129 | strides=stride, activation=activation) 130 | else: 131 | return l 132 | 133 | 134 | def resnet_bottleneck(l, ch_out, stride): 135 | shortcut = l 136 | if cfg.BACKBONE.STRIDE_1X1: 137 | if stride == 2: 138 | l = l[:, :, :-1, :-1] 139 | l = Conv2D('conv1', l, ch_out, 1, strides=stride) 140 | l = Conv2D('conv2', l, ch_out, 3, strides=1) 141 | else: 142 | l = Conv2D('conv1', l, ch_out, 1, strides=1) 143 | if stride == 2: 144 | l = tf.pad(l, [[0, 0], [0, 0], maybe_reverse_pad(0, 1), maybe_reverse_pad(0, 1)]) 145 | l = Conv2D('conv2', l, ch_out, 3, strides=2, padding='VALID') 146 | else: 147 | l = Conv2D('conv2', l, ch_out, 3, strides=stride) 148 | if cfg.BACKBONE.NORM != 'None': 149 | l = Conv2D('conv3', l, ch_out * 4, 1, activation=get_norm(zero_init=True)) 150 | else: 151 | l = Conv2D('conv3', l, ch_out * 4, 1, activation=tf.identity, 152 | kernel_initializer=tf.constant_initializer()) 153 | ret = l + resnet_shortcut(shortcut, ch_out * 4, stride, activation=get_norm(zero_init=False)) 154 | return tf.nn.relu(ret, name='output') 155 | 156 | 157 | def resnet_group(name, l, block_func, features, count, stride): 158 | with tf.variable_scope(name): 159 | for i in range(0, count): 160 | with tf.variable_scope('block{}'.format(i)): 161 | l = block_func(l, features, stride if i == 0 else 1) 162 | return l 163 | 164 | 165 | def resnet_c4_backbone(image, num_blocks): 166 | assert len(num_blocks) == 3 167 | freeze_at = cfg.BACKBONE.FREEZE_AT 168 | with backbone_scope(freeze=freeze_at > 0): 169 | l = tf.pad(image, [[0, 0], [0, 0], maybe_reverse_pad(2, 3), maybe_reverse_pad(2, 3)]) 170 | l = Conv2D('conv0', l, 64, 7, strides=2, padding='VALID') 171 | l = tf.pad(l, [[0, 0], [0, 0], maybe_reverse_pad(0, 1), maybe_reverse_pad(0, 1)]) 172 | l = MaxPooling('pool0', l, 3, strides=2, padding='VALID') 173 | 174 | with backbone_scope(freeze=freeze_at > 1): 175 | c2 = 
resnet_group('group0', l, resnet_bottleneck, 64, num_blocks[0], 1) 176 | with backbone_scope(freeze=False): 177 | c3 = resnet_group('group1', c2, resnet_bottleneck, 128, num_blocks[1], 2) 178 | c4 = resnet_group('group2', c3, resnet_bottleneck, 256, num_blocks[2], 2) 179 | # 16x downsampling up to now 180 | return c4 181 | 182 | 183 | @auto_reuse_variable_scope 184 | def resnet_conv5(image, num_block): 185 | with backbone_scope(freeze=False): 186 | l = resnet_group('group3', image, resnet_bottleneck, 512, num_block, 2) 187 | return l 188 | 189 | 190 | def resnet_fpn_backbone(image, num_blocks): 191 | freeze_at = cfg.BACKBONE.FREEZE_AT 192 | shape2d = tf.shape(image)[2:] 193 | mult = float(cfg.FPN.RESOLUTION_REQUIREMENT) 194 | new_shape2d = tf.cast(tf.ceil(tf.cast(shape2d, tf.float32) / mult) * mult, tf.int32) 195 | pad_shape2d = new_shape2d - shape2d 196 | assert len(num_blocks) == 4, num_blocks 197 | with backbone_scope(freeze=freeze_at > 0): 198 | chan = image.shape[1] 199 | pad_base = maybe_reverse_pad(2, 3) 200 | l = tf.pad(image, tf.stack( 201 | [[0, 0], [0, 0], 202 | [pad_base[0], pad_base[1] + pad_shape2d[0]], 203 | [pad_base[0], pad_base[1] + pad_shape2d[1]]])) 204 | l.set_shape([None, chan, None, None]) 205 | l = Conv2D('conv0', l, 64, 7, strides=2, padding='VALID') 206 | l = tf.pad(l, [[0, 0], [0, 0], maybe_reverse_pad(0, 1), maybe_reverse_pad(0, 1)]) 207 | l = MaxPooling('pool0', l, 3, strides=2, padding='VALID') 208 | with backbone_scope(freeze=freeze_at > 1): 209 | c2 = resnet_group('group0', l, resnet_bottleneck, 64, num_blocks[0], 1) 210 | with backbone_scope(freeze=freeze_at > 2): 211 | c3 = resnet_group('group1', c2, resnet_bottleneck, 128, num_blocks[1], 2) 212 | c4 = resnet_group('group2', c3, resnet_bottleneck, 256, num_blocks[2], 2) 213 | c5 = resnet_group('group3', c4, resnet_bottleneck, 512, num_blocks[3], 2) 214 | # 32x downsampling up to now 215 | # size of c5: ceil(input/32) 216 | return c2, c3, c4, c5 217 | -------------------------------------------------------------------------------- /model_fpn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from tensorpack.models import Conv2D, FixedUnPooling, MaxPooling, layer_register 8 | from tensorpack.tfutils.argscope import argscope 9 | from tensorpack.tfutils.scope_utils import under_name_scope 10 | from tensorpack.tfutils.summary import add_moving_summary 11 | from tensorpack.tfutils.tower import get_current_tower_context 12 | 13 | from basemodel import GroupNorm 14 | from config import config as cfg 15 | from model_box import roi_align 16 | from model_rpn import generate_rpn_proposals, rpn_losses 17 | from utils.box_ops import area as tf_area 18 | 19 | 20 | @layer_register(log_shape=True) 21 | def fpn_model(features): 22 | """ 23 | Args: 24 | features ([tf.Tensor]): ResNet features c2-c5 25 | 26 | Returns: 27 | [tf.Tensor]: FPN features p2-p6 28 | """ 29 | assert len(features) == 4, features 30 | num_channel = cfg.FPN.NUM_CHANNEL 31 | 32 | use_gn = cfg.FPN.NORM == 'GN' 33 | 34 | def upsample2x(name, x): 35 | return FixedUnPooling( 36 | name, x, 2, unpool_mat=np.ones((2, 2), dtype='float32'), 37 | data_format='channels_first') 38 | 39 | # tf.image.resize is, again, not aligned. 
40 | # with tf.name_scope(name): 41 | # shape2d = tf.shape(x)[2:] 42 | # x = tf.transpose(x, [0, 2, 3, 1]) 43 | # x = tf.image.resize_nearest_neighbor(x, shape2d * 2, align_corners=True) 44 | # x = tf.transpose(x, [0, 3, 1, 2]) 45 | # return x 46 | 47 | with argscope(Conv2D, data_format='channels_first', 48 | activation=tf.identity, use_bias=True, 49 | kernel_initializer=tf.variance_scaling_initializer(scale=1.)): 50 | lat_2345 = [Conv2D('lateral_1x1_c{}'.format(i + 2), c, num_channel, 1) 51 | for i, c in enumerate(features)] 52 | if use_gn: 53 | lat_2345 = [GroupNorm('gn_c{}'.format(i + 2), c) for i, c in enumerate(lat_2345)] 54 | lat_sum_5432 = [] 55 | for idx, lat in enumerate(lat_2345[::-1]): 56 | if idx == 0: 57 | lat_sum_5432.append(lat) 58 | else: 59 | lat = lat + upsample2x('upsample_lat{}'.format(6 - idx), lat_sum_5432[-1]) 60 | lat_sum_5432.append(lat) 61 | p2345 = [Conv2D('posthoc_3x3_p{}'.format(i + 2), c, num_channel, 3) 62 | for i, c in enumerate(lat_sum_5432[::-1])] 63 | if use_gn: 64 | p2345 = [GroupNorm('gn_p{}'.format(i + 2), c) for i, c in enumerate(p2345)] 65 | p6 = MaxPooling('maxpool_p6', p2345[-1], pool_size=1, strides=2, data_format='channels_first', padding='VALID') 66 | return p2345 + [p6] 67 | 68 | 69 | @under_name_scope() 70 | def fpn_map_rois_to_levels(boxes): 71 | """ 72 | Assign boxes to level 2~5. 73 | 74 | Args: 75 | boxes (nx4): 76 | 77 | Returns: 78 | [tf.Tensor]: 4 tensors for level 2-5. Each tensor is a vector of indices of boxes in its level. 79 | [tf.Tensor]: 4 tensors, the gathered boxes in each level. 80 | 81 | Be careful that the returned tensor could be empty. 82 | """ 83 | sqrtarea = tf.sqrt(tf_area(boxes)) 84 | level = tf.cast(tf.floor( 85 | 4 + tf.log(sqrtarea * (1. / 224) + 1e-6) * (1.0 / np.log(2))), tf.int32) 86 | 87 | # RoI levels range from 2~5 (not 6) 88 | level_ids = [ 89 | tf.where(level <= 2), 90 | tf.where(tf.equal(level, 3)), # == is not supported 91 | tf.where(tf.equal(level, 4)), 92 | tf.where(level >= 5)] 93 | level_ids = [tf.reshape(x, [-1], name='roi_level{}_id'.format(i + 2)) 94 | for i, x in enumerate(level_ids)] 95 | num_in_levels = [tf.size(x, name='num_roi_level{}'.format(i + 2)) 96 | for i, x in enumerate(level_ids)] 97 | add_moving_summary(*num_in_levels) 98 | 99 | level_boxes = [tf.gather(boxes, ids) for ids in level_ids] 100 | return level_ids, level_boxes 101 | 102 | 103 | @under_name_scope() 104 | def multilevel_roi_align(features, rcnn_boxes, resolution): 105 | """ 106 | Args: 107 | features ([tf.Tensor]): 4 FPN feature level 2-5 108 | rcnn_boxes (tf.Tensor): nx4 boxes 109 | resolution (int): output spatial resolution 110 | Returns: 111 | NxC x res x res 112 | """ 113 | assert len(features) == 4, features 114 | # Reassign rcnn_boxes to levels 115 | level_ids, level_boxes = fpn_map_rois_to_levels(rcnn_boxes) 116 | all_rois = [] 117 | 118 | # Crop patches from corresponding levels 119 | for i, boxes, featuremap in zip(itertools.count(), level_boxes, features): 120 | with tf.name_scope('roi_level{}'.format(i + 2)): 121 | boxes_on_featuremap = boxes * (1.0 / cfg.FPN.ANCHOR_STRIDES[i]) 122 | all_rois.append(roi_align(featuremap, boxes_on_featuremap, resolution)) 123 | 124 | # this can fail if using TF<=1.8 with MKL build 125 | all_rois = tf.concat(all_rois, axis=0) # NCHW 126 | # Unshuffle to the original order, to match the original samples 127 | level_id_perm = tf.concat(level_ids, axis=0) # A permutation of 1~N 128 | level_id_invert_perm = tf.invert_permutation(level_id_perm) 129 | all_rois = tf.gather(all_rois, 
level_id_invert_perm) 130 | return all_rois 131 | 132 | 133 | @under_name_scope() 134 | def neck_roi_align(features, rcnn_boxes, resolution): 135 | """ 136 | Args: 137 | features ([tf.Tensor]): 4 FPN feature level 2-5 138 | rcnn_boxes (tf.Tensor): nx4 boxes 139 | resolution (int): output spatial resolution 140 | Returns: 141 | NxC x res x res 142 | """ 143 | assert len(features) == 4, features 144 | aligned_features = None 145 | for i in range(4): 146 | with tf.name_scope('roi_level{}'.format(i + 2)): 147 | boxes_on_featuremap = rcnn_boxes * (1.0 / cfg.FPN.ANCHOR_STRIDES[i]) 148 | level_features = roi_align(features[i], boxes_on_featuremap, resolution) 149 | if aligned_features is None: 150 | aligned_features = level_features 151 | else: 152 | aligned_features += level_features 153 | return aligned_features 154 | 155 | 156 | def multilevel_rpn_losses( 157 | multilevel_anchors, multilevel_label_logits, multilevel_box_logits): 158 | """ 159 | Args: 160 | multilevel_anchors: #lvl RPNAnchors 161 | multilevel_label_logits: #lvl tensors of shape HxWxA 162 | multilevel_box_logits: #lvl tensors of shape HxWxAx4 163 | 164 | Returns: 165 | label_loss, box_loss 166 | """ 167 | num_lvl = len(cfg.FPN.ANCHOR_STRIDES) 168 | assert len(multilevel_anchors) == num_lvl 169 | assert len(multilevel_label_logits) == num_lvl 170 | assert len(multilevel_box_logits) == num_lvl 171 | 172 | losses = [] 173 | with tf.name_scope('rpn_losses'): 174 | for lvl in range(num_lvl): 175 | anchors = multilevel_anchors[lvl] 176 | label_loss, box_loss = rpn_losses( 177 | anchors.gt_labels, anchors.encoded_gt_boxes(), 178 | multilevel_label_logits[lvl], multilevel_box_logits[lvl], 179 | name_scope='level{}'.format(lvl + 2)) 180 | losses.extend([label_loss, box_loss]) 181 | 182 | total_label_loss = tf.add_n(losses[::2], name='label_loss') 183 | total_box_loss = tf.add_n(losses[1::2], name='box_loss') 184 | add_moving_summary(total_label_loss, total_box_loss) 185 | return [total_label_loss, total_box_loss] 186 | 187 | 188 | @under_name_scope() 189 | def generate_fpn_proposals( 190 | multilevel_pred_boxes, multilevel_label_logits, image_shape2d): 191 | """ 192 | Args: 193 | multilevel_pred_boxes: #lvl HxWxAx4 boxes 194 | multilevel_label_logits: #lvl tensors of shape HxWxA 195 | 196 | Returns: 197 | boxes: kx4 float 198 | scores: k logits 199 | """ 200 | num_lvl = len(cfg.FPN.ANCHOR_STRIDES) 201 | assert len(multilevel_pred_boxes) == num_lvl 202 | assert len(multilevel_label_logits) == num_lvl 203 | 204 | training = get_current_tower_context().is_training 205 | all_boxes = [] 206 | all_scores = [] 207 | if cfg.FPN.PROPOSAL_MODE == 'Level': 208 | fpn_nms_topk = cfg.RPN.TRAIN_PER_LEVEL_NMS_TOPK if training else cfg.RPN.TEST_PER_LEVEL_NMS_TOPK 209 | for lvl in range(num_lvl): 210 | with tf.name_scope('Lvl{}'.format(lvl + 2)): 211 | pred_boxes_decoded = multilevel_pred_boxes[lvl] 212 | proposal_boxes, proposal_scores = generate_rpn_proposals( 213 | tf.reshape(pred_boxes_decoded, [-1, 4]), 214 | tf.reshape(multilevel_label_logits[lvl], [-1]), 215 | image_shape2d, fpn_nms_topk) 216 | all_boxes.append(proposal_boxes) 217 | all_scores.append(proposal_scores) 218 | 219 | proposal_boxes = tf.concat(all_boxes, axis=0) # nx4 220 | proposal_scores = tf.concat(all_scores, axis=0) # n 221 | # Here we are different from Detectron. 222 | # Detectron picks top-k within the batch, rather than within an image. However we do not have a batch. 
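# For example, with the defaults in config.py ('Level' mode, 5 FPN levels,
# cfg.RPN.TEST_PER_LEVEL_NMS_TOPK = 1000), the concat above can hold up to
# 5 * 1000 candidate boxes for this single image; the top-k below then keeps
# only the 1000 with the highest RPN logits.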
223 | proposal_topk = tf.minimum(tf.size(proposal_scores), fpn_nms_topk) 224 | proposal_scores, topk_indices = tf.nn.top_k(proposal_scores, k=proposal_topk, sorted=False) 225 | proposal_boxes = tf.gather(proposal_boxes, topk_indices) 226 | else: 227 | for lvl in range(num_lvl): 228 | with tf.name_scope('Lvl{}'.format(lvl + 2)): 229 | pred_boxes_decoded = multilevel_pred_boxes[lvl] 230 | all_boxes.append(tf.reshape(pred_boxes_decoded, [-1, 4])) 231 | all_scores.append(tf.reshape(multilevel_label_logits[lvl], [-1])) 232 | all_boxes = tf.concat(all_boxes, axis=0) 233 | all_scores = tf.concat(all_scores, axis=0) 234 | proposal_boxes, proposal_scores = generate_rpn_proposals( 235 | all_boxes, all_scores, image_shape2d, 236 | cfg.RPN.TRAIN_PRE_NMS_TOPK if training else cfg.RPN.TEST_PRE_NMS_TOPK, 237 | cfg.RPN.TRAIN_POST_NMS_TOPK if training else cfg.RPN.TEST_POST_NMS_TOPK) 238 | 239 | tf.sigmoid(proposal_scores, name='probs') # for visualization 240 | return tf.stop_gradient(proposal_boxes, name='boxes'), \ 241 | tf.stop_gradient(proposal_scores, name='scores') 242 | -------------------------------------------------------------------------------- /model_cascade.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tensorpack.tfutils import get_current_tower_context 4 | from tensorpack.tfutils.summary import add_moving_summary 5 | 6 | from config import config as cfg 7 | from model_box import clip_boxes 8 | from model_frcnn import BoxProposals, FastRCNNHead, fastrcnn_outputs 9 | from utils.box_ops import pairwise_iou 10 | 11 | 12 | class CascadeRCNNHead(object): 13 | def __init__(self, proposals, 14 | roi_func, fastrcnn_head_func, gt_targets, image_shape2d, num_classes): 15 | """ 16 | Args: 17 | proposals: BoxProposals 18 | roi_func (boxes -> features): a function to crop features with rois 19 | fastrcnn_head_func (features -> features): the fastrcnn head to apply on the cropped features 20 | gt_targets (gt_boxes, gt_labels): 21 | """ 22 | for k, v in locals().items(): 23 | if k != 'self': 24 | setattr(self, k, v) 25 | self.gt_boxes, self.gt_labels = gt_targets 26 | del self.gt_targets 27 | 28 | self.num_cascade_stages = len(cfg.CASCADE.IOUS) 29 | 30 | self.is_training = get_current_tower_context().is_training 31 | if self.is_training: 32 | @tf.custom_gradient 33 | def scale_gradient(x): 34 | return x, lambda dy: dy * (1.0 / self.num_cascade_stages) 35 | self.scale_gradient = scale_gradient 36 | else: 37 | self.scale_gradient = tf.identity 38 | 39 | ious = cfg.CASCADE.IOUS 40 | # It's unclear how to do >3 stages, so it does not make sense to implement them 41 | assert self.num_cascade_stages == 3, "Only 3-stage cascade was implemented!" 
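# Rough wiring of the three stages (cfg.CASCADE.IOUS = [0.5, 0.6, 0.7]):
#   stage 1 scores/regresses the sampled RPN proposals (matched at IoU 0.5),
#   its refined boxes B1 are re-matched to the GT at IoU 0.6 and fed to stage 2,
#   and B2 is re-matched at IoU 0.7 for stage 3.
# scale_gradient divides the gradient flowing into the shared RoI features by 3,
# so no stage dominates the backbone update.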
42 | with tf.variable_scope('cascade_rcnn_stage1'): 43 | H1, B1 = self.run_head(self.proposals, 0) 44 | 45 | with tf.variable_scope('cascade_rcnn_stage2'): 46 | B1_proposal = self.match_box_with_gt(B1, ious[1]) 47 | H2, B2 = self.run_head(B1_proposal, 1) 48 | 49 | with tf.variable_scope('cascade_rcnn_stage3'): 50 | B2_proposal = self.match_box_with_gt(B2, ious[2]) 51 | H3, B3 = self.run_head(B2_proposal, 2) 52 | self._cascade_boxes = [B1, B2, B3] 53 | self._heads = [H1, H2, H3] 54 | 55 | def run_head(self, proposals, stage): 56 | """ 57 | Args: 58 | proposals: BoxProposals 59 | stage: 0, 1, 2 60 | 61 | Returns: 62 | FastRCNNHead 63 | Nx4, updated boxes 64 | """ 65 | reg_weights = tf.constant(cfg.CASCADE.BBOX_REG_WEIGHTS[stage], dtype=tf.float32) 66 | pooled_feature = self.roi_func(proposals.boxes) # N,C,S,S 67 | pooled_feature = self.scale_gradient(pooled_feature) 68 | head_feature = self.fastrcnn_head_func('head', pooled_feature) 69 | # changed by Paul 70 | label_logits, box_logits = fastrcnn_outputs( 71 | 'outputs_new', head_feature, self.num_classes, class_agnostic_regression=True) 72 | head = FastRCNNHead(proposals, box_logits, label_logits, self.gt_boxes, reg_weights) 73 | 74 | refined_boxes = head.decoded_output_boxes_class_agnostic() 75 | refined_boxes = clip_boxes(refined_boxes, self.image_shape2d) 76 | return head, tf.stop_gradient(refined_boxes, name='output_boxes') 77 | 78 | def match_box_with_gt(self, boxes, iou_threshold): 79 | """ 80 | Args: 81 | boxes: Nx4 82 | Returns: 83 | BoxProposals 84 | """ 85 | if self.is_training: 86 | with tf.name_scope('match_box_with_gt_{}'.format(iou_threshold)): 87 | iou = pairwise_iou(boxes, self.gt_boxes) # NxM 88 | max_iou_per_box = tf.reduce_max(iou, axis=1) # N 89 | best_iou_ind = tf.argmax(iou, axis=1) # N 90 | labels_per_box = tf.gather(self.gt_labels, best_iou_ind) 91 | fg_mask = max_iou_per_box >= iou_threshold 92 | fg_inds_wrt_gt = tf.boolean_mask(best_iou_ind, fg_mask) 93 | labels_per_box = tf.stop_gradient(labels_per_box * tf.cast(fg_mask, tf.int64)) 94 | return BoxProposals(boxes, labels_per_box, fg_inds_wrt_gt) 95 | else: 96 | return BoxProposals(boxes) 97 | 98 | def losses(self): 99 | ret = [] 100 | for idx, head in enumerate(self._heads): 101 | with tf.name_scope('cascade_loss_stage{}'.format(idx + 1)): 102 | ret.extend(head.losses()) 103 | return ret 104 | 105 | def decoded_output_boxes(self): 106 | """ 107 | Returns: 108 | Nx#classx4 109 | """ 110 | ret = self._cascade_boxes[-1] 111 | ret = tf.expand_dims(ret, 1) # class-agnostic 112 | return tf.tile(ret, [1, self.num_classes, 1]) 113 | 114 | def output_scores(self, name=None): 115 | """ 116 | Returns: 117 | Nx#class 118 | """ 119 | scores = [head.output_scores('cascade_scores_stage{}'.format(idx + 1)) 120 | for idx, head in enumerate(self._heads)] 121 | return tf.multiply(tf.add_n(scores), (1.0 / self.num_cascade_stages), name=name) 122 | 123 | 124 | class CascadeRCNNHeadWithHardExamples(CascadeRCNNHead): 125 | def __init__(self, proposals, roi_func, fastrcnn_head_func, gt_targets, image_shape2d, num_classes, 126 | hard_negative_features, hard_positive_features, hard_negative_loss_scaling_factor, 127 | hard_positive_loss_scaling_factor, hard_positive_ious, hard_positive_gt_boxes, 128 | hard_positive_jitter_boxes): 129 | super().__init__(proposals, roi_func, fastrcnn_head_func, gt_targets, image_shape2d, num_classes) 130 | self._hard_negative_features = hard_negative_features 131 | self._hard_positive_features = hard_positive_features 132 | 
self._hard_negative_loss_scaling_factor = hard_negative_loss_scaling_factor 133 | self._hard_positive_loss_scaling_factor = hard_positive_loss_scaling_factor 134 | self._hard_positive_ious = hard_positive_ious 135 | self._hard_positive_gt_boxes = hard_positive_gt_boxes 136 | self._hard_positive_jitter_boxes = hard_positive_jitter_boxes 137 | 138 | def _hard_losses(self, negative=True): 139 | if negative: 140 | hard_features = self._hard_negative_features 141 | desc = "neg" 142 | else: 143 | hard_features = self._hard_positive_features 144 | desc = "pos" 145 | losses = [] 146 | for cascade_idx, iou_thres in enumerate(cfg.CASCADE.IOUS): 147 | with tf.name_scope('cascade_loss_{}_stage{}'.format(desc, cascade_idx + 1)): 148 | with tf.variable_scope('cascade_rcnn_stage' + str(cascade_idx + 1), reuse=True): 149 | pooled_feature = self.roi_func(None, hard_features[:, cascade_idx]) 150 | pooled_feature = self.scale_gradient(pooled_feature) 151 | head_feature = self.fastrcnn_head_func('head', pooled_feature) 152 | # changed by Paul 153 | label_logits, box_logits = fastrcnn_outputs( 154 | 'outputs_new', head_feature, self.num_classes, class_agnostic_regression=True) 155 | mean_label = None 156 | box_loss = None 157 | if negative: 158 | labels = tf.zeros((tf.shape(label_logits)[0],), dtype=tf.int64) 159 | else: 160 | labels = tf.cast(tf.greater_equal(self._hard_positive_ious[:, cascade_idx], iou_thres), 161 | tf.int64) 162 | mean_label = tf.reduce_mean(tf.cast(labels, tf.float32), 163 | name='hard_{}_label_mean{}'.format(desc, cascade_idx + 1)) 164 | if cfg.USE_REGRESSION_LOSS_ON_HARD_POSITIVES: 165 | labels_bool = tf.cast(labels, tf.bool) 166 | valid = tf.reduce_any(labels_bool) 167 | 168 | def make_box_loss(): 169 | gt_boxes = tf.boolean_mask(self._hard_positive_gt_boxes, labels_bool) 170 | inp_boxes = tf.boolean_mask(self._hard_positive_jitter_boxes[:, cascade_idx], 171 | labels_bool) 172 | box_logits_masked = tf.boolean_mask(box_logits, labels_bool) 173 | from examples.FasterRCNN.model_box import encode_bbox_target 174 | reg_targets = encode_bbox_target(gt_boxes, 175 | inp_boxes) * cfg.CASCADE.BBOX_REG_WEIGHTS[cascade_idx] 176 | _box_loss = tf.losses.huber_loss( 177 | reg_targets, tf.squeeze(box_logits_masked, axis=1), 178 | reduction=tf.losses.Reduction.SUM) 179 | _box_loss = tf.truediv( 180 | _box_loss, tf.cast(tf.shape(reg_targets)[0], tf.float32)) 181 | return _box_loss 182 | 183 | box_loss = tf.cond(valid, make_box_loss, lambda: tf.constant(0, dtype=tf.float32)) 184 | box_loss = tf.multiply(box_loss, cfg.HARD_POSITIVE_BOX_LOSS_SCALING_FACTOR, 185 | name='hard_{}_box_loss{}'.format(desc, cascade_idx + 1)) 186 | losses.append(box_loss) 187 | label_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( 188 | labels=labels, logits=label_logits) 189 | if negative: 190 | label_loss *= self._hard_negative_loss_scaling_factor 191 | else: 192 | label_loss *= self._hard_positive_loss_scaling_factor 193 | label_loss = tf.reduce_mean(label_loss, name='hard_{}_label_loss{}'.format(desc, cascade_idx + 1)) 194 | prediction = tf.argmax(label_logits, axis=1, name='label_prediction_hard_{}'.format(desc)) 195 | correct = tf.cast(tf.equal(prediction, labels), tf.float32) 196 | accuracy = tf.reduce_mean(correct, name='hard_{}_label_accuracy{}'.format(desc, cascade_idx + 1)) 197 | losses.append(label_loss) 198 | if mean_label is not None: 199 | add_moving_summary(mean_label) 200 | if box_loss is not None: 201 | add_moving_summary(box_loss) 202 | add_moving_summary(accuracy) 203 | add_moving_summary(label_loss) 
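# To recap the labels used above: for hard negatives every box is labelled
# background (0); for hard positives the label is 1 only if the stored IoU of
# the jittered box with its GT exceeds this stage's cascade threshold, and,
# when cfg.USE_REGRESSION_LOSS_ON_HARD_POSITIVES is set, an additional Huber
# loss regresses those positives onto their GT boxes. The list returned below
# therefore holds the scaled per-stage classification losses plus any box losses.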
204 | return losses 205 | 206 | def losses(self): 207 | normal_losses = super().losses() 208 | if self.is_training: 209 | hnl = self._hard_losses(negative=True) 210 | if self._hard_positive_features is not None: 211 | hpl = self._hard_losses(negative=False) 212 | else: 213 | hpl = [] 214 | return normal_losses + hnl + hpl 215 | else: 216 | return normal_losses 217 | -------------------------------------------------------------------------------- /eval_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: eval.py 3 | 4 | import itertools 5 | import random 6 | import sys 7 | import os 8 | import json 9 | import PIL 10 | import numpy as np 11 | import glob 12 | from collections import namedtuple 13 | from concurrent.futures import ThreadPoolExecutor 14 | from contextlib import ExitStack 15 | import cv2 16 | import pycocotools.mask as cocomask 17 | import tqdm 18 | import tensorflow as tf 19 | import xmltodict 20 | 21 | from tensorpack.callbacks import Callback 22 | from tensorpack.tfutils.common import get_tf_version_tuple 23 | from tensorpack.utils import logger 24 | from tensorpack.utils.utils import get_tqdm 25 | 26 | from common import CustomResize, clip_boxes, box_to_point8, point8_to_box 27 | from data import get_eval_dataflow 28 | from dataset import DetectionDataset 29 | from config import config as cfg 30 | 31 | try: 32 | import horovod.tensorflow as hvd 33 | except ImportError: 34 | pass 35 | 36 | 37 | DetectionResult = namedtuple( 38 | 'DetectionResult', 39 | ['box', 'score', 'class_id', 'mask']) 40 | """ 41 | box: 4 float 42 | score: float 43 | class_id: int, 1~NUM_CLASS 44 | mask: None, or a binary image of the original image shape 45 | """ 46 | 47 | 48 | def _paste_mask(box, mask, shape): 49 | """ 50 | Args: 51 | box: 4 float 52 | mask: MxM floats 53 | shape: h,w 54 | Returns: 55 | A uint8 binary image of hxw. 56 | """ 57 | # int() is floor 58 | # box fpcoor=0.0 -> intcoor=0.0 59 | x0, y0 = list(map(int, box[:2] + 0.5)) 60 | # box fpcoor=h -> intcoor=h-1, inclusive 61 | x1, y1 = list(map(int, box[2:] - 0.5)) # inclusive 62 | x1 = max(x0, x1) # require at least 1x1 63 | y1 = max(y0, y1) 64 | 65 | w = x1 + 1 - x0 66 | h = y1 + 1 - y0 67 | 68 | # rounding errors could happen here, because masks were not originally computed for this shape. 69 | # but it's hard to do better, because the network does not know the "original" scale 70 | mask = (cv2.resize(mask, (w, h)) > 0.5).astype('uint8') 71 | ret = np.zeros(shape, dtype='uint8') 72 | ret[y0:y1 + 1, x0:x1 + 1] = mask 73 | return ret 74 | 75 | 76 | def predict_image(img, model_func): 77 | """ 78 | Run detection on one image, using the TF callable. 79 | This function should handle the preprocessing internally. 80 | 81 | Args: 82 | img: an image 83 | model_func: a callable from the TF model. 84 | It takes image and returns (boxes, probs, labels, [masks]) 85 | 86 | Returns: 87 | [DetectionResult] 88 | """ 89 | 90 | orig_shape = img.shape[:2] 91 | resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE) 92 | resized_img = resizer.augment(img) 93 | scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] * resized_img.shape[1] / img.shape[1]) 94 | boxes, probs, labels, *masks = model_func(resized_img) 95 | boxes = boxes / scale 96 | # boxes are already clipped inside the graph, but after the floating point scaling, this may not be true any more. 
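# Example of the scale factor above: a 480x640 frame resized so the short edge
# becomes 800 ends up at roughly 800x1067, giving scale ~= sqrt(800/480 * 1067/640)
# ~= 1.67; dividing by it maps the boxes back to the original resolution, and
# clip_boxes below removes the few pixels of overshoot this can introduce.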
97 | boxes = clip_boxes(boxes, orig_shape) 98 | 99 | if masks: 100 | # has mask 101 | full_masks = [_paste_mask(box, mask, orig_shape) 102 | for box, mask in zip(boxes, masks[0])] 103 | masks = full_masks 104 | else: 105 | # fill with none 106 | masks = [None] * len(boxes) 107 | 108 | results = [DetectionResult(*args) for args in zip(boxes, probs, labels, masks)] 109 | return results 110 | 111 | 112 | def predict_image_track_with_precomputed_ref_features(img, ref_features, model_func): 113 | orig_shape = img.shape[:2] 114 | resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE) 115 | resized_img = resizer.augment(img) 116 | scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] * resized_img.shape[1] / img.shape[1]) 117 | boxes, probs, labels, *masks = model_func(resized_img, ref_features) 118 | boxes = boxes / scale 119 | # boxes are already clipped inside the graph, but after the floating point scaling, this may not be true any more. 120 | boxes = clip_boxes(boxes, orig_shape) 121 | 122 | if masks: 123 | # has mask 124 | full_masks = [_paste_mask(box, mask, orig_shape) 125 | for box, mask in zip(boxes, masks[0])] 126 | masks = full_masks 127 | else: 128 | # fill with none 129 | masks = [None] * len(boxes) 130 | 131 | results = [DetectionResult(*args) for args in zip(boxes, probs, labels, masks)] 132 | return results 133 | 134 | 135 | def predict_image_track(img, ref_img, ref_bbox, model_func): 136 | """ 137 | Run detection on one image, using the TF callable. 138 | This function should handle the preprocessing internally. 139 | 140 | Args: 141 | img: an image 142 | model_func: a callable from the TF model. 143 | It takes image and returns (boxes, probs, labels, [masks]) 144 | 145 | Returns: 146 | [DetectionResult] 147 | """ 148 | 149 | orig_shape = img.shape[:2] 150 | resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE) 151 | resized_img = resizer.augment(img) 152 | resized_ref_img, params = resizer.augment_return_params(ref_img) 153 | 154 | ref_points = box_to_point8(ref_bbox[np.newaxis]) 155 | ref_points = resizer.augment_coords(ref_points, params) 156 | resized_ref_boxes = point8_to_box(ref_points) 157 | resized_ref_bbox = resized_ref_boxes[0] 158 | 159 | scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] * resized_img.shape[1] / img.shape[1]) 160 | boxes, probs, labels, *masks = model_func(resized_img, resized_ref_img, resized_ref_bbox) 161 | boxes = boxes / scale 162 | # boxes are already clipped inside the graph, but after the floating point scaling, this may not be true any more. 163 | boxes = clip_boxes(boxes, orig_shape) 164 | 165 | if masks: 166 | # has mask 167 | full_masks = [_paste_mask(box, mask, orig_shape) 168 | for box, mask in zip(boxes, masks[0])] 169 | masks = full_masks 170 | else: 171 | # fill with none 172 | masks = [None] * len(boxes) 173 | 174 | results = [DetectionResult(*args) for args in zip(boxes, probs, labels, masks)] 175 | return results 176 | 177 | 178 | def predict_dataflow(df, model_func, tqdm_bar=None): 179 | """ 180 | Args: 181 | df: a DataFlow which produces (image, image_id) 182 | model_func: a callable from the TF model. 183 | It takes image and returns (boxes, probs, labels, [masks]) 184 | tqdm_bar: a tqdm object to be shared among multiple evaluation instances. If None, 185 | will create a new one. 
186 | 187 | Returns: 188 | list of dict, in the format used by 189 | `DetectionDataset.eval_or_save_inference_results` 190 | """ 191 | df.reset_state() 192 | all_results = [] 193 | with ExitStack() as stack: 194 | # tqdm is not quite thread-safe: https://github.com/tqdm/tqdm/issues/323 195 | if tqdm_bar is None: 196 | tqdm_bar = stack.enter_context(get_tqdm(total=df.size())) 197 | for ref_img, ref_bbox, target_img, target_bbox, gt_file in df: 198 | results = predict_image_track(target_img, ref_img, ref_bbox, model_func) 199 | all_results.append((gt_file, results, target_bbox)) 200 | tqdm_bar.update(1) 201 | return all_results 202 | 203 | 204 | def multithread_predict_dataflow(dataflows, model_funcs): 205 | """ 206 | Running multiple `predict_dataflow` in multiple threads, and aggregate the results. 207 | 208 | Args: 209 | dataflows: a list of DataFlow to be used in :func:`predict_dataflow` 210 | model_funcs: a list of callable to be used in :func:`predict_dataflow` 211 | 212 | Returns: 213 | list of dict, in the format used by 214 | `DetectionDataset.eval_or_save_inference_results` 215 | """ 216 | num_worker = len(model_funcs) 217 | assert len(dataflows) == num_worker 218 | if num_worker == 1: 219 | return predict_dataflow(dataflows[0], model_funcs[0]) 220 | kwargs = {'thread_name_prefix': 'EvalWorker'} if sys.version_info.minor >= 6 else {} 221 | with ThreadPoolExecutor(max_workers=num_worker, **kwargs) as executor, \ 222 | tqdm.tqdm(total=sum([df.size() for df in dataflows])) as pbar: 223 | futures = [] 224 | for dataflow, pred in zip(dataflows, model_funcs): 225 | futures.append(executor.submit(predict_dataflow, dataflow, pred, pbar)) 226 | all_results = list(itertools.chain(*[fut.result() for fut in futures])) 227 | return all_results 228 | 229 | 230 | class EvalCallback(Callback): 231 | """ 232 | A callback that runs evaluation once a while. 233 | It supports multi-gpu evaluation. 234 | """ 235 | 236 | _chief_only = False 237 | 238 | def __init__(self, eval_dataset, in_names, out_names, output_dir): 239 | self._eval_dataset = eval_dataset 240 | self._in_names, self._out_names = in_names, out_names 241 | self._output_dir = output_dir 242 | 243 | def _setup_graph(self): 244 | num_gpu = cfg.TRAIN.NUM_GPUS 245 | if cfg.TRAINER == 'replicated': 246 | # TF bug in version 1.11, 1.12: https://github.com/tensorflow/tensorflow/issues/22750 247 | buggy_tf = get_tf_version_tuple() in [(1, 11), (1, 12)] 248 | 249 | # Use two predictor threads per GPU to get better throughput 250 | self.num_predictor = num_gpu if buggy_tf else num_gpu * 2 251 | self.predictors = [self._build_predictor(k % num_gpu) for k in range(self.num_predictor)] 252 | self.dataflows = [get_eval_dataflow(self._eval_dataset, 253 | shard=k, num_shards=self.num_predictor) 254 | for k in range(self.num_predictor)] 255 | else: 256 | # Only eval on the first machine. 
257 | # Alternatively, can eval on all ranks and use allgather, but allgather sometimes hangs 258 | self._horovod_run_eval = hvd.rank() == hvd.local_rank() 259 | if self._horovod_run_eval: 260 | self.predictor = self._build_predictor(0) 261 | self.dataflow = get_eval_dataflow(self._eval_dataset, 262 | shard=hvd.local_rank(), num_shards=hvd.local_size()) 263 | 264 | self.barrier = hvd.allreduce(tf.random_normal(shape=[1])) 265 | 266 | def _build_predictor(self, idx): 267 | return self.trainer.get_predictor(self._in_names, self._out_names, device=idx) 268 | 269 | def _before_train(self): 270 | eval_period = cfg.TRAIN.EVAL_PERIOD 271 | self.epochs_to_eval = set() 272 | for k in itertools.count(1): 273 | if k * eval_period > self.trainer.max_epoch: 274 | break 275 | self.epochs_to_eval.add(k * eval_period) 276 | self.epochs_to_eval.add(self.trainer.max_epoch) 277 | logger.info("[EvalCallback] Will evaluate every {} epochs".format(eval_period)) 278 | 279 | def _eval(self): 280 | logdir = self._output_dir 281 | if cfg.TRAINER == 'replicated': 282 | all_results = multithread_predict_dataflow(self.dataflows, self.predictors) 283 | else: 284 | filenames = [os.path.join( 285 | logdir, 'outputs{}-part{}.json'.format(self.global_step, rank) 286 | ) for rank in range(hvd.local_size())] 287 | 288 | if self._horovod_run_eval: 289 | local_results = predict_dataflow(self.dataflow, self.predictor) 290 | fname = filenames[hvd.local_rank()] 291 | with open(fname, 'w') as f: 292 | json.dump(local_results, f) 293 | self.barrier.eval() 294 | if hvd.rank() > 0: 295 | return 296 | all_results = [] 297 | for fname in filenames: 298 | with open(fname, 'r') as f: 299 | obj = json.load(f) 300 | all_results.extend(obj) 301 | os.unlink(fname) 302 | 303 | output_file = os.path.join( 304 | logdir, '{}-outputs{}.json'.format(self._eval_dataset, self.global_step)) 305 | 306 | scores = DetectionDataset().eval_or_save_inference_results( 307 | all_results, self._eval_dataset, output_file) 308 | for k, v in scores.items(): 309 | self.trainer.monitors.put_scalar(k, v) 310 | 311 | def _trigger_epoch(self): 312 | if self.epoch_num in self.epochs_to_eval: 313 | logger.info("Running evaluation ...") 314 | self._eval() 315 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: config.py 3 | 4 | import numpy as np 5 | import os 6 | import six 7 | import pprint 8 | 9 | from tensorpack.utils import logger 10 | from tensorpack.utils.gpu import get_num_gpu 11 | 12 | __all__ = ['config', 'finalize_configs'] 13 | 14 | 15 | class AttrDict(): 16 | 17 | _freezed = False 18 | """ Avoid accidental creation of new hierarchies. """ 19 | 20 | def __getattr__(self, name): 21 | if self._freezed: 22 | raise AttributeError(name) 23 | ret = AttrDict() 24 | setattr(self, name, ret) 25 | return ret 26 | 27 | def __setattr__(self, name, value): 28 | if self._freezed and name not in self.__dict__: 29 | raise AttributeError( 30 | "Config was freezed! Unknown config: {}".format(name)) 31 | super().__setattr__(name, value) 32 | 33 | def __str__(self): 34 | return pprint.pformat(self.to_dict(), indent=1, width=100, compact=True) 35 | 36 | __repr__ = __str__ 37 | 38 | def to_dict(self): 39 | """Convert to a nested dict. 
""" 40 | return {k: v.to_dict() if isinstance(v, AttrDict) else v 41 | for k, v in self.__dict__.items() if not k.startswith('_')} 42 | 43 | def update_args(self, args): 44 | """Update from command line args. """ 45 | for cfg in args: 46 | keys, v = cfg.split('=', maxsplit=1) 47 | keylist = keys.split('.') 48 | 49 | dic = self 50 | for i, k in enumerate(keylist[:-1]): 51 | assert k in dir(dic), "Unknown config key: {}".format(keys) 52 | dic = getattr(dic, k) 53 | key = keylist[-1] 54 | 55 | oldv = getattr(dic, key) 56 | if not isinstance(oldv, str): 57 | v = eval(v) 58 | setattr(dic, key, v) 59 | 60 | def freeze(self, freezed=True): 61 | self._freezed = freezed 62 | for v in self.__dict__.values(): 63 | if isinstance(v, AttrDict): 64 | v.freeze(freezed) 65 | 66 | # avoid silent bugs 67 | def __eq__(self, _): 68 | raise NotImplementedError() 69 | 70 | def __ne__(self, _): 71 | raise NotImplementedError() 72 | 73 | 74 | config = AttrDict() 75 | _C = config # short alias to avoid coding 76 | 77 | # paths to datasets! changes these! 78 | _C.DATA.IMAGENET_VID_ROOT = "/globalwork/data/ILSVRC_VID/ILSVRC/" 79 | _C.DATA.GOT10K_ROOT = "/globalwork/data/GOT10k/" 80 | _C.DATA.LASOT_ROOT = "/globalwork/data/LaSOTBenchmark/" 81 | _C.DATA.YOUTUBE_VOS_ROOT = "/globalwork/data/youtube-vos/" 82 | _C.DATA.DAVIS2017_ROOT = "/globalwork/data/DAVIS2017/" 83 | _C.DATA.YOUTUBE_BB_ROOT = "/globalwork/data/youtube-bb/yt_bb_detection_train/" 84 | _C.DATA.TRACKINGNET_ROOT = "/globalwork/data/TrackingNet/" 85 | _C.HARD_MINING_DATA_PATH = "/globalwork/data/hard_example_mining_index/" 86 | 87 | _C.DATA.IMAGENET_VID = True 88 | _C.DATA.GOT10K = True 89 | _C.DATA.LASOT = True 90 | _C.DATA.YOUTUBE_VOS = True 91 | _C.DATA.YOUTUBE_BB = False 92 | _C.DATA.DAVIS2017 = False 93 | _C.DATA.TRACKINGNET = False 94 | 95 | # mode flags --------------------- 96 | _C.TRAINER = 'replicated' # options: 'horovod', 'replicated' 97 | _C.MODE_MASK = False # FasterRCNN or MaskRCNN 98 | _C.MODE_FPN = True 99 | _C.MODE_TRACK = True 100 | _C.TRACK_VIDEO_ID = None 101 | 102 | # new flags by us 103 | _C.MODE_SHARED_CONV_REDUCE = False 104 | _C.USE_PRECOMPUTED_REF_FEATURES = False 105 | _C.EXTRACT_GT_FEATURES = False 106 | _C.MODE_THIRD_STAGE = False 107 | _C.EXTEND_PROPOSALS_BY_ACTIVE_TRACKLETS = True 108 | 109 | # hard mining stuff 110 | _C.MODE_HARD_MINING = False 111 | _C.MODE_IF_HARD_MINING_THEN_ALSO_POSITIVES = True 112 | _C.MODE_HARD_NEGATIVES_ONLY_CROSSOVER = False 113 | _C.MODE_HARD_NEGATIVES_ONLY_CROSSOVER_YOUTUBEVOS = False 114 | _C.USE_REGRESSION_LOSS_ON_HARD_POSITIVES = False 115 | _C.HARD_NEGATIVE_LOSS_SCALING_FACTOR = 0.12 116 | _C.HARD_POSITIVE_LOSS_SCALING_FACTOR = 0.1 117 | _C.HARD_POSITIVE_BOX_LOSS_SCALING_FACTOR = 0.1 118 | _C.N_HARD_NEGATIVES_TO_SAMPLE = 100 119 | _C.N_HARD_POS_TO_SAMPLE = 30 120 | _C.HARD_MINING_KNN = 10000 121 | _C.HARD_MINING_KNN_LASOT = 50000 122 | 123 | _C.FORWARD_VIDEO_RANGE_START = None 124 | _C.FORWARD_VIDEO_RANGE_END = None 125 | 126 | # might lead to minor slowdown, but gives useful information 127 | _C.MEASURE_IOU_DURING_TRAINING = True 128 | 129 | # dataset ----------------------- 130 | _C.DATA.BASEDIR = '/path/to/your/DATA/DIR' 131 | # All TRAIN dataset will be concatenated for training. 132 | _C.DATA.TRAIN = ['train2014', 'valminusminival2014'] # i.e. 
trainval35k, AKA train2017 133 | # Each VAL dataset will be evaluated separately (instead of concatenated) 134 | #_C.DATA.VAL = ('minival2014', ) # AKA val2017 135 | _C.DATA.VAL = () 136 | # This two config will be populated later by the dataset loader: 137 | _C.DATA.NUM_CATEGORY = 1 # without the background class (e.g., 80 for COCO) 138 | _C.DATA.CLASS_NAMES = [] # NUM_CLASS (NUM_CATEGORY+1) strings, the first is "BG". 139 | 140 | _C.DATA.DEBUG_VIS = False 141 | _C.DATA.MULTITHREAD = True 142 | 143 | _C.DATA.GRAYSCALE_AUGMENTATIONS = True 144 | _C.DATA.MOTION_BLUR_AUGMENTATIONS = True 145 | 146 | # basemodel ---------------------- 147 | _C.BACKBONE.WEIGHTS = '' # /path/to/weights.npz 148 | _C.BACKBONE.RESNET_NUM_BLOCKS = [3, 4, 23, 3] # for resnet50 149 | # RESNET_NUM_BLOCKS = [3, 4, 23, 3] # for resnet101 150 | _C.BACKBONE.FREEZE_AFFINE = False # do not train affine parameters inside norm layers 151 | _C.BACKBONE.NORM = 'GN' # options: FreezeBN, SyncBN, GN, None 152 | _C.BACKBONE.FREEZE_AT = 4 # options: 0, 1, 2 153 | 154 | # Use a base model with TF-preferred padding mode, 155 | # which may pad more pixels on right/bottom than top/left. 156 | # See https://github.com/tensorflow/tensorflow/issues/18213 157 | # In tensorpack model zoo, ResNet models with TF_PAD_MODE=False are marked with "-AlignPadding". 158 | # All other models under `ResNet/` in the model zoo are using TF_PAD_MODE=True. 159 | # Using either one should probably give the same performance. 160 | # We use the "AlignPadding" one just to be consistent with caffe2. 161 | _C.BACKBONE.TF_PAD_MODE = False 162 | _C.BACKBONE.STRIDE_1X1 = False # True for MSRA models 163 | 164 | # schedule ----------------------- 165 | _C.TRAIN.NUM_GPUS = None # by default, will be set from code 166 | _C.TRAIN.WEIGHT_DECAY = 1e-4 167 | _C.TRAIN.BASE_LR = 1e-2 # defined for total batch size=8. Otherwise it will be adjusted automatically 168 | _C.TRAIN.WARMUP = 1000 # in terms of iterations. This is not affected by #GPUs 169 | _C.TRAIN.WARMUP_INIT_LR = 1e-2 * 0.33 # defined for total batch size=8. Otherwise it will be adjusted automatically 170 | _C.TRAIN.STEPS_PER_EPOCH = 500 171 | _C.TRAIN.STARTING_EPOCH = 1 # the first epoch to start with, useful to continue a training 172 | _C.TRAIN.MAX_NUM_EPOCHS = 1000000000000 173 | 174 | # LR_SCHEDULE means equivalent steps when the total batch size is 8. 175 | # When the total bs!=8, the actual iterations to decrease learning rate, and 176 | # the base learning rate are computed from BASE_LR and LR_SCHEDULE. 177 | # Therefore, there is *no need* to modify the config if you only change the number of GPUs. 
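# For example, assuming the usual linear-scaling rule is applied by the training
# script: with 4 GPUs (total batch size 4) the effective base LR would be halved
# to 5e-3 and every step in LR_SCHEDULE doubled, so the run still sees the same
# amount of data before each decay.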
178 | 179 | # _C.TRAIN.LR_SCHEDULE = [120000, 160000, 180000 ] # "1x" schedule in detectron 180 | #_C.TRAIN.LR_SCHEDULE = [240000, 320000, 360000] # "2x" schedule in detectron 181 | # Longer schedules for from-scratch training (https://arxiv.org/abs/1811.08883): 182 | # _C.TRAIN.LR_SCHEDULE = [960000, 1040000, 1080000] # "6x" schedule in detectron 183 | # _C.TRAIN.LR_SCHEDULE = [1500000, 1580000, 1620000] # "9x" schedule in detectron 184 | #_C.TRAIN.LR_SCHEDULE = [1500000, 1580000, 1620000] 185 | _C.TRAIN.LR_SCHEDULE = [250000, 280000, 300000] # for main training, afterwards we can do hard example training 186 | _C.TRAIN.EVAL_PERIOD = 20 # period (epochs) to run evaluation 187 | 188 | # preprocessing -------------------- 189 | # Alternative old (worse & faster) setting: 600 190 | _C.PREPROC.TRAIN_SHORT_EDGE_SIZE = [640, 800] # [min, max] to sample from 191 | _C.PREPROC.TEST_SHORT_EDGE_SIZE = 800 192 | _C.PREPROC.MAX_SIZE = 1333 193 | # mean and std in RGB order. 194 | # Un-scaled version: [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] 195 | _C.PREPROC.PIXEL_MEAN = [123.675, 116.28, 103.53] 196 | _C.PREPROC.PIXEL_STD = [58.395, 57.12, 57.375] 197 | 198 | # anchors ------------------------- 199 | _C.RPN.ANCHOR_STRIDE = 16 200 | _C.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512) # sqrtarea of the anchor box 201 | _C.RPN.ANCHOR_RATIOS = (0.5, 1., 2.) 202 | _C.RPN.POSITIVE_ANCHOR_THRESH = 0.7 203 | _C.RPN.NEGATIVE_ANCHOR_THRESH = 0.3 204 | 205 | # rpn training ------------------------- 206 | _C.RPN.FG_RATIO = 0.5 # fg ratio among selected RPN anchors 207 | _C.RPN.BATCH_PER_IM = 256 # total (across FPN levels) number of anchors that are marked valid 208 | _C.RPN.MIN_SIZE = 0 209 | _C.RPN.PROPOSAL_NMS_THRESH = 0.7 210 | # Anchors which overlap with a crowd box (IOA larger than threshold) will be ignored. 211 | # Setting this to a value larger than 1.0 will disable the feature. 212 | # It is disabled by default because Detectron does not do this. 213 | _C.RPN.CROWD_OVERLAP_THRESH = 9.99 214 | _C.RPN.HEAD_DIM = 1024 # used in C4 only 215 | 216 | # RPN proposal selection ------------------------------- 217 | # for C4 218 | _C.RPN.TRAIN_PRE_NMS_TOPK = 12000 219 | _C.RPN.TRAIN_POST_NMS_TOPK = 2000 220 | _C.RPN.TEST_PRE_NMS_TOPK = 6000 221 | _C.RPN.TEST_POST_NMS_TOPK = 1000 # if you encounter OOM in inference, set this to a smaller number 222 | # for FPN, #proposals per-level and #proposals after merging are (for now) the same 223 | # if FPN.PROPOSAL_MODE = 'Joint', these options have no effect 224 | _C.RPN.TRAIN_PER_LEVEL_NMS_TOPK = 2000 225 | #_C.RPN.TEST_PER_LEVEL_NMS_TOPK = 1000 226 | # seems we need a lot of proposals for tracking with fixed RPN. Note that this makes it quite slow! 227 | # to prevent OOM let's do 8k for validating during training 228 | # for actual forwarding, we could do 15k 229 | _C.RPN.TEST_PER_LEVEL_NMS_TOPK = 1000 230 | _C.RPN.TEST_ALTERNATIVE_ANCHOR_SAMPLING = False 231 | 232 | # fastrcnn training --------------------- 233 | _C.FRCNN.BATCH_PER_IM = 512 234 | _C.FRCNN.BBOX_REG_WEIGHTS = [10., 10., 5., 5.] # Better but non-standard setting: [20, 20, 10, 10] 235 | _C.FRCNN.FG_THRESH = 0.5 236 | _C.FRCNN.FG_RATIO = 0.25 # fg ratio in a ROI batch 237 | 238 | _C.FRCNN.USE_FOCAL_LOSS = False 239 | _C.FRCNN.FG_LOSS_WEIGHTING_FACTOR = 3.0 240 | _C.FRCNN.BOX_LOSS_WEIGHTING_FACTOR = 1.0 241 | 242 | # FPN ------------------------- 243 | _C.FPN.ANCHOR_STRIDES = (4, 8, 16, 32, 64) # strides for each FPN level. 
Must be the same length as ANCHOR_SIZES 244 | _C.FPN.PROPOSAL_MODE = 'Level' # 'Level', 'Joint' 245 | _C.FPN.NUM_CHANNEL = 256 246 | _C.FPN.NORM = 'GN' # 'None', 'GN' 247 | # The head option is only used in FPN. For C4 models, the head is C5 248 | _C.FPN.FRCNN_HEAD_FUNC = 'fastrcnn_4conv1fc_gn_head' 249 | # choices: fastrcnn_2fc_head, fastrcnn_4conv1fc_{,gn_}head 250 | _C.FPN.FRCNN_CONV_HEAD_DIM = 256 251 | _C.FPN.FRCNN_FC_HEAD_DIM = 1024 252 | _C.FPN.MRCNN_HEAD_FUNC = 'maskrcnn_up4conv_gn_head' # choices: maskrcnn_up4conv_{,gn_}head 253 | 254 | # Mask-RCNN 255 | _C.MRCNN.HEAD_DIM = 256 256 | 257 | # Cascade-RCNN, only available in FPN mode 258 | _C.FPN.CASCADE = True 259 | _C.CASCADE.IOUS = [0.5, 0.6, 0.7] 260 | _C.CASCADE.BBOX_REG_WEIGHTS = [[10., 10., 5., 5.], [20., 20., 10., 10.], [30., 30., 15., 15.]] 261 | 262 | # testing ----------------------- 263 | _C.TEST.FRCNN_NMS_THRESH = 0.5 264 | 265 | # Smaller threshold value gives significantly better mAP. But we use 0.05 for consistency with Detectron. 266 | # mAP with 1e-4 threshold can be found at https://github.com/tensorpack/tensorpack/commit/26321ae58120af2568bdbf2269f32aa708d425a8#diff-61085c48abee915b584027e1085e1043 # noqa 267 | _C.TEST.RESULT_SCORE_THRESH = 0.005 268 | _C.TEST.RESULT_SCORE_THRESH_VIS = 0.005 # only visualize confident results 269 | _C.TEST.RESULTS_PER_IM = 100 270 | 271 | _C.freeze() # avoid typo / wrong config keys 272 | 273 | 274 | def finalize_configs(is_training): 275 | """ 276 | Run some sanity checks, and populate some configs from others 277 | """ 278 | _C.freeze(False) # populate new keys now 279 | _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1 # +1 background 280 | _C.DATA.BASEDIR = os.path.expanduser(_C.DATA.BASEDIR) 281 | if isinstance(_C.DATA.VAL, six.string_types): # support single string (the typical case) as well 282 | _C.DATA.VAL = (_C.DATA.VAL, ) 283 | 284 | assert _C.BACKBONE.NORM in ['FreezeBN', 'SyncBN', 'GN', 'None'], _C.BACKBONE.NORM 285 | if _C.BACKBONE.NORM != 'FreezeBN': 286 | assert not _C.BACKBONE.FREEZE_AFFINE 287 | assert _C.BACKBONE.FREEZE_AT in [0, 1, 2, 3, 4] 288 | 289 | _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS) 290 | assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES) 291 | # image size into the backbone has to be multiple of this number 292 | _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[3] # [3] because we build FPN with features r2,r3,r4,r5 293 | 294 | if _C.MODE_FPN: 295 | size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1. 
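# e.g. with the defaults PREPROC.MAX_SIZE = 1333 and ANCHOR_STRIDES[3] = 32,
# this rounds the maximum edge up to ceil(1333 / 32) * 32 = 1344, so the padded
# input divides evenly down to the coarsest backbone feature map.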
296 | _C.PREPROC.MAX_SIZE = np.ceil(_C.PREPROC.MAX_SIZE / size_mult) * size_mult 297 | assert _C.FPN.PROPOSAL_MODE in ['Level', 'Joint'] 298 | assert _C.FPN.FRCNN_HEAD_FUNC.endswith('_head') 299 | assert _C.FPN.MRCNN_HEAD_FUNC.endswith('_head') 300 | assert _C.FPN.NORM in ['None', 'GN'] 301 | 302 | if _C.FPN.CASCADE: 303 | # the first threshold is the proposal sampling threshold 304 | assert _C.CASCADE.IOUS[0] == _C.FRCNN.FG_THRESH 305 | assert len(_C.CASCADE.BBOX_REG_WEIGHTS) == len(_C.CASCADE.IOUS) 306 | 307 | if is_training: 308 | train_scales = _C.PREPROC.TRAIN_SHORT_EDGE_SIZE 309 | if isinstance(train_scales, (list, tuple)) and train_scales[1] - train_scales[0] > 100: 310 | # don't autotune if augmentation is on 311 | os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0' 312 | os.environ['TF_AUTOTUNE_THRESHOLD'] = '1' 313 | assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER 314 | 315 | # setup NUM_GPUS 316 | if _C.TRAINER == 'horovod': 317 | import horovod.tensorflow as hvd 318 | ngpu = hvd.size() 319 | 320 | if ngpu == hvd.local_size(): 321 | logger.warn("It's not recommended to use horovod for single-machine training. " 322 | "Replicated trainer is more stable and has the same efficiency.") 323 | else: 324 | assert 'OMPI_COMM_WORLD_SIZE' not in os.environ 325 | ngpu = get_num_gpu() 326 | assert ngpu % 8 == 0 or 8 % ngpu == 0, "Can only train with 1,2,4 or >=8 GPUs, but found {} GPUs".format(ngpu) 327 | else: 328 | # autotune is too slow for inference 329 | os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0' 330 | ngpu = get_num_gpu() 331 | 332 | assert ngpu > 0, "Has to run with GPU!" 333 | if _C.TRAIN.NUM_GPUS is None: 334 | _C.TRAIN.NUM_GPUS = ngpu 335 | else: 336 | if _C.TRAINER == 'horovod': 337 | assert _C.TRAIN.NUM_GPUS == ngpu 338 | else: 339 | assert _C.TRAIN.NUM_GPUS <= ngpu 340 | 341 | _C.freeze() 342 | logger.info("Config: ------------------------------------------\n" + str(_C)) 343 | -------------------------------------------------------------------------------- /tracking/three_stage_tracker.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import time 3 | import numpy as np 4 | import scipy.sparse 5 | from tensorpack import PredictConfig, get_model_loader, OfflinePredictor 6 | from config import config as cfg 7 | 8 | from tracking.argmax_tracker import PrecomputingReferenceTracker 9 | from tracking.util import resize_and_clip_boxes, generate_colors, xyxy_to_cxcywh_np 10 | 11 | VIZ_WITH_OPENCV = True 12 | 13 | 14 | class Tracklet: 15 | def __init__(self, start_time): 16 | self.start_time = start_time 17 | self.end_time = start_time 18 | self.feats = [] 19 | self.boxes = [] 20 | self.ff_gt_scores = [] 21 | self.ff_gt_tracklet_scores = [] 22 | 23 | def add_detection(self, feat, box, ff_gt_score, ff_gt_tracklet_score): 24 | self.feats = [feat] 25 | self.boxes.append(box) 26 | self.ff_gt_scores.append(ff_gt_score) 27 | self.ff_gt_tracklet_scores.append(ff_gt_tracklet_score) 28 | self.end_time += 1 29 | 30 | 31 | class ThreeStageTracker(PrecomputingReferenceTracker): 32 | def __init__(self, tracklet_distance_threshold=0.06, tracklet_merging_threshold=0.3, 33 | tracklet_merging_second_best_relative_threshold=0.3, ff_gt_score_weight=0.1, 34 | ff_gt_tracklet_score_weight=0.9, location_score_weight=7.0, do_viz=False, 35 | name="ThreeStageTracker", model="best", n_proposals=None, resolution=None): 36 | """ 37 | :param tracklet_merging_threshold: minimum score required to merge a detection into tracklet 38 | :param 
tracklet_merging_second_best_relative_threshold: minimum score gap to second best match allowed to merge the best detection into tracklet 39 | """ 40 | if n_proposals is not None: 41 | cfg.RPN.TEST_PER_LEVEL_NMS_TOPK = n_proposals 42 | if resolution is not None: 43 | if resolution == "full": 44 | # nothing do do... 45 | pass 46 | elif resolution == "half": 47 | cfg.PREPROC.TEST_SHORT_EDGE_SIZE = 400 48 | cfg.PREPROC.MAX_SIZE = 667 49 | else: 50 | assert False, ("unknown resolution", resolution) 51 | super().__init__(name=name, need_network=True, need_img=True, model=model) 52 | self._n_proposals = n_proposals 53 | self._resolution = resolution 54 | self._ff_box = None 55 | self._ff_gt_tracklet = None 56 | self._all_tracklets = None 57 | self._time_idx = None 58 | self._imgs_for_viz = None 59 | self._ff_img_noresize = None 60 | self._ax = None 61 | self._cv_img = None 62 | self._do_viz = do_viz 63 | self._video_idx = -1 64 | self._video_name = None 65 | 66 | self._dynprog_scores = None 67 | self._tracklet_merging_threshold = tracklet_merging_threshold 68 | self._tracklet_merging_second_best_relative_threshold = tracklet_merging_second_best_relative_threshold 69 | self._tracklet_distance_threshold = tracklet_distance_threshold 70 | 71 | self._ff_gt_score_weight = ff_gt_score_weight 72 | self._ff_gt_tracklet_score_weight = ff_gt_tracklet_score_weight 73 | self._location_score_weight = location_score_weight 74 | 75 | def set_video_name(self, vid_name): 76 | self._video_name = vid_name 77 | 78 | def init(self, image, box): 79 | self._ff_box = None 80 | self._ff_gt_tracklet = None 81 | self._all_tracklets = None 82 | self._time_idx = 0 83 | self._ff_img_noresize = np.array(image)[..., ::-1] 84 | if self._do_viz: 85 | self._imgs_for_viz = [self._ff_img_noresize] 86 | self._video_idx += 1 87 | self._dynprog_scores = None 88 | 89 | super().init(image, box) 90 | self._ff_box = self._prev_box.copy() 91 | self._ff_gt_tracklet = Tracklet(start_time=0) 92 | self._ff_gt_tracklet.add_detection(self._ff_gt_feats, self._ff_box, 1.0, 1.0) 93 | self._all_tracklets = [self._ff_gt_tracklet] 94 | 95 | def _make_pred_func(self, load): 96 | cfg.MODE_THIRD_STAGE = True 97 | from train import ResNetFPNTrackModel 98 | pred_model = ResNetFPNTrackModel() 99 | predcfg = PredictConfig( 100 | model=pred_model, 101 | session_init=get_model_loader(load), 102 | input_names=pred_model.get_inference_tensor_names()[0], 103 | output_names=pred_model.get_inference_tensor_names()[1]) 104 | return OfflinePredictor(predcfg) 105 | 106 | def _update(self, img): 107 | if self._do_viz: 108 | # we currently only need the most recent frame for viz 109 | self._imgs_for_viz = [img] 110 | self._time_idx += 1 111 | start = time.time() 112 | self._update_tracklets(img) 113 | best_box, score = self._track() 114 | end = time.time() 115 | # print("tracking step elapsed (with network)", end - start) 116 | if self._do_viz: 117 | self._viz_tracklets() 118 | self._viz_result(best_box) 119 | # save out viz 120 | #import cv2 121 | #cv2.imwrite("/tmp/viz/%05d.jpg" % self._time_idx, self._cv_img) 122 | return best_box, score 123 | 124 | def _update_tracklets(self, img): 125 | active_tracklets = [t for t in self._all_tracklets if t.end_time == self._time_idx] 126 | if len(active_tracklets) == 0: 127 | active_tracklets_boxes_noresize = np.zeros((0, 4), dtype=np.float32) 128 | active_tracklets_feats = np.zeros((0, 256, 7, 7)) 129 | else: 130 | active_tracklets_boxes_noresize = np.stack([t.boxes[-1] for t in active_tracklets], axis=0) 131 | 
active_tracklets_feats = np.stack([t.feats[-1] for t in active_tracklets], axis=0) 132 | resized_img, active_tracklets_boxes = self._resize_image_together_with_boxes(img, 133 | active_tracklets_boxes_noresize) 134 | boxes, scores, third_stage_feats_out, ff_gt_tracklet_scores, sparse_tracklet_scores, \ 135 | tracklet_score_indices = self._pred_func( 136 | resized_img, self._ff_gt_feats, self._ff_gt_tracklet.feats[-1], active_tracklets_feats, 137 | active_tracklets_boxes, self._tracklet_distance_threshold) 138 | boxes = resize_and_clip_boxes(img, resized_img, boxes) 139 | # for simplicity let's just convert it to a dense matrix. If that gets too large, we can still change it. 140 | tracklet_scores = scipy.sparse.coo_matrix((sparse_tracklet_scores, (tracklet_score_indices[:, 0], 141 | tracklet_score_indices[:, 1])), 142 | shape=(len(active_tracklets), scores.size) 143 | ).toarray() 144 | # free memory 145 | for t in self._all_tracklets: 146 | if t.end_time != self._time_idx and t.start_time != 0: 147 | t.feats = None 148 | self._update_tracklets_with_network_outputs(active_tracklets, boxes, scores, third_stage_feats_out, 149 | ff_gt_tracklet_scores, tracklet_scores) 150 | 151 | def _update_tracklets_with_network_outputs(self, active_tracklets, boxes, scores, third_stage_feats_out, 152 | ff_gt_tracklet_scores, tracklet_scores): 153 | n_dets = scores.size 154 | for det_idx in range(n_dets): 155 | merged = False 156 | det_args = (third_stage_feats_out[det_idx], boxes[det_idx], scores[det_idx], 157 | ff_gt_tracklet_scores[det_idx]) 158 | 159 | # try to extend tracklets in active_tracklets 160 | if tracklet_scores.size > 0: 161 | if tracklet_scores[:, det_idx].max() > self._tracklet_merging_threshold: 162 | tracklet_idx = tracklet_scores[:, det_idx].argmax() 163 | max_score = tracklet_scores[tracklet_idx, det_idx] 164 | # there should be no other det which has a high similarity 165 | if (tracklet_scores[tracklet_idx] >= max_score - self._tracklet_merging_second_best_relative_threshold).sum() == 1: 166 | # there should be no other tracklet to which this det is similar... 167 | if (tracklet_scores[:, det_idx] >= max_score - self._tracklet_merging_second_best_relative_threshold).sum() == 1: 168 | active_tracklets[tracklet_idx].add_detection(*det_args) 169 | merged = True 170 | 171 | # otherwise start new tracklet 172 | if not merged: 173 | tracklet = Tracklet(start_time=self._time_idx) 174 | tracklet.add_detection(*det_args) 175 | self._all_tracklets.append(tracklet) 176 | 177 | def _track(self): 178 | # we know that the tracklets are always sorted by time! 
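        # (Added note) Sketch of the dynamic program implemented below, based on the code in this method:
        #   - self._dynprog_scores[i] holds the best accumulated score of any chain of tracklets
        #     ending in tracklet i; index 0 is the first-frame GT tracklet (initialised to 0),
        #     all other entries start at -1e20.
        #   - Each tracklet active in the current frame is linked to its best predecessor among
        #     tracklets that ended no later than its start time (and within TRACKLET_KEEP_ALIVE_TIME),
        #     penalised by the normalised spatial gap between the predecessor's last box and this
        #     tracklet's first box, weighted by self._location_score_weight.
        #   - The tracklet's own contribution is a weighted sum of its first-frame GT scores and
        #     GT-tracklet scores.
        #   - The last box of the highest-scoring tracklet is reported, with a confidence that drops
        #     below zero when that tracklet was not extended in the current frame.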
179 | n_tracklets = len(self._all_tracklets) 180 | last_dynprog_scores = self._dynprog_scores 181 | self._dynprog_scores = np.full(n_tracklets, fill_value=-1e20, dtype=np.float32) 182 | # init gt tracklet score 183 | self._dynprog_scores[0] = 0.0 184 | if last_dynprog_scores is not None: 185 | self._dynprog_scores[:last_dynprog_scores.size] = last_dynprog_scores 186 | end_times = np.array([t.end_time for t in self._all_tracklets]) 187 | im_h, im_w = self._ff_img_noresize.shape[:2] 188 | norm = np.array([im_w, im_h, im_w, im_h], np.float32) 189 | 190 | active_indices, = np.where(end_times >= self._time_idx + 1) 191 | active_tracklets = [self._all_tracklets[idx] for idx in active_indices] 192 | 193 | TRACKLET_KEEP_ALIVE_TIME = 1500 194 | if len(active_tracklets) > 0: 195 | if len(active_tracklets) == n_tracklets: 196 | alive_start_time = 0 197 | else: 198 | # select non-active tracklets: end_times < self._time_idx + 1 199 | alive_start_time = end_times[end_times < self._time_idx + 1].max() 200 | 201 | alive_indices, = np.where(end_times >= alive_start_time + 1 - TRACKLET_KEEP_ALIVE_TIME) 202 | alive_tracklets = [self._all_tracklets[idx] for idx in alive_indices] 203 | alive_end_boxes_cxcywh = xyxy_to_cxcywh_np(np.array([t.boxes[-1] for t in alive_tracklets])) 204 | alive_end_times = end_times[alive_indices] 205 | alive_dynprog_scores = self._dynprog_scores[alive_indices] 206 | active_start_boxes_cxcywh = xyxy_to_cxcywh_np(np.array([t.boxes[0] for t in active_tracklets])) 207 | all_pairwise_diffs = np.abs(active_start_boxes_cxcywh[:, np.newaxis] - alive_end_boxes_cxcywh[np.newaxis]) / norm 208 | all_pairwise_diffs = -all_pairwise_diffs.mean(axis=2) 209 | 210 | for idx, t_idx in enumerate(active_indices): 211 | tracklet = self._all_tracklets[t_idx] 212 | unary = self._ff_gt_score_weight * sum(tracklet.ff_gt_scores) + \ 213 | self._ff_gt_tracklet_score_weight * sum(tracklet.ff_gt_tracklet_scores) 214 | 215 | valid_mask = tracklet.start_time >= alive_end_times 216 | if valid_mask.any(): 217 | pairwise_scores = all_pairwise_diffs[idx] 218 | pred_scores = alive_dynprog_scores + self._location_score_weight * pairwise_scores 219 | pred_scores[np.logical_not(valid_mask)] = -1e20 220 | best_pred_idx = pred_scores.argmax() 221 | best_pred_score = pred_scores[best_pred_idx] 222 | if best_pred_score > -1e20: 223 | self._dynprog_scores[t_idx] = best_pred_score + unary 224 | 225 | t_idx = self._dynprog_scores.argmax() 226 | tracklet = self._all_tracklets[t_idx] 227 | # add current frame score weighted with epsilon to change relative ranking within tracklet 228 | EPSILON = 0.00001 229 | if tracklet.end_time >= self._time_idx + 1: 230 | score = self._ff_gt_score_weight * max(tracklet.ff_gt_scores) + \ 231 | self._ff_gt_tracklet_score_weight * max(tracklet.ff_gt_tracklet_scores) \ 232 | + EPSILON * tracklet.ff_gt_scores[-1] 233 | else: 234 | score = -1.0 + EPSILON * tracklet.ff_gt_scores[-1] 235 | # or we could select the best tracklet in current frame 236 | return tracklet.boxes[-1], score 237 | 238 | if VIZ_WITH_OPENCV: 239 | def _viz_tracklets(self): 240 | print("viz tracklets frame", self._time_idx) 241 | import cv2 242 | self._cv_img = self._imgs_for_viz[-1].copy() 243 | colors = generate_colors() 244 | t = self._time_idx 245 | for idx, tracklet in enumerate(self._all_tracklets): 246 | # probably filter by confidence and tracklet length 247 | #if tracklet.end_time - tracklet.start_time < 2: 248 | # continue 249 | if max(tracklet.ff_gt_scores) < 0.2: 250 | continue 251 | if tracklet.start_time <= t < 
tracklet.end_time: 252 | color = colors[idx % len(colors)] 253 | box = tracklet.boxes[t - tracklet.start_time] 254 | #cv2.rectangle(self._cv_img, (box[0], box[1]), (box[2], box[3]), [255 * x for x in color], 1) 255 | 256 | def _viz_result(self, box): 257 | import cv2 258 | #cv2.rectangle(self._cv_img, (box[0], box[1]), (box[2], box[3]), (0, 0, 255), 6) 259 | cv2.rectangle(self._cv_img, (box[0], box[1]), (box[2], box[3]), (0, 252, 124), 6) 260 | cv2.imshow('SUPERTRACK', self._cv_img) 261 | cv2.waitKey(1) 262 | #cv2.waitKey(0) 263 | else: 264 | def _viz_tracklets(self): 265 | print("viz tracklets frame", self._time_idx) 266 | import matplotlib.pyplot as plt 267 | from matplotlib.patches import Rectangle 268 | if self._ax is None: 269 | fig, self._ax = plt.subplots(1) 270 | colors = generate_colors() 271 | t = self._time_idx 272 | img = self._imgs_for_viz[-1] 273 | self._ax.clear() 274 | self._ax.imshow(img[..., ::-1]) 275 | for idx, tracklet in enumerate(self._all_tracklets): 276 | # probably filter by confidence and tracklet length 277 | if tracklet.end_time - tracklet.start_time < 2: 278 | continue 279 | if max(tracklet.ff_gt_scores) < 0.2: 280 | continue 281 | if tracklet.start_time <= t < tracklet.end_time: 282 | color = colors[idx % len(colors)] 283 | box = tracklet.boxes[t - tracklet.start_time] 284 | width = box[2] - box[0] 285 | height = box[3] - box[1] 286 | rect = Rectangle((box[0], box[1]), width, height, color=color, fill=False) 287 | self._ax.add_patch(rect) 288 | # plt.pause(0.0001) 289 | 290 | def _viz_result(self, box): 291 | width = box[2] - box[0] 292 | height = box[3] - box[1] 293 | import matplotlib.pyplot as plt 294 | from matplotlib.patches import Rectangle 295 | rect = Rectangle((box[0], box[1]), width, height, color="red", fill=False, linewidth=4.0) 296 | self._ax.add_patch(rect) 297 | plt.pause(0.00001) 298 | -------------------------------------------------------------------------------- /tracking/do_tracking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | 5 | from got10k.experiments import ExperimentGOT10k, ExperimentVOT, ExperimentOTB, ExperimentUAV123, ExperimentLaSOT, ExperimentDAVIS, ExperimentYouTubeVOS, ExperimentTrackingNet, ExperimentOxuva, ExperimentNfS, ExperimentTColor128 6 | from got10k.experiments.custom import ExperimentCustom 7 | 8 | from tracking.argmax_tracker import ArgmaxTracker 9 | from tracking.three_stage_tracker import ThreeStageTracker 10 | 11 | # change these data paths to where you have the datasets! 
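# Each dataset is expected as its own subdirectory of DATASET_PREFIX, using the directory
# names given below (e.g. <DATASET_PREFIX>/GOT10k, <DATASET_PREFIX>/LaSOTBenchmark).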
12 | DATASET_PREFIX = "/globalwork/data/" 13 | VOT18_ROOT_DIR = os.path.join(DATASET_PREFIX, 'vot18') 14 | VOT17_ROOT_DIR = os.path.join(DATASET_PREFIX, 'vot17') 15 | VOT16_ROOT_DIR = os.path.join(DATASET_PREFIX, 'vot16') 16 | VOT15_ROOT_DIR = os.path.join(DATASET_PREFIX, 'vot15') 17 | VOT18_LT_ROOT_DIR = os.path.join(DATASET_PREFIX, 'vot18-lt') 18 | OTB_2015_ROOT_DIR = os.path.join(DATASET_PREFIX, 'OTB_new') 19 | OTB_2013_ROOT_DIR = os.path.join(DATASET_PREFIX, 'OTB2013') 20 | DAVIS_2017_ROOT_DIR = os.path.join(DATASET_PREFIX, 'DAVIS2017') 21 | YOUTUBE_VOS_2019_ROOT_DIR = os.path.join(DATASET_PREFIX, "youtube-vos-2019") 22 | GOT10K_ROOT_DIR = os.path.join(DATASET_PREFIX, 'GOT10k') 23 | UAV123_ROOT_DIR = os.path.join(DATASET_PREFIX, 'UAV123') 24 | LASOT_ROOT_DIR = os.path.join(DATASET_PREFIX, 'LaSOTBenchmark') 25 | TRACKINGNET_ROOT_DIR = os.path.join(DATASET_PREFIX, 'TrackingNet') 26 | NFS_ROOT_DIR = os.path.join(DATASET_PREFIX, 'nfs') 27 | TC128_ROOT_DIR = os.path.join(DATASET_PREFIX, 'tc128/Temple-color-128') 28 | OXUVA_ROOT_DIR = os.path.join(DATASET_PREFIX, 'oxuva') 29 | 30 | RESULT_DIR = 'tracking_data/results/' 31 | REPORT_DIR = 'tracking_data/reports/' 32 | 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--start_idx', type=int, help='first video index to process', default=0) 35 | parser.add_argument('--end_idx', type=int, help='last video index to process (exclusive)', default=None) 36 | 37 | # TDPA parameters. You can just leave them at the default values which will work well on a wide range of datasets 38 | parser.add_argument('--tracklet_distance_threshold', type=float, default=0.06) 39 | parser.add_argument('--tracklet_merging_threshold', type=float, default=0.3) 40 | parser.add_argument('--tracklet_merging_second_best_relative_threshold', type=float, default=0.3) 41 | parser.add_argument('--ff_gt_score_weight', type=float, default=0.1) 42 | parser.add_argument('--ff_gt_tracklet_score_weight', type=float, default=0.9) 43 | parser.add_argument('--location_score_weight', type=float, default=7.0) 44 | 45 | parser.add_argument('--model', type=str, default="best", help='one of "best", "nohardexamples", or "gotonly"') 46 | parser.add_argument('--tracker', type=str, default='ThreeStageTracker') 47 | parser.add_argument('--n_proposals', type=int, default=None) 48 | parser.add_argument('--resolution', type=str, default=None) 49 | parser.add_argument('--visualize_tracker', action='store_true', 50 | help='use visualization of tracker (recommended over --visualize_experiment)') 51 | parser.add_argument('--visualize_experiment', action='store_true', 52 | help='use visualization of got experiment (not recommended, usually --visualize_tracker is better)') 53 | parser.add_argument('--custom_dataset_name', type=str, default=None) 54 | parser.add_argument('--custom_dataset_root_dir', type=str, default=None) 55 | parser.add_argument('--main', type=str) 56 | args = parser.parse_args() 57 | 58 | 59 | def build_tracker(): 60 | if args.tracker == "ArgmaxTracker": 61 | return ArgmaxTracker() 62 | elif args.tracker == "ThreeStageTracker": 63 | pass 64 | else: 65 | assert False, ("Unknown tracker", args.tracker) 66 | 67 | tracklet_param_str = str(args.tracklet_distance_threshold) + "_" + str(args.tracklet_merging_threshold) + "_" + \ 68 | str(args.tracklet_merging_second_best_relative_threshold) 69 | if args.n_proposals is not None: 70 | tracklet_param_str += "_proposals" + str(args.n_proposals) 71 | if args.resolution is not None: 72 | tracklet_param_str += "_resolution-" + 
str(args.resolution) 73 | if args.model != "best": 74 | tracklet_param_str = args.model + "_" + tracklet_param_str 75 | if args.visualize_tracker: 76 | tracklet_param_str2 = "viz_" + tracklet_param_str 77 | else: 78 | tracklet_param_str2 = tracklet_param_str 79 | param_str = tracklet_param_str2 + "_" + str(args.ff_gt_score_weight) + "_" + \ 80 | str(args.ff_gt_tracklet_score_weight) + "_" + str(args.location_score_weight) 81 | 82 | name = "ThreeStageTracker_" + param_str 83 | tracker = ThreeStageTracker(tracklet_distance_threshold=args.tracklet_distance_threshold, 84 | tracklet_merging_threshold=args.tracklet_merging_threshold, 85 | tracklet_merging_second_best_relative_threshold= 86 | args.tracklet_merging_second_best_relative_threshold, 87 | ff_gt_score_weight=args.ff_gt_score_weight, 88 | ff_gt_tracklet_score_weight=args.ff_gt_tracklet_score_weight, 89 | location_score_weight=args.location_score_weight, 90 | name=name, 91 | do_viz=args.visualize_tracker, 92 | model=args.model, 93 | n_proposals=args.n_proposals, 94 | resolution=args.resolution) 95 | return tracker 96 | 97 | 98 | def main_vot18(reset=True): 99 | root_dir = VOT18_ROOT_DIR 100 | if reset: 101 | experiments = "supervised" 102 | else: 103 | experiments = "unsupervised" 104 | tracker = build_tracker() 105 | experiment = ExperimentVOT( 106 | root_dir=root_dir, 107 | version=2018, 108 | result_dir=RESULT_DIR, 109 | report_dir=REPORT_DIR, 110 | experiments=experiments, 111 | start_idx=args.start_idx, 112 | end_idx=args.end_idx 113 | ) 114 | experiment.run(tracker, visualize=args.visualize_experiment) 115 | experiment.report([tracker.name]) 116 | 117 | 118 | def main_vot18_noreset(): 119 | main_vot18(reset=False) 120 | 121 | 122 | def main_vot18_threestage(): 123 | tracker = build_tracker() 124 | root_dir = VOT18_ROOT_DIR 125 | experiment = ExperimentVOT( 126 | root_dir=root_dir, 127 | version=2018, 128 | result_dir=RESULT_DIR, 129 | report_dir=REPORT_DIR, 130 | experiments="supervised", 131 | start_idx=args.start_idx, 132 | end_idx=args.end_idx 133 | ) 134 | experiment.run(tracker, visualize=args.visualize_experiment) 135 | experiment.report([tracker.name]) 136 | 137 | 138 | def main_vot17(): 139 | root_dir = VOT17_ROOT_DIR 140 | experiments = "supervised" 141 | tracker = build_tracker() 142 | experiment = ExperimentVOT( 143 | root_dir=root_dir, 144 | version=2017, 145 | result_dir=RESULT_DIR, 146 | report_dir=REPORT_DIR, 147 | experiments=experiments, 148 | start_idx=args.start_idx, 149 | end_idx=args.end_idx 150 | ) 151 | experiment.run(tracker, visualize=args.visualize_experiment) 152 | 153 | 154 | def main_vot16(): 155 | root_dir = VOT16_ROOT_DIR 156 | experiments = "supervised" 157 | tracker = build_tracker() 158 | experiment = ExperimentVOT( 159 | root_dir=root_dir, 160 | version=2016, 161 | result_dir=RESULT_DIR, 162 | report_dir=REPORT_DIR, 163 | experiments=experiments, 164 | start_idx=args.start_idx, 165 | end_idx=args.end_idx 166 | ) 167 | experiment.run(tracker, visualize=args.visualize_experiment) 168 | 169 | 170 | def main_vot15(): 171 | root_dir = VOT15_ROOT_DIR 172 | experiments = "supervised" 173 | tracker = build_tracker() 174 | experiment = ExperimentVOT( 175 | root_dir=root_dir, 176 | version=2015, 177 | result_dir=RESULT_DIR, 178 | report_dir=REPORT_DIR, 179 | experiments=experiments, 180 | start_idx=args.start_idx, 181 | end_idx=args.end_idx 182 | ) 183 | experiment.run(tracker, visualize=args.visualize_experiment) 184 | 185 | 186 | def main_vot18lt(): 187 | tracker = build_tracker() 188 | experiment = 
ExperimentVOT( 189 | root_dir=VOT18_LT_ROOT_DIR, 190 | version='LT2018', 191 | result_dir=RESULT_DIR, 192 | report_dir=REPORT_DIR, 193 | experiments="unsupervised", 194 | start_idx=args.start_idx, 195 | end_idx=args.end_idx 196 | ) 197 | experiment.run(tracker, visualize=args.visualize_experiment) 198 | # this needs to be eval'ed from matlab, so do not call report() 199 | 200 | 201 | def main_otb(): 202 | tracker = build_tracker() 203 | root_dir = OTB_2015_ROOT_DIR 204 | experiment = ExperimentOTB( 205 | root_dir=root_dir, 206 | result_dir=RESULT_DIR, 207 | report_dir=REPORT_DIR, 208 | start_idx=args.start_idx, 209 | end_idx=args.end_idx 210 | ) 211 | experiment.run(tracker, visualize=args.visualize_experiment) 212 | experiment.report([tracker.name]) 213 | 214 | 215 | def main_otb2013(): 216 | tracker = build_tracker() 217 | root_dir = OTB_2013_ROOT_DIR 218 | experiment = ExperimentOTB( 219 | version=2013, 220 | root_dir=root_dir, 221 | result_dir=RESULT_DIR, 222 | report_dir=REPORT_DIR, 223 | start_idx=args.start_idx, 224 | end_idx=args.end_idx 225 | ) 226 | experiment.run(tracker, visualize=args.visualize_experiment) 227 | experiment.report([tracker.name]) 228 | 229 | 230 | def main_otb50(): 231 | tracker = build_tracker() 232 | root_dir = OTB_2015_ROOT_DIR 233 | experiment = ExperimentOTB( 234 | version='tb50', 235 | root_dir=root_dir, 236 | result_dir=RESULT_DIR, 237 | report_dir=REPORT_DIR, 238 | start_idx=args.start_idx, 239 | end_idx=args.end_idx 240 | ) 241 | experiment.run(tracker, visualize=args.visualize_experiment) 242 | experiment.report([tracker.name]) 243 | 244 | 245 | def main_davis(version="2017_val"): 246 | tracker = build_tracker() 247 | root_dir = DAVIS_2017_ROOT_DIR 248 | experiment = ExperimentDAVIS( 249 | root_dir=root_dir, 250 | result_dir=RESULT_DIR, 251 | report_dir=REPORT_DIR, 252 | start_idx=args.start_idx, 253 | end_idx=args.end_idx, 254 | version=version 255 | ) 256 | experiment.run(tracker, visualize=args.visualize_experiment) 257 | experiment.report([tracker.name]) 258 | 259 | 260 | def main_davis2016(): 261 | main_davis(version="2016_val") 262 | 263 | 264 | def main_davis2017(): 265 | main_davis(version="2017_val") 266 | 267 | 268 | def main_davis2017_testdev(): 269 | main_davis(version="2017_testdev") 270 | 271 | 272 | def main_davis2017_train(): 273 | main_davis(version="2017_train") 274 | 275 | 276 | def main_davis2017_train_multiobj(): 277 | main_davis(version="2017_train_multiobj") 278 | 279 | 280 | def main_youtubevos(version="valid"): 281 | tracker = build_tracker() 282 | root_dir = YOUTUBE_VOS_2019_ROOT_DIR 283 | experiment = ExperimentYouTubeVOS( 284 | root_dir=root_dir, 285 | result_dir=RESULT_DIR, 286 | report_dir=REPORT_DIR, 287 | start_idx=args.start_idx, 288 | end_idx=args.end_idx, 289 | version=version 290 | ) 291 | experiment.run(tracker, visualize=args.visualize_experiment) 292 | 293 | 294 | def main_got(subset='val'): 295 | dataset_name = "GOT10k" 296 | if subset != 'val': 297 | dataset_name += "_" + subset 298 | tracker = build_tracker() 299 | experiment = ExperimentGOT10k( 300 | root_dir=GOT10K_ROOT_DIR, # GOT-10k's root directory 301 | subset=subset, # 'train' | 'val' | 'test' 302 | result_dir=RESULT_DIR, # where to store tracking results 303 | report_dir=REPORT_DIR, # where to store evaluation reports 304 | start_idx=args.start_idx, 305 | end_idx=args.end_idx 306 | ) 307 | experiment.run(tracker, visualize=args.visualize_experiment) 308 | experiment.report([tracker.name]) 309 | 310 | 311 | def main_got_test(): 312 | 
main_got(subset='test') 313 | 314 | 315 | def main_uav123(): 316 | tracker = build_tracker() 317 | experiment = ExperimentUAV123( 318 | root_dir=UAV123_ROOT_DIR, 319 | result_dir=RESULT_DIR, 320 | report_dir=REPORT_DIR, 321 | start_idx=args.start_idx, 322 | end_idx=args.end_idx 323 | ) 324 | experiment.run(tracker, visualize=args.visualize_experiment) 325 | experiment.report([tracker.name]) 326 | 327 | 328 | def main_uav20l(): 329 | tracker = build_tracker() 330 | experiment = ExperimentUAV123( 331 | root_dir=UAV123_ROOT_DIR, 332 | version='UAV20L', 333 | result_dir=RESULT_DIR, 334 | report_dir=REPORT_DIR, 335 | start_idx=args.start_idx, 336 | end_idx=args.end_idx 337 | ) 338 | experiment.run(tracker, visualize=args.visualize_experiment) 339 | experiment.report([tracker.name]) 340 | 341 | 342 | def main_lasot(): 343 | tracker = build_tracker() 344 | experiment = ExperimentLaSOT( 345 | root_dir=LASOT_ROOT_DIR, 346 | result_dir=RESULT_DIR, 347 | report_dir=REPORT_DIR, 348 | subset='test', 349 | start_idx=args.start_idx, 350 | end_idx=args.end_idx 351 | ) 352 | experiment.run(tracker, visualize=args.visualize_experiment) 353 | experiment.report([tracker.name]) 354 | 355 | 356 | def main_trackingnet(): 357 | tracker = build_tracker() 358 | experiment = ExperimentTrackingNet( 359 | root_dir=TRACKINGNET_ROOT_DIR, 360 | result_dir=RESULT_DIR, 361 | report_dir=REPORT_DIR, 362 | subset='test', 363 | start_idx=args.start_idx, 364 | end_idx=args.end_idx 365 | ) 366 | experiment.run(tracker, visualize=args.visualize_experiment) 367 | 368 | 369 | def main_nfs(): 370 | tracker = build_tracker() 371 | experiment = ExperimentNfS( 372 | root_dir=NFS_ROOT_DIR, 373 | fps=30, 374 | result_dir=RESULT_DIR, 375 | report_dir=REPORT_DIR, 376 | start_idx=args.start_idx, 377 | end_idx=args.end_idx 378 | ) 379 | experiment.run(tracker, visualize=args.visualize_experiment) 380 | experiment.report([tracker.name]) 381 | 382 | 383 | def main_tc128(): 384 | tracker = build_tracker() 385 | experiment = ExperimentTColor128( 386 | root_dir=TC128_ROOT_DIR, 387 | result_dir=RESULT_DIR, 388 | report_dir=REPORT_DIR, 389 | start_idx=args.start_idx, 390 | end_idx=args.end_idx 391 | ) 392 | experiment.run(tracker, visualize=args.visualize_experiment) 393 | experiment.report([tracker.name]) 394 | 395 | 396 | def main_oxuva(testset=True): 397 | tracker = build_tracker() 398 | experiment = ExperimentOxuva( 399 | root_dir=OXUVA_ROOT_DIR, 400 | result_dir=RESULT_DIR, 401 | report_dir=REPORT_DIR, 402 | subset='test' if testset else 'dev', 403 | start_idx=args.start_idx, 404 | end_idx=args.end_idx 405 | ) 406 | experiment.run(tracker, visualize=args.visualize_experiment) 407 | 408 | 409 | def main_oxuva_dev(): 410 | main_oxuva(testset=False) 411 | 412 | 413 | def main_custom(): 414 | custom_dataset_root_dir = args.custom_dataset_root_dir 415 | assert custom_dataset_root_dir is not None 416 | custom_dataset_name = args.custom_dataset_name 417 | assert custom_dataset_name is not None 418 | tracker = build_tracker() 419 | experiment = ExperimentCustom( 420 | root_dir=custom_dataset_root_dir, 421 | name=custom_dataset_name 422 | ) 423 | experiment.run(tracker, visualize=args.visualize_experiment) 424 | 425 | 426 | if __name__ == "__main__": 427 | assert args.main is not None, "--main not supplied, e.g. 
--main main_otb" 428 | eval(args.main + "()") 429 | -------------------------------------------------------------------------------- /model_frcnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: model.py 3 | 4 | import tensorflow as tf 5 | 6 | from tensorpack.models import Conv2D, FullyConnected, layer_register 7 | from tensorpack.tfutils.argscope import argscope 8 | from tensorpack.tfutils.common import get_tf_version_tuple 9 | from tensorpack.tfutils.scope_utils import under_name_scope 10 | from tensorpack.tfutils.summary import add_moving_summary 11 | from tensorpack.utils.argtools import memoized_method 12 | 13 | from basemodel import GroupNorm 14 | from config import config as cfg 15 | from model_box import decode_bbox_target, encode_bbox_target 16 | from utils.box_ops import pairwise_iou 17 | 18 | 19 | @under_name_scope() 20 | def proposal_metrics(iou): 21 | """ 22 | Add summaries for RPN proposals. 23 | 24 | Args: 25 | iou: nxm, #proposal x #gt 26 | """ 27 | # find best roi for each gt, for summary only 28 | best_iou = tf.reduce_max(iou, axis=0) 29 | mean_best_iou = tf.reduce_mean(best_iou, name='best_iou_per_gt') 30 | summaries = [mean_best_iou] 31 | with tf.device('/cpu:0'): 32 | for th in [0.3, 0.5]: 33 | recall = tf.truediv( 34 | tf.count_nonzero(best_iou >= th), 35 | tf.size(best_iou, out_type=tf.int64), 36 | name='recall_iou{}'.format(th)) 37 | summaries.append(recall) 38 | add_moving_summary(*summaries) 39 | 40 | 41 | @under_name_scope() 42 | def sample_fast_rcnn_targets(boxes, gt_boxes, gt_labels): 43 | """ 44 | Sample some boxes from all proposals for training. 45 | #fg is guaranteed to be > 0, because ground truth boxes will be added as proposals. 46 | 47 | Args: 48 | boxes: nx4 region proposals, floatbox 49 | gt_boxes: mx4, floatbox 50 | gt_labels: m, int32 51 | 52 | Returns: 53 | A BoxProposals instance. 54 | sampled_boxes: tx4 floatbox, the rois 55 | sampled_labels: t int64 labels, in [0, #class). Positive means foreground. 56 | fg_inds_wrt_gt: #fg indices, each in range [0, m-1]. 57 | It contains the matching GT of each foreground roi. 
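        Example: if three foreground rois are sampled and their best-matching ground truth
        boxes are 0, 0 and 2 (in that order), then fg_inds_wrt_gt is [0, 0, 2].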
58 | """ 59 | iou = pairwise_iou(boxes, gt_boxes) # nxm 60 | proposal_metrics(iou) 61 | 62 | # add ground truth as proposals as well 63 | boxes = tf.concat([boxes, gt_boxes], axis=0) # (n+m) x 4 64 | iou = tf.concat([iou, tf.eye(tf.shape(gt_boxes)[0])], axis=0) # (n+m) x m 65 | # #proposal=n+m from now on 66 | 67 | def sample_fg_bg(iou): 68 | fg_mask = tf.reduce_max(iou, axis=1) >= cfg.FRCNN.FG_THRESH 69 | 70 | fg_inds = tf.reshape(tf.where(fg_mask), [-1]) 71 | num_fg = tf.minimum(int( 72 | cfg.FRCNN.BATCH_PER_IM * cfg.FRCNN.FG_RATIO), 73 | tf.size(fg_inds), name='num_fg') 74 | fg_inds = tf.random_shuffle(fg_inds)[:num_fg] 75 | 76 | bg_inds = tf.reshape(tf.where(tf.logical_not(fg_mask)), [-1]) 77 | num_bg = tf.minimum( 78 | cfg.FRCNN.BATCH_PER_IM - num_fg, 79 | tf.size(bg_inds), name='num_bg') 80 | bg_inds = tf.random_shuffle(bg_inds)[:num_bg] 81 | 82 | add_moving_summary(num_fg, num_bg) 83 | return fg_inds, bg_inds 84 | 85 | fg_inds, bg_inds = sample_fg_bg(iou) 86 | # fg,bg indices w.r.t proposals 87 | 88 | best_iou_ind = tf.argmax(iou, axis=1) # #proposal, each in 0~m-1 89 | fg_inds_wrt_gt = tf.gather(best_iou_ind, fg_inds) # num_fg 90 | 91 | all_indices = tf.concat([fg_inds, bg_inds], axis=0) # indices w.r.t all n+m proposal boxes 92 | ret_boxes = tf.gather(boxes, all_indices) 93 | 94 | ret_labels = tf.concat( 95 | [tf.gather(gt_labels, fg_inds_wrt_gt), 96 | tf.zeros_like(bg_inds, dtype=tf.int64)], axis=0) 97 | # stop the gradient -- they are meant to be training targets 98 | return BoxProposals( 99 | tf.stop_gradient(ret_boxes, name='sampled_proposal_boxes'), 100 | tf.stop_gradient(ret_labels, name='sampled_labels'), 101 | tf.stop_gradient(fg_inds_wrt_gt)) 102 | 103 | 104 | @layer_register(log_shape=True) 105 | def fastrcnn_outputs(feature, num_classes, class_agnostic_regression=False): 106 | """ 107 | Args: 108 | feature (any shape): 109 | num_classes(int): num_category + 1 110 | class_agnostic_regression (bool): if True, regression to N x 1 x 4 111 | 112 | Returns: 113 | cls_logits: N x num_class classification logits 114 | reg_logits: N x num_classx4 or Nx2x4 if class agnostic 115 | """ 116 | classification = FullyConnected( 117 | 'class', feature, num_classes, 118 | kernel_initializer=tf.random_normal_initializer(stddev=0.01)) 119 | num_classes_for_box = 1 if class_agnostic_regression else num_classes 120 | box_regression = FullyConnected( 121 | 'box', feature, num_classes_for_box * 4, 122 | kernel_initializer=tf.random_normal_initializer(stddev=0.001)) 123 | box_regression = tf.reshape(box_regression, (-1, num_classes_for_box, 4), name='output_box') 124 | return classification, box_regression 125 | 126 | 127 | @under_name_scope() 128 | def fastrcnn_losses(labels, label_logits, fg_boxes, fg_box_logits): 129 | """ 130 | Args: 131 | labels: n, 132 | label_logits: nxC 133 | fg_boxes: nfgx4, encoded 134 | fg_box_logits: nfgxCx4 or nfgx1x4 if class agnostic 135 | 136 | Returns: 137 | label_loss, box_loss 138 | """ 139 | label_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( 140 | labels=labels, logits=label_logits) 141 | if cfg.FRCNN.USE_FOCAL_LOSS: 142 | indices = tf.stack((tf.cast(tf.range(tf.shape(labels)[0]), tf.int64), labels), axis=1) 143 | posteriors = tf.nn.softmax(label_logits, axis=1) 144 | gathered_posteriors = tf.gather_nd(posteriors, indices) 145 | gamma = 2.0 146 | label_loss = 5 * (1 - gathered_posteriors) ** gamma * label_loss 147 | # possibly upweight the foreground labels for balancing 148 | if cfg.FRCNN.FG_LOSS_WEIGHTING_FACTOR != 1.0: 149 | label_loss *= 
(tf.constant(cfg.FRCNN.FG_LOSS_WEIGHTING_FACTOR - 1.0, dtype=tf.float32) * \ 150 | tf.cast(labels > 0, tf.float32)) + 1.0 151 | 152 | label_loss = tf.reduce_mean(label_loss, name='label_loss') 153 | 154 | fg_inds = tf.where(labels > 0)[:, 0] 155 | fg_labels = tf.gather(labels, fg_inds) 156 | num_fg = tf.size(fg_inds, out_type=tf.int64) 157 | empty_fg = tf.equal(num_fg, 0) 158 | if int(fg_box_logits.shape[1]) > 1: 159 | indices = tf.stack( 160 | [tf.range(num_fg), fg_labels], axis=1) # #fgx2 161 | fg_box_logits = tf.gather_nd(fg_box_logits, indices) 162 | else: 163 | fg_box_logits = tf.reshape(fg_box_logits, [-1, 4]) 164 | 165 | with tf.name_scope('label_metrics'), tf.device('/cpu:0'): 166 | prediction = tf.argmax(label_logits, axis=1, name='label_prediction') 167 | correct = tf.cast(tf.equal(prediction, labels), tf.float32) # boolean/integer gather is unavailable on GPU 168 | accuracy = tf.reduce_mean(correct, name='accuracy') 169 | fg_label_pred = tf.argmax(tf.gather(label_logits, fg_inds), axis=1) 170 | num_zero = tf.reduce_sum(tf.cast(tf.equal(fg_label_pred, 0), tf.int64), name='num_zero') 171 | false_negative = tf.where( 172 | empty_fg, 0., tf.cast(tf.truediv(num_zero, num_fg), tf.float32), name='false_negative') 173 | fg_accuracy = tf.where( 174 | empty_fg, 0., tf.reduce_mean(tf.gather(correct, fg_inds)), name='fg_accuracy') 175 | 176 | box_loss = tf.losses.huber_loss( 177 | fg_boxes, fg_box_logits, reduction=tf.losses.Reduction.SUM) 178 | box_loss *= cfg.FRCNN.BOX_LOSS_WEIGHTING_FACTOR 179 | box_loss = tf.truediv( 180 | box_loss, tf.cast(tf.shape(labels)[0], tf.float32), name='box_loss') 181 | 182 | add_moving_summary(label_loss, box_loss, accuracy, 183 | fg_accuracy, false_negative, tf.cast(num_fg, tf.float32, name='num_fg_label')) 184 | return [label_loss, box_loss] 185 | 186 | 187 | @under_name_scope() 188 | def fastrcnn_predictions(boxes, scores): 189 | """ 190 | Generate final results from predictions of all proposals. 
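    (Per class, detections are first filtered by cfg.TEST.RESULT_SCORE_THRESH and suppressed
    with NMS at cfg.TEST.FRCNN_NMS_THRESH; the survivors from all classes are then reduced to
    the top cfg.TEST.RESULTS_PER_IM by score.)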
191 | 192 | Args: 193 | boxes: n#classx4 floatbox in float32 194 | scores: nx#class 195 | 196 | Returns: 197 | boxes: Kx4 198 | scores: K 199 | labels: K 200 | """ 201 | assert boxes.shape[1] == cfg.DATA.NUM_CLASS 202 | assert scores.shape[1] == cfg.DATA.NUM_CLASS 203 | boxes = tf.transpose(boxes, [1, 0, 2])[1:, :, :] # #catxnx4 204 | scores = tf.transpose(scores[:, 1:], [1, 0]) # #catxn 205 | 206 | def f(X): 207 | """ 208 | prob: n probabilities 209 | box: nx4 boxes 210 | 211 | Returns: n boolean, the selection 212 | """ 213 | prob, box = X 214 | output_shape = tf.shape(prob, out_type=tf.int64) 215 | # filter by score threshold 216 | ids = tf.reshape(tf.where(prob > cfg.TEST.RESULT_SCORE_THRESH), [-1]) 217 | prob = tf.gather(prob, ids) 218 | box = tf.gather(box, ids) 219 | # NMS within each class 220 | nms_thres = cfg.TEST.FRCNN_NMS_THRESH 221 | selection = tf.image.non_max_suppression( 222 | box, prob, cfg.TEST.RESULTS_PER_IM, nms_thres) 223 | selection = tf.gather(ids, selection) 224 | 225 | if get_tf_version_tuple() >= (1, 13): 226 | sorted_selection = tf.sort(selection, direction='ASCENDING') 227 | mask = tf.sparse.SparseTensor(indices=tf.expand_dims(sorted_selection, 1), 228 | values=tf.ones_like(sorted_selection, dtype=tf.bool), 229 | dense_shape=output_shape) 230 | mask = tf.sparse.to_dense(mask, default_value=False) 231 | else: 232 | # this function is deprecated by TF 233 | sorted_selection = -tf.nn.top_k(-selection, k=tf.size(selection))[0] 234 | mask = tf.sparse_to_dense( 235 | sparse_indices=sorted_selection, 236 | output_shape=output_shape, 237 | sparse_values=True, 238 | default_value=False) 239 | return mask 240 | 241 | # TF bug in version 1.11, 1.12: https://github.com/tensorflow/tensorflow/issues/22750 242 | buggy_tf = get_tf_version_tuple() in [(1, 11), (1, 12)] 243 | masks = tf.map_fn(f, (scores, boxes), dtype=tf.bool, 244 | parallel_iterations=1 if buggy_tf else 10) # #cat x N 245 | selected_indices = tf.where(masks) # #selection x 2, each is (cat_id, box_id) 246 | scores = tf.boolean_mask(scores, masks) 247 | 248 | # filter again by sorting scores 249 | topk_scores, topk_indices = tf.nn.top_k( 250 | scores, 251 | tf.minimum(cfg.TEST.RESULTS_PER_IM, tf.size(scores)), 252 | sorted=False) 253 | filtered_selection = tf.gather(selected_indices, topk_indices) 254 | cat_ids, box_ids = tf.unstack(filtered_selection, axis=1) 255 | 256 | final_scores = tf.identity(topk_scores, name='scores') 257 | final_labels = tf.add(cat_ids, 1, name='labels') 258 | final_ids = tf.stack([cat_ids, box_ids], axis=1, name='all_ids') 259 | final_boxes = tf.gather_nd(boxes, final_ids, name='boxes') 260 | return final_boxes, final_scores, final_labels 261 | 262 | 263 | """ 264 | FastRCNN heads for FPN: 265 | """ 266 | 267 | 268 | @layer_register(log_shape=True) 269 | def fastrcnn_2fc_head(feature): 270 | """ 271 | Args: 272 | feature (any shape): 273 | 274 | Returns: 275 | 2D head feature 276 | """ 277 | dim = cfg.FPN.FRCNN_FC_HEAD_DIM 278 | init = tf.variance_scaling_initializer() 279 | hidden = FullyConnected('fc6', feature, dim, kernel_initializer=init, activation=tf.nn.relu) 280 | hidden = FullyConnected('fc7', hidden, dim, kernel_initializer=init, activation=tf.nn.relu) 281 | return hidden 282 | 283 | 284 | @layer_register(log_shape=True) 285 | def fastrcnn_Xconv1fc_head(feature, num_convs, norm=None): 286 | """ 287 | Args: 288 | feature (NCHW): 289 | num_classes(int): num_category + 1 290 | num_convs (int): number of conv layers 291 | norm (str or None): either None or 'GN' 292 | 293 | 
Returns: 294 | 2D head feature 295 | """ 296 | assert norm in [None, 'GN'], norm 297 | l = feature 298 | with argscope(Conv2D, data_format='channels_first', 299 | kernel_initializer=tf.variance_scaling_initializer( 300 | scale=2.0, mode='fan_out', 301 | distribution='untruncated_normal' if get_tf_version_tuple() >= (1, 12) else 'normal')): 302 | for k in range(num_convs): 303 | l = Conv2D('conv{}'.format(k), l, cfg.FPN.FRCNN_CONV_HEAD_DIM, 3, activation=tf.nn.relu) 304 | if norm is not None: 305 | l = GroupNorm('gn{}'.format(k), l) 306 | l = FullyConnected('fc', l, cfg.FPN.FRCNN_FC_HEAD_DIM, 307 | kernel_initializer=tf.variance_scaling_initializer(), activation=tf.nn.relu) 308 | return l 309 | 310 | 311 | def fastrcnn_4conv1fc_head(*args, **kwargs): 312 | return fastrcnn_Xconv1fc_head(*args, num_convs=4, **kwargs) 313 | 314 | 315 | def fastrcnn_4conv1fc_gn_head(*args, **kwargs): 316 | return fastrcnn_Xconv1fc_head(*args, num_convs=4, norm='GN', **kwargs) 317 | 318 | 319 | class BoxProposals(object): 320 | """ 321 | A structure to manage box proposals and their relations with ground truth. 322 | """ 323 | def __init__(self, boxes, labels=None, fg_inds_wrt_gt=None): 324 | """ 325 | Args: 326 | boxes: Nx4 327 | labels: N, each in [0, #class), the true label for each input box 328 | fg_inds_wrt_gt: #fg, each in [0, M) 329 | 330 | The last four arguments could be None when not training. 331 | """ 332 | for k, v in locals().items(): 333 | if k != 'self' and v is not None: 334 | setattr(self, k, v) 335 | 336 | @memoized_method 337 | def fg_inds(self): 338 | """ Returns: #fg indices in [0, N-1] """ 339 | return tf.reshape(tf.where(self.labels > 0), [-1], name='fg_inds') 340 | 341 | @memoized_method 342 | def fg_boxes(self): 343 | """ Returns: #fg x4""" 344 | return tf.gather(self.boxes, self.fg_inds(), name='fg_boxes') 345 | 346 | @memoized_method 347 | def fg_labels(self): 348 | """ Returns: #fg""" 349 | return tf.gather(self.labels, self.fg_inds(), name='fg_labels') 350 | 351 | 352 | class FastRCNNHead(object): 353 | """ 354 | A class to process & decode inputs/outputs of a fastrcnn classification+regression head. 355 | """ 356 | def __init__(self, proposals, box_logits, label_logits, gt_boxes, bbox_regression_weights): 357 | """ 358 | Args: 359 | proposals: BoxProposals 360 | box_logits: Nx#classx4 or Nx1x4, the output of the head 361 | label_logits: Nx#class, the output of the head 362 | gt_boxes: Mx4 363 | bbox_regression_weights: a 4 element tensor 364 | """ 365 | for k, v in locals().items(): 366 | if k != 'self' and v is not None: 367 | setattr(self, k, v) 368 | self._bbox_class_agnostic = int(box_logits.shape[1]) == 1 369 | 370 | @memoized_method 371 | def fg_box_logits(self): 372 | """ Returns: #fg x ? 
x 4 """ 373 | return tf.gather(self.box_logits, self.proposals.fg_inds(), name='fg_box_logits') 374 | 375 | @memoized_method 376 | def losses(self): 377 | encoded_fg_gt_boxes = encode_bbox_target( 378 | tf.gather(self.gt_boxes, self.proposals.fg_inds_wrt_gt), 379 | self.proposals.fg_boxes()) * self.bbox_regression_weights 380 | return fastrcnn_losses( 381 | self.proposals.labels, self.label_logits, 382 | encoded_fg_gt_boxes, self.fg_box_logits() 383 | ) 384 | 385 | @memoized_method 386 | def decoded_output_boxes(self): 387 | """ Returns: N x #class x 4 """ 388 | anchors = tf.tile(tf.expand_dims(self.proposals.boxes, 1), 389 | [1, cfg.DATA.NUM_CLASS, 1]) # N x #class x 4 390 | decoded_boxes = decode_bbox_target( 391 | self.box_logits / self.bbox_regression_weights, 392 | anchors 393 | ) 394 | return decoded_boxes 395 | 396 | @memoized_method 397 | def decoded_output_boxes_for_true_label(self): 398 | """ Returns: Nx4 decoded boxes """ 399 | return self._decoded_output_boxes_for_label(self.proposals.labels) 400 | 401 | @memoized_method 402 | def decoded_output_boxes_for_predicted_label(self): 403 | """ Returns: Nx4 decoded boxes """ 404 | return self._decoded_output_boxes_for_label(self.predicted_labels()) 405 | 406 | @memoized_method 407 | def decoded_output_boxes_for_label(self, labels): 408 | assert not self._bbox_class_agnostic 409 | indices = tf.stack([ 410 | tf.range(tf.size(labels, out_type=tf.int64)), 411 | labels 412 | ]) 413 | needed_logits = tf.gather_nd(self.box_logits, indices) 414 | decoded = decode_bbox_target( 415 | needed_logits / self.bbox_regression_weights, 416 | self.proposals.boxes 417 | ) 418 | return decoded 419 | 420 | @memoized_method 421 | def decoded_output_boxes_class_agnostic(self): 422 | """ Returns: Nx4 """ 423 | assert self._bbox_class_agnostic 424 | box_logits = tf.reshape(self.box_logits, [-1, 4]) 425 | decoded = decode_bbox_target( 426 | box_logits / self.bbox_regression_weights, 427 | self.proposals.boxes 428 | ) 429 | return decoded 430 | 431 | @memoized_method 432 | def output_scores(self, name=None): 433 | """ Returns: N x #class scores, summed to one for each box.""" 434 | return tf.nn.softmax(self.label_logits, name=name) 435 | 436 | @memoized_method 437 | def predicted_labels(self): 438 | """ Returns: N ints """ 439 | return tf.argmax(self.label_logits, axis=1, name='predicted_labels') 440 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: coco.py 3 | 4 | import numpy as np 5 | import random 6 | import os 7 | import tqdm 8 | import json 9 | import glob 10 | 11 | from tensorpack.utils import logger 12 | from tensorpack.utils.timer import timed_operation 13 | 14 | from config import config as cfg 15 | 16 | __all__ = ['COCODetection', 'DetectionDataset'] 17 | 18 | 19 | class COCODetection(object): 20 | # handle the weird (but standard) split of train and val 21 | _INSTANCE_TO_BASEDIR = { 22 | 'valminusminival2014': 'val2014', 23 | 'minival2014': 'val2014', 24 | } 25 | 26 | COCO_id_to_category_id = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 27: 25, 28: 26, 31: 27, 32: 28, 33: 29, 34: 30, 35: 31, 36: 32, 37: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 46: 41, 47: 42, 48: 43, 49: 44, 50: 45, 51: 46, 52: 47, 53: 48, 54: 49, 55: 50, 56: 51, 57: 52, 
58: 53, 59: 54, 60: 55, 61: 56, 62: 57, 63: 58, 64: 59, 65: 60, 67: 61, 70: 62, 72: 63, 73: 64, 74: 65, 75: 66, 76: 67, 77: 68, 78: 69, 79: 70, 80: 71, 81: 72, 82: 73, 84: 74, 85: 75, 86: 76, 87: 77, 88: 78, 89: 79, 90: 80} # noqa 27 | """ 28 | Mapping from the incontinuous COCO category id to an id in [1, #category] 29 | For your own dataset, this should usually be an identity mapping. 30 | """ 31 | 32 | # 80 names for COCO 33 | class_names = [ 34 | "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] # noqa 35 | 36 | def __init__(self, basedir, name): 37 | self.name = name 38 | self._imgdir = os.path.realpath(os.path.join( 39 | basedir, self._INSTANCE_TO_BASEDIR.get(name, name))) 40 | assert os.path.isdir(self._imgdir), self._imgdir 41 | annotation_file = os.path.join( 42 | basedir, 'annotations/instances_{}.json'.format(name)) 43 | assert os.path.isfile(annotation_file), annotation_file 44 | 45 | from pycocotools.coco import COCO 46 | self.coco = COCO(annotation_file) 47 | logger.info("Instances loaded from {}.".format(annotation_file)) 48 | 49 | # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb 50 | def print_coco_metrics(self, json_file): 51 | """ 52 | Args: 53 | json_file (str): path to the results json file in coco format 54 | Returns: 55 | dict: the evaluation metrics 56 | """ 57 | from pycocotools.cocoeval import COCOeval 58 | ret = {} 59 | cocoDt = self.coco.loadRes(json_file) 60 | cocoEval = COCOeval(self.coco, cocoDt, 'bbox') 61 | cocoEval.evaluate() 62 | cocoEval.accumulate() 63 | cocoEval.summarize() 64 | fields = ['IoU=0.5:0.95', 'IoU=0.5', 'IoU=0.75', 'small', 'medium', 'large'] 65 | for k in range(6): 66 | ret['mAP(bbox)/' + fields[k]] = cocoEval.stats[k] 67 | 68 | json_obj = json.load(open(json_file)) 69 | if len(json_obj) > 0 and 'segmentation' in json_obj[0]: 70 | cocoEval = COCOeval(self.coco, cocoDt, 'segm') 71 | cocoEval.evaluate() 72 | cocoEval.accumulate() 73 | cocoEval.summarize() 74 | for k in range(6): 75 | ret['mAP(segm)/' + fields[k]] = cocoEval.stats[k] 76 | return ret 77 | 78 | def load(self, add_gt=True, add_mask=False): 79 | """ 80 | Args: 81 | add_gt: whether to add ground truth bounding box annotations to the dicts 82 | add_mask: whether to also add ground truth mask 83 | 84 | Returns: 85 | a list of dict, each has keys including: 86 | 'height', 'width', 'id', 'file_name', 87 | and (if add_gt is True) 'boxes', 'class', 'is_crowd', and optionally 88 | 'segmentation'. 
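        'boxes' are absolute float32 coordinates in [x1, y1, x2, y2] format and
        'class' labels lie in [1, #category].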
89 | """ 90 | if add_mask: 91 | assert add_gt 92 | with timed_operation('Load Groundtruth Boxes for {}'.format(self.name)): 93 | img_ids = self.coco.getImgIds() 94 | img_ids.sort() 95 | # list of dict, each has keys: height,width,id,file_name 96 | imgs = self.coco.loadImgs(img_ids) 97 | 98 | for img in tqdm.tqdm(imgs): 99 | self._use_absolute_file_name(img) 100 | if add_gt: 101 | self._add_detection_gt(img, add_mask) 102 | return imgs 103 | 104 | def _use_absolute_file_name(self, img): 105 | """ 106 | Change relative filename to abosolute file name. 107 | """ 108 | img['file_name'] = os.path.join( 109 | self._imgdir, img['file_name']) 110 | assert os.path.isfile(img['file_name']), img['file_name'] 111 | 112 | def _add_detection_gt(self, img, add_mask): 113 | """ 114 | Add 'boxes', 'class', 'is_crowd' of this image to the dict, used by detection. 115 | If add_mask is True, also add 'segmentation' in coco poly format. 116 | """ 117 | # ann_ids = self.coco.getAnnIds(imgIds=img['id']) 118 | # objs = self.coco.loadAnns(ann_ids) 119 | objs = self.coco.imgToAnns[img['id']] # equivalent but faster than the above two lines 120 | 121 | # clean-up boxes 122 | valid_objs = [] 123 | width = img['width'] 124 | height = img['height'] 125 | for objid, obj in enumerate(objs): 126 | if obj.get('ignore', 0) == 1: 127 | continue 128 | x1, y1, w, h = obj['bbox'] 129 | # bbox is originally in float 130 | # x1/y1 means upper-left corner and w/h means true w/h. This can be verified by segmentation pixels. 131 | # But we do make an assumption here that (0.0, 0.0) is upper-left corner of the first pixel 132 | 133 | x1 = np.clip(float(x1), 0, width) 134 | y1 = np.clip(float(y1), 0, height) 135 | w = np.clip(float(x1 + w), 0, width) - x1 136 | h = np.clip(float(y1 + h), 0, height) - y1 137 | # Require non-zero seg area and more than 1x1 box size 138 | if obj['area'] > 1 and w > 0 and h > 0 and w * h >= 4: 139 | obj['bbox'] = [x1, y1, x1 + w, y1 + h] 140 | valid_objs.append(obj) 141 | 142 | if add_mask: 143 | segs = obj['segmentation'] 144 | if not isinstance(segs, list): 145 | assert obj['iscrowd'] == 1 146 | obj['segmentation'] = None 147 | else: 148 | valid_segs = [np.asarray(p).reshape(-1, 2).astype('float32') for p in segs if len(p) >= 6] 149 | if len(valid_segs) == 0: 150 | logger.error("Object {} in image {} has no valid polygons!".format(objid, img['file_name'])) 151 | elif len(valid_segs) < len(segs): 152 | logger.warn("Object {} in image {} has invalid polygons!".format(objid, img['file_name'])) 153 | 154 | obj['segmentation'] = valid_segs 155 | 156 | # all geometrically-valid boxes are returned 157 | boxes = np.asarray([obj['bbox'] for obj in valid_objs], dtype='float32') # (n, 4) 158 | cls = np.asarray([ 159 | self.COCO_id_to_category_id[obj['category_id']] 160 | for obj in valid_objs], dtype='int32') # (n,) 161 | is_crowd = np.asarray([obj['iscrowd'] for obj in valid_objs], dtype='int8') 162 | 163 | # add the keys 164 | img['boxes'] = boxes # nx4 165 | img['class'] = cls # n, always >0 166 | img['is_crowd'] = is_crowd # n, 167 | if add_mask: 168 | # also required to be float32 169 | img['segmentation'] = [ 170 | obj['segmentation'] for obj in valid_objs] 171 | 172 | @staticmethod 173 | def load_many(basedir, names, add_gt=True, add_mask=False): 174 | """ 175 | Load and merges several instance files together. 176 | 177 | Returns the same format as :meth:`COCODetection.load`. 
178 | """ 179 | if not isinstance(names, (list, tuple)): 180 | names = [names] 181 | ret = [] 182 | for n in names: 183 | coco = COCODetection(basedir, n) 184 | ret.extend(coco.load(add_gt, add_mask=add_mask)) 185 | return ret 186 | 187 | 188 | if cfg.DATA.IMAGENET_VID or cfg.DATA.DAVIS2017 or cfg.DATA.GOT10K or cfg.DATA.TRACKINGNET or cfg.DATA.COCO \ 189 | or cfg.DATA.YOUTUBE_BB or cfg.DATA.DAVIS_LUCID or cfg.DATA.LASOT: 190 | 191 | def calculate_ious(bboxes1, bboxes2): 192 | # assume layout (x0, y0, x1, y1) 193 | min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :]) 194 | max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :]) 195 | I = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(min_[..., 3] - max_[..., 1], 0) 196 | area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1]) 197 | area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1]) 198 | U = area1[:, np.newaxis] + area2[np.newaxis, :] - I 199 | assert (U > 0).all() 200 | IOUs = I / U 201 | assert (IOUs >= 0).all() 202 | assert (IOUs <= 1).all() 203 | return IOUs 204 | 205 | class DetectionDataset(object): 206 | occluders = None 207 | coco = None 208 | coco_anns = None 209 | 210 | def __init__(self): 211 | """ 212 | This function is responsible for setting the dataset-specific 213 | attributes in both cfg and self. 214 | """ 215 | # we do it category agnostic, so only foreground and background 216 | #self.num_category = cfg.DATA.NUM_CATEGORY = 1 217 | self.num_category = cfg.DATA.NUM_CATEGORY 218 | cfg.DATA.TRAIN = ["train"] 219 | cfg.DATA.VAL = ["val"] 220 | self.num_classes = self.num_category + 1 221 | self.class_names = cfg.DATA.CLASS_NAMES = ["BG", "FG"] 222 | 223 | def _load_roidb_imagenet_vid(self, subset): 224 | imageset_postfix = "ImageSets/VID/" + subset + ".txt" 225 | imagesets_file = os.path.join(cfg.DATA.IMAGENET_VID_ROOT, imageset_postfix) 226 | vid_names = set() 227 | with open(imagesets_file) as f: 228 | for l in f: 229 | sp = l.split("/") 230 | vid_name = sp[0] + "/" + sp[1] 231 | vid_names.add(vid_name) 232 | vid_names = list(vid_names) 233 | return vid_names 234 | 235 | def _load_roidb_davis(self, subset): 236 | imagesets_file = os.path.join(cfg.DATA.DAVIS2017_ROOT, "ImageSets", "2017", subset + ".txt") 237 | vid_names = [] 238 | with open(imagesets_file) as f: 239 | for l in f: 240 | vid_name = l.strip() 241 | vid_names.append(vid_name) 242 | return vid_names 243 | 244 | def _load_roidb_davis_lucid(self, subset): 245 | vid_names = sorted(glob.glob(cfg.DATA.DAVIS_LUCID_ROOT + "*/*/")) 246 | vid_names = ['/'.join(v.split("/")[-3:]) for v in vid_names] 247 | 248 | if cfg.TRACK_VIDEO_ID is not None: 249 | vid_names = sorted(glob.glob(cfg.DATA.DAVIS_LUCID_ROOT + "test-challenge/*/")) 250 | vid_names = ['/'.join(v.split("/")[-3:]) for v in vid_names] 251 | vid_names = [vid_names[cfg.TRACK_VIDEO_ID]] 252 | 253 | print("!!!!!!!!!!!!!!!ONLY DOING: ", vid_names[0], "!!!!!!!!!!!!!!!!!!!!!!!!!") 254 | 255 | # vid_names = ['test-challenge/speed-skating/'] 256 | 257 | return vid_names 258 | 259 | def _load_roidb_youtubevos(self, subset): 260 | meta_file = os.path.join(cfg.DATA.YOUTUBE_VOS_ROOT, subset, "meta.json") 261 | with open(meta_file) as f: 262 | metadata = json.load(f) 263 | vid_names = list(metadata["videos"].keys()) 264 | return vid_names 265 | 266 | def _load_roidb_got10k(self, subset): 267 | vid_names = [] 268 | with open(os.path.join(cfg.DATA.GOT10K_ROOT, 'train/list.txt')) as f: 269 | for l in f: 270 | 
vid_names.append(l.strip()) 271 | assert len(vid_names) > 0 272 | return vid_names 273 | 274 | def _load_roidb_lasot(self, subset): 275 | vid_names = [] 276 | with open(os.path.join(cfg.DATA.LASOT_ROOT, 'training_set.txt')) as f: 277 | for l in f: 278 | vid_names.append(l.strip()) 279 | assert len(vid_names) > 0 280 | return vid_names 281 | 282 | def _load_roidb_youtube_bb(self, subset): 283 | clips_fn = os.path.join(cfg.DATA.YOUTUBE_BB_ROOT, "sets", "clips.txt") 284 | roidbs = [] 285 | with open(clips_fn) as f: 286 | for l in f: 287 | roidbs.append(l.strip()) 288 | return roidbs 289 | 290 | def _load_roidb_trackingnet(self, subset): 291 | gt_files = glob.glob(os.path.join(cfg.DATA.TRACKINGNET_ROOT, "TRAIN*", "anno", "*.txt")) 292 | vid_names = [x.split("/")[-3] + "____" + x.split("/")[-1].replace(".txt", "") for x in gt_files] 293 | return vid_names 294 | 295 | def _load_roidb(self, subset): 296 | vid_names = [] 297 | if cfg.DATA.IMAGENET_VID: 298 | logger.info("using imagenet vid") 299 | vid_names_imgnet = self._load_roidb_imagenet_vid(subset) 300 | vid_names_imgnet = ["VID/" + x for x in vid_names_imgnet] 301 | vid_names += vid_names_imgnet 302 | if cfg.DATA.DAVIS2017: 303 | logger.info("using davis2017") 304 | vid_names_davis = self._load_roidb_davis(subset) 305 | vid_names_davis = ["DAVIS/" + x for x in vid_names_davis] 306 | vid_names += vid_names_davis 307 | if cfg.DATA.YOUTUBE_VOS: 308 | logger.info("using YouTube-VOS") 309 | vid_names_youtubevos = self._load_roidb_youtubevos(subset) 310 | vid_names_youtubevos = ["YouTubeVOS/" + x for x in vid_names_youtubevos] 311 | vid_names += vid_names_youtubevos 312 | if cfg.DATA.GOT10K: 313 | logger.info("using GOT10K") 314 | vid_names_got = self._load_roidb_got10k(subset) 315 | vid_names_got = ["GOT10K/" + x for x in vid_names_got] 316 | vid_names += vid_names_got 317 | if cfg.DATA.LASOT: 318 | logger.info("using LaSOT") 319 | vid_names_lasot = self._load_roidb_lasot(subset) 320 | vid_names_lasot = ["LaSOT/" + x for x in vid_names_lasot] 321 | vid_names += vid_names_lasot 322 | if cfg.DATA.YOUTUBE_BB: 323 | logger.info("using YouTube-BB") 324 | vid_names_youtube_bb = self._load_roidb_youtube_bb(subset) 325 | vid_names_youtube_bb = ["YouTube-BB/" + x for x in vid_names_youtube_bb] 326 | # duplicate all other vid names in order to sample them more often (YouTube-BB is very large, 300k clips) 327 | vid_names *= 60 328 | vid_names += vid_names_youtube_bb 329 | if cfg.DATA.TRACKINGNET: 330 | logger.info("using TrackingNet") 331 | vid_names_trackingnet = self._load_roidb_trackingnet(subset) 332 | vid_names_trackingnet = ["TrackingNet/" + x for x in vid_names_trackingnet] 333 | # duplicate all other vid names in order to sample them more often (trackingnet is very large) 334 | vid_names *= 2 335 | vid_names += vid_names_trackingnet 336 | random.shuffle(vid_names) 337 | return vid_names 338 | 339 | def load_training_roidbs(self, names): 340 | """ 341 | Args: 342 | names (list[str]): name of the training datasets, e.g. ['train2014', 'valminusminival2014'] 343 | 344 | Returns: 345 | roidbs (list[dict]): 346 | 347 | Produce "roidbs" as a list of dict, each dict corresponds to one image with k>=0 instances. 348 | and the following keys are expected for training: 349 | 350 | height, width: integer 351 | file_name: str, full path to the image 352 | boxes: numpy array of kx4 floats, each row is [x1, y1, x2, y2] 353 | category: numpy array of k integers, in the range of [1, #categories] 354 | is_crowd: k booleans. 
Use k False if you don't know what it means. 355 | segmentation: k lists of numpy arrays (one for each instance). 356 | Each list of numpy arrays corresponds to the mask for one instance. 357 | Each numpy array in the list is a polygon of shape Nx2, 358 | because one mask can be represented by N polygons. 359 | 360 | If your segmentation annotations are originally masks rather than polygons, 361 | either convert it, or the augmentation will need to be changed or skipped accordingly. 362 | 363 | Include this field only if training Mask R-CNN. 364 | """ 365 | return self._load_roidb("train") 366 | 367 | def load_inference_roidbs(self, name): 368 | """ 369 | Args: 370 | name (str): name of one inference dataset, e.g. 'minival2014' 371 | 372 | Returns: 373 | roidbs (list[dict]): 374 | 375 | Each dict corresponds to one image to run inference on. The 376 | following keys in the dict are expected: 377 | 378 | file_name (str): full path to the image 379 | id (str): an id for the image. The inference results will be stored with this id. 380 | """ 381 | return self._load_roidb("val") 382 | 383 | def eval_or_save_inference_results(self, results, dataset, output=None): 384 | ious_at_k = [[] for _ in range(10)] 385 | ious_per_obj = {} 386 | # results.sort(key=lambda x: x.gt_file) 387 | for r in results: 388 | gt_file, res, target_box = r 389 | seq, obj_id, timestep = gt_file.split('__') 390 | obj_name = seq + "__" + obj_id 391 | res.sort(key=lambda x: x.score, reverse=True) 392 | max_iou = 0.0 393 | if obj_name not in ious_per_obj.keys(): 394 | ious_per_obj[obj_name] = [[] for _ in range(10)] 395 | 396 | for k in range(10): 397 | if len(res) > k: 398 | det = res[k] 399 | det_box = det.box 400 | iou = calculate_ious(target_box[np.newaxis], det_box[np.newaxis])[0, 0] 401 | max_iou = max(max_iou, iou) 402 | if k == 0: 403 | best_box = det_box 404 | ious_per_obj[obj_name][k].append(max_iou) 405 | # print(seq,obj_id,timestep,target_box, best_box, ious_per_obj[obj_name][0][-1]) 406 | 407 | for obj_name in ious_per_obj.keys(): 408 | for k in range(10): 409 | ious_at_k[k].append(np.mean(ious_per_obj[obj_name][k])) 410 | print(obj_name, np.mean(ious_per_obj[obj_name][0])) 411 | 412 | eval_res = {"miou@" + str(k + 1): np.mean(ious_at_k[k]) for k in range(10)} 413 | print(eval_res) 414 | return eval_res 415 | 416 | # code for singleton: 417 | _instance = None 418 | 419 | def __new__(cls): 420 | if not isinstance(cls._instance, cls): 421 | cls._instance = object.__new__(cls) 422 | return cls._instance 423 | else: 424 | class DetectionDataset(object): 425 | """ 426 | A singleton to load datasets, evaluate results, and provide metadata. 427 | 428 | To use your own dataset that's not in COCO format, rewrite all methods of this class. 429 | """ 430 | def __init__(self): 431 | """ 432 | This function is responsible for setting the dataset-specific 433 | attributes in both cfg and self. 434 | """ 435 | self.num_category = cfg.DATA.NUM_CATEGORY = len(COCODetection.class_names) 436 | self.num_classes = self.num_category + 1 437 | self.class_names = cfg.DATA.CLASS_NAMES = ["BG"] + COCODetection.class_names 438 | 439 | def load_training_roidbs(self, names): 440 | """ 441 | Args: 442 | names (list[str]): name of the training datasets, e.g. ['train2014', 'valminusminival2014'] 443 | 444 | Returns: 445 | roidbs (list[dict]): 446 | 447 | Produce "roidbs" as a list of dict, each dict corresponds to one image with k>=0 instances. 
448 | and the following keys are expected for training: 449 | 450 | height, width: integer 451 | file_name: str, full path to the image 452 | boxes: numpy array of kx4 floats, each row is [x1, y1, x2, y2] 453 | category: numpy array of k integers, in the range of [1, #categories] 454 | is_crowd: k booleans. Use k False if you don't know what it means. 455 | segmentation: k lists of numpy arrays (one for each instance). 456 | Each list of numpy arrays corresponds to the mask for one instance. 457 | Each numpy array in the list is a polygon of shape Nx2, 458 | because one mask can be represented by N polygons. 459 | 460 | If your segmentation annotations are originally masks rather than polygons, 461 | either convert it, or the augmentation will need to be changed or skipped accordingly. 462 | 463 | Include this field only if training Mask R-CNN. 464 | """ 465 | return COCODetection.load_many( 466 | cfg.DATA.BASEDIR, cfg.DATA.TRAIN, add_gt=True, add_mask=cfg.MODE_MASK) 467 | 468 | def load_inference_roidbs(self, name): 469 | """ 470 | Args: 471 | name (str): name of one inference dataset, e.g. 'minival2014' 472 | 473 | Returns: 474 | roidbs (list[dict]): 475 | 476 | Each dict corresponds to one image to run inference on. The 477 | following keys in the dict are expected: 478 | 479 | file_name (str): full path to the image 480 | id (str): an id for the image. The inference results will be stored with this id. 481 | """ 482 | return COCODetection.load_many(cfg.DATA.BASEDIR, name, add_gt=False) 483 | 484 | def eval_or_save_inference_results(self, results, dataset, output=None): 485 | """ 486 | Args: 487 | results (list[dict]): the inference results as dicts. 488 | Each dict corresponds to one __instance__. It contains the following keys: 489 | 490 | image_id (str): the id that matches `load_inference_roidbs`. 491 | category_id (int): the category prediction, in range [1, #category] 492 | bbox (list[float]): x1, y1, x2, y2 493 | score (float): 494 | segmentation: the segmentation mask in COCO's rle format. 495 | 496 | dataset (str): the name of the dataset to evaluate. 497 | output (str): the output file to optionally save the results to. 498 | 499 | Returns: 500 | dict: the evaluation results. 501 | """ 502 | continuous_id_to_COCO_id = {v: k for k, v in COCODetection.COCO_id_to_category_id.items()} 503 | for res in results: 504 | # convert to COCO's incontinuous category id 505 | res['category_id'] = continuous_id_to_COCO_id[res['category_id']] 506 | # COCO expects results in xywh format 507 | box = res['bbox'] 508 | box[2] -= box[0] 509 | box[3] -= box[1] 510 | res['bbox'] = [round(float(x), 3) for x in box] 511 | 512 | assert output is not None, "COCO evaluation requires an output file!" 513 | with open(output, 'w') as f: 514 | json.dump(results, f) 515 | if len(output): 516 | # sometimes may crash if the results are empty? 517 | return COCODetection(cfg.DATA.BASEDIR, dataset).print_coco_metrics(output) 518 | else: 519 | return {} 520 | 521 | # code for singleton: 522 | _instance = None 523 | 524 | def __new__(cls): 525 | if not isinstance(cls._instance, cls): 526 | cls._instance = object.__new__(cls) 527 | return cls._instance 528 | 529 | 530 | if __name__ == '__main__': 531 | c = COCODetection(cfg.DATA.BASEDIR, 'train2014') 532 | gt_boxes = c.load(add_gt=True, add_mask=True) 533 | print("#Images:", len(gt_boxes)) 534 | --------------------------------------------------------------------------------