├── tracking ├── __init__.py ├── util.py ├── argmax_tracker.py ├── three_stage_tracker.py └── do_tracking.py ├── .gitignore ├── davis2017_fast_val_ids.txt ├── LICENSE ├── main_trax.py ├── hard_example_utils.py ├── utils ├── box_ops.py ├── generate_anchors.py └── np_box_ops.py ├── viz.py ├── model_mrcnn.py ├── README.md ├── vot_helper.py ├── common.py ├── model_rpn.py ├── model_box.py ├── basemodel.py ├── model_fpn.py ├── model_cascade.py ├── eval_utils.py ├── config.py ├── model_frcnn.py └── dataset.py /tracking/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | train_log 3 | tracking_data 4 | -------------------------------------------------------------------------------- /davis2017_fast_val_ids.txt: -------------------------------------------------------------------------------- 1 | bike-packing__2 2 | bmx-trees__1 3 | bmx-trees__2 4 | dogs-jump__1 5 | dogs-jump__2 6 | gold-fish__1 7 | gold-fish__2 8 | gold-fish__3 9 | gold-fish__4 10 | gold-fish__5 11 | india__1 12 | india__2 13 | india__3 14 | judo__2 15 | kite-surf__1 16 | kite-surf__2 17 | lab-coat__1 18 | lab-coat__2 19 | loading__2 20 | loading__3 21 | motocross-jump__1 22 | paragliding-launch__1 23 | paragliding-launch__2 24 | paragliding-launch__3 25 | pigs__2 26 | shooting__1 27 | shooting__3 28 | soapbox__2 29 | soapbox__3 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Visual Computing Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /main_trax.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from tracking.three_stage_tracker import ThreeStageTracker 4 | import sys 5 | import cv2 6 | import PIL 7 | import numpy as np 8 | import vot_helper 9 | 10 | 11 | class SiamRCNN: 12 | def __init__(self, image, region): 13 | sp = __file__.split("/") 14 | ckpt = "/".join(sp[:-1]) + "/train_log/hard_mining3/model-1360500" 15 | self._tracker = ThreeStageTracker(model="checkpoint:" + ckpt) 16 | x, y, w, h = region 17 | box = np.array([x, y, w, h]) 18 | self._tracker.init(image, box) 19 | 20 | def track(self, image): 21 | new_box, score = self._tracker.update(image, use_confidences=True) 22 | x, y, w, h = new_box 23 | print(new_box, score) 24 | rect = vot_helper.Rectangle(x, y, w, h) 25 | return rect, score 26 | 27 | 28 | handle = vot_helper.VOT("rectangle") 29 | selection = handle.region() 30 | imagefile = handle.frame() 31 | if not imagefile: 32 | sys.exit(0) 33 | 34 | image = np.array(PIL.Image.open(imagefile)) 35 | tracker = SiamRCNN(image, selection) 36 | while True: 37 | imagefile = handle.frame() 38 | if not imagefile: 39 | break 40 | image = np.array(PIL.Image.open(imagefile)) 41 | region, confidence = tracker.track(image) 42 | handle.report(region, confidence) 43 | -------------------------------------------------------------------------------- /hard_example_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def subsample_nns(query_seq, nns, names, n_seqs_to_sample, remove_query=True): 5 | random.shuffle(nns) 6 | 7 | nn_names = [names[n] for n in nns] 8 | nn_seqs = [x.split("/")[-2] for x in nn_names] 9 | 10 | seq_to_nns = {} 11 | for nn, nn_seq in zip(nns, nn_seqs): 12 | if nn_seq not in seq_to_nns: 13 | seq_to_nns[nn_seq] = [] 14 | seq_to_nns[nn_seq].append(nn) 15 | 16 | #seq_to_n = sorted([(k, len(v)) for k, v in seq_to_nns.items()], key=lambda x: x[1], reverse=True) 17 | #n_total = sum(x[1] for x in seq_to_n) 18 | #for seq, n_in_seqs in seq_to_n: 19 | # pct = n_in_seqs * 100 / n_total 20 | # if pct > 1.0: 21 | # print(seq, pct, "%") 22 | #print("n_seqs in nns", len(seq_to_nns)) 23 | 24 | sampled_nns = [] 25 | sample_seqs = set(seq_to_nns.keys()) 26 | if remove_query: 27 | sample_seqs.remove(query_seq) 28 | sample_seqs = list(sample_seqs) 29 | random.shuffle(sample_seqs) 30 | sample_seqs = sample_seqs[:n_seqs_to_sample] 31 | 32 | # get 1 per sequence 33 | for seq in sample_seqs: 34 | seq_nns = seq_to_nns[seq] 35 | nn = random.choice(seq_nns) 36 | sampled_nns.append(nn) 37 | return sampled_nns 38 | 39 | 40 | def subsample_nns_old(name, nns, names, n_seqs_to_sample): 41 | random.shuffle(nns) 42 | 43 | nn_names = [names[n] for n in nns] 44 | nn_seqs = [x.split("/")[-2] for x in nn_names] 45 | seq = name.split("/")[-2] 46 | 47 | sampled_nns = [] 48 | sampled_seqs = set() 49 | sampled_seqs.add(seq) 50 | # get 1 per sequence 51 | for nn, seq in zip(nns, nn_seqs): 52 | if seq not in sampled_seqs: 53 | sampled_seqs.add(seq) 54 | sampled_nns.append(nn) 55 | sampled_nns = sampled_nns[:n_seqs_to_sample] 56 | return sampled_nns 57 | -------------------------------------------------------------------------------- /utils/box_ops.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: box_ops.py 3 | 4 | import tensorflow as tf 5 | 6 | from 
tensorpack.tfutils.scope_utils import under_name_scope 7 | 8 | 9 | """ 10 | This file is modified from 11 | https://github.com/tensorflow/models/blob/master/object_detection/core/box_list_ops.py 12 | """ 13 | 14 | 15 | @under_name_scope() 16 | def area(boxes): 17 | """ 18 | Args: 19 | boxes: nx4 floatbox 20 | 21 | Returns: 22 | n 23 | """ 24 | x_min, y_min, x_max, y_max = tf.split(boxes, 4, axis=1) 25 | return tf.squeeze((y_max - y_min) * (x_max - x_min), [1]) 26 | 27 | 28 | @under_name_scope() 29 | def pairwise_intersection(boxlist1, boxlist2): 30 | """Compute pairwise intersection areas between boxes. 31 | 32 | Args: 33 | boxlist1: Nx4 floatbox 34 | boxlist2: Mx4 35 | 36 | Returns: 37 | a tensor with shape [N, M] representing pairwise intersections 38 | """ 39 | x_min1, y_min1, x_max1, y_max1 = tf.split(boxlist1, 4, axis=1) 40 | x_min2, y_min2, x_max2, y_max2 = tf.split(boxlist2, 4, axis=1) 41 | all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2)) 42 | all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2)) 43 | intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin) 44 | all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2)) 45 | all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2)) 46 | intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin) 47 | return intersect_heights * intersect_widths 48 | 49 | 50 | @under_name_scope() 51 | def pairwise_iou(boxlist1, boxlist2): 52 | """Computes pairwise intersection-over-union between box collections. 53 | 54 | Args: 55 | boxlist1: Nx4 floatbox 56 | boxlist2: Mx4 57 | 58 | Returns: 59 | a tensor with shape [N, M] representing pairwise iou scores. 60 | """ 61 | intersections = pairwise_intersection(boxlist1, boxlist2) 62 | areas1 = area(boxlist1) 63 | areas2 = area(boxlist2) 64 | unions = ( 65 | tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections) 66 | return tf.where( 67 | tf.equal(intersections, 0.0), 68 | tf.zeros_like(intersections), tf.truediv(intersections, unions)) 69 | -------------------------------------------------------------------------------- /viz.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: viz.py 3 | 4 | import numpy as np 5 | from six.moves import zip 6 | 7 | from tensorpack.utils import viz 8 | from tensorpack.utils.palette import PALETTE_RGB 9 | 10 | from config import config as cfg 11 | from utils.np_box_ops import iou as np_iou 12 | 13 | 14 | def draw_annotation(img, boxes, klass, is_crowd=None): 15 | """Will not modify img""" 16 | labels = [] 17 | assert len(boxes) == len(klass) 18 | if is_crowd is not None: 19 | assert len(boxes) == len(is_crowd) 20 | for cls, crd in zip(klass, is_crowd): 21 | clsname = cfg.DATA.CLASS_NAMES[cls] 22 | if crd == 1: 23 | clsname += ';Crowd' 24 | labels.append(clsname) 25 | else: 26 | for cls in klass: 27 | labels.append(cfg.DATA.CLASS_NAMES[cls]) 28 | img = viz.draw_boxes(img, boxes, labels) 29 | return img 30 | 31 | 32 | def draw_proposal_recall(img, proposals, proposal_scores, gt_boxes): 33 | """ 34 | Draw top3 proposals for each gt. 
35 | Args: 36 | proposals: NPx4 37 | proposal_scores: NP 38 | gt_boxes: NG 39 | """ 40 | box_ious = np_iou(gt_boxes, proposals) # ng x np 41 | box_ious_argsort = np.argsort(-box_ious, axis=1) 42 | good_proposals_ind = box_ious_argsort[:, :3] # for each gt, find 3 best proposals 43 | good_proposals_ind = np.unique(good_proposals_ind.ravel()) 44 | 45 | proposals = proposals[good_proposals_ind, :] 46 | tags = list(map(str, proposal_scores[good_proposals_ind])) 47 | img = viz.draw_boxes(img, proposals, tags) 48 | return img, good_proposals_ind 49 | 50 | 51 | def draw_predictions(img, boxes, scores): 52 | """ 53 | Args: 54 | boxes: kx4 55 | scores: kxC 56 | """ 57 | if len(boxes) == 0: 58 | return img 59 | labels = scores.argmax(axis=1) 60 | scores = scores.max(axis=1) 61 | tags = ["{},{:.2f}".format(cfg.DATA.CLASS_NAMES[lb], score) for lb, score in zip(labels, scores)] 62 | return viz.draw_boxes(img, boxes, tags) 63 | 64 | 65 | def draw_final_outputs(img, results): 66 | """ 67 | Args: 68 | results: [DetectionResult] 69 | """ 70 | if len(results) == 0: 71 | return img 72 | 73 | tags = [] 74 | for r in results: 75 | tags.append( 76 | "{},{:.2f}".format(cfg.DATA.CLASS_NAMES[r.class_id], r.score)) 77 | boxes = np.asarray([r.box for r in results]) 78 | ret = viz.draw_boxes(img, boxes, tags) 79 | 80 | for r in results: 81 | if r.mask is not None: 82 | ret = draw_mask(ret, r.mask) 83 | return ret 84 | 85 | 86 | def draw_mask(im, mask, alpha=0.5, color=None): 87 | """ 88 | Overlay a mask on top of the image. 89 | 90 | Args: 91 | im: a 3-channel uint8 image in BGR 92 | mask: a binary 1-channel image of the same size 93 | color: if None, will choose automatically 94 | """ 95 | if color is None: 96 | color = PALETTE_RGB[np.random.choice(len(PALETTE_RGB))][::-1] 97 | im = np.where(np.repeat((mask > 0)[:, :, None], 3, axis=2), 98 | im * (1 - alpha) + color * alpha, im) 99 | im = im.astype('uint8') 100 | return im 101 | -------------------------------------------------------------------------------- /utils/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py 2 | 3 | # -------------------------------------------------------- 4 | # Faster R-CNN 5 | # Copyright (c) 2015 Microsoft 6 | # Licensed under The MIT License [see LICENSE for details] 7 | # Written by Ross Girshick and Sean Bell 8 | # -------------------------------------------------------- 9 | 10 | import numpy as np 11 | from six.moves import range 12 | 13 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 14 | # 15 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 16 | # >> anchors 17 | # 18 | # anchors = 19 | # 20 | # -83 -39 100 56 21 | # -175 -87 192 104 22 | # -359 -183 376 200 23 | # -55 -55 72 72 24 | # -119 -119 136 136 25 | # -247 -247 264 264 26 | # -35 -79 52 96 27 | # -79 -167 96 184 28 | # -167 -343 184 360 29 | 30 | # array([[ -83., -39., 100., 56.], 31 | # [-175., -87., 192., 104.], 32 | # [-359., -183., 376., 200.], 33 | # [ -55., -55., 72., 72.], 34 | # [-119., -119., 136., 136.], 35 | # [-247., -247., 264., 264.], 36 | # [ -35., -79., 52., 96.], 37 | # [ -79., -167., 96., 184.], 38 | # [-167., -343., 184., 360.]]) 39 | 40 | 41 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 42 | scales=2**np.arange(3, 6)): 43 | """ 44 | Generate anchor (reference) windows by enumerating aspect ratios X 45 | scales wrt a reference (0, 0, 15, 15) 
window. 46 | """ 47 | 48 | base_anchor = np.array([1, 1, base_size, base_size], dtype='float32') - 1 49 | ratio_anchors = _ratio_enum(base_anchor, ratios) 50 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 51 | for i in range(ratio_anchors.shape[0])]) 52 | return anchors 53 | 54 | 55 | def _whctrs(anchor): 56 | """ 57 | Return width, height, x center, and y center for an anchor (window). 58 | """ 59 | 60 | w = anchor[2] - anchor[0] + 1 61 | h = anchor[3] - anchor[1] + 1 62 | x_ctr = anchor[0] + 0.5 * (w - 1) 63 | y_ctr = anchor[1] + 0.5 * (h - 1) 64 | return w, h, x_ctr, y_ctr 65 | 66 | 67 | def _mkanchors(ws, hs, x_ctr, y_ctr): 68 | """ 69 | Given a vector of widths (ws) and heights (hs) around a center 70 | (x_ctr, y_ctr), output a set of anchors (windows). 71 | """ 72 | 73 | ws = ws[:, np.newaxis] 74 | hs = hs[:, np.newaxis] 75 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 76 | y_ctr - 0.5 * (hs - 1), 77 | x_ctr + 0.5 * (ws - 1), 78 | y_ctr + 0.5 * (hs - 1))) 79 | return anchors 80 | 81 | 82 | def _ratio_enum(anchor, ratios): 83 | """ 84 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 85 | """ 86 | 87 | w, h, x_ctr, y_ctr = _whctrs(anchor) 88 | size = w * h 89 | size_ratios = size / ratios 90 | ws = np.round(np.sqrt(size_ratios)) 91 | hs = np.round(ws * ratios) 92 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 93 | return anchors 94 | 95 | 96 | def _scale_enum(anchor, scales): 97 | """ 98 | Enumerate a set of anchors for each scale wrt an anchor. 99 | """ 100 | 101 | w, h, x_ctr, y_ctr = _whctrs(anchor) 102 | ws = w * scales 103 | hs = h * scales 104 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 105 | return anchors 106 | -------------------------------------------------------------------------------- /model_mrcnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | from tensorpack.models import Conv2D, Conv2DTranspose, layer_register 6 | from tensorpack.tfutils.argscope import argscope 7 | from tensorpack.tfutils.common import get_tf_version_tuple 8 | from tensorpack.tfutils.scope_utils import under_name_scope 9 | from tensorpack.tfutils.summary import add_moving_summary 10 | 11 | from basemodel import GroupNorm 12 | from config import config as cfg 13 | 14 | 15 | @under_name_scope() 16 | def maskrcnn_loss(mask_logits, fg_labels, fg_target_masks): 17 | """ 18 | Args: 19 | mask_logits: #fg x #category xhxw 20 | fg_labels: #fg, in 1~#class, int64 21 | fg_target_masks: #fgxhxw, float32 22 | """ 23 | num_fg = tf.size(fg_labels, out_type=tf.int64) 24 | indices = tf.stack([tf.range(num_fg), fg_labels - 1], axis=1) # #fgx2 25 | mask_logits = tf.gather_nd(mask_logits, indices) # #fgxhxw 26 | mask_probs = tf.sigmoid(mask_logits) 27 | 28 | # add some training visualizations to tensorboard 29 | with tf.name_scope('mask_viz'): 30 | viz = tf.concat([fg_target_masks, mask_probs], axis=1) 31 | viz = tf.expand_dims(viz, 3) 32 | viz = tf.cast(viz * 255, tf.uint8, name='viz') 33 | tf.summary.image('mask_truth|pred', viz, max_outputs=10) 34 | 35 | loss = tf.nn.sigmoid_cross_entropy_with_logits( 36 | labels=fg_target_masks, logits=mask_logits) 37 | loss = tf.reduce_mean(loss, name='maskrcnn_loss') 38 | 39 | pred_label = mask_probs > 0.5 40 | truth_label = fg_target_masks > 0.5 41 | accuracy = tf.reduce_mean( 42 | tf.cast(tf.equal(pred_label, truth_label), tf.float32), 43 | name='accuracy') 44 | pos_accuracy = tf.logical_and( 45 | tf.equal(pred_label, truth_label), 46 | 
tf.equal(truth_label, True)) 47 | pos_accuracy = tf.reduce_mean(tf.cast(pos_accuracy, tf.float32), name='pos_accuracy') 48 | fg_pixel_ratio = tf.reduce_mean(tf.cast(truth_label, tf.float32), name='fg_pixel_ratio') 49 | 50 | add_moving_summary(loss, accuracy, fg_pixel_ratio, pos_accuracy) 51 | return loss 52 | 53 | 54 | @layer_register(log_shape=True) 55 | def maskrcnn_upXconv_head(feature, num_category, num_convs, norm=None): 56 | """ 57 | Args: 58 | feature (NxCx s x s): size is 7 in C4 models and 14 in FPN models. 59 | num_category(int): 60 | num_convs (int): number of convolution layers 61 | norm (str or None): either None or 'GN' 62 | 63 | Returns: 64 | mask_logits (N x num_category x 2s x 2s): 65 | """ 66 | assert norm in [None, 'GN'], norm 67 | l = feature 68 | with argscope([Conv2D, Conv2DTranspose], data_format='channels_first', 69 | kernel_initializer=tf.variance_scaling_initializer( 70 | scale=2.0, mode='fan_out', 71 | distribution='untruncated_normal' if get_tf_version_tuple() >= (1, 12) else 'normal')): 72 | # c2's MSRAFill is fan_out 73 | for k in range(num_convs): 74 | l = Conv2D('fcn{}'.format(k), l, cfg.MRCNN.HEAD_DIM, 3, activation=tf.nn.relu) 75 | if norm is not None: 76 | l = GroupNorm('gn{}'.format(k), l) 77 | l = Conv2DTranspose('deconv', l, cfg.MRCNN.HEAD_DIM, 2, strides=2, activation=tf.nn.relu) 78 | l = Conv2D('conv', l, num_category, 1) 79 | return l 80 | 81 | 82 | def maskrcnn_up4conv_head(*args, **kwargs): 83 | return maskrcnn_upXconv_head(*args, num_convs=4, **kwargs) 84 | 85 | 86 | def maskrcnn_up4conv_gn_head(*args, **kwargs): 87 | return maskrcnn_upXconv_head(*args, num_convs=4, norm='GN', **kwargs) 88 | -------------------------------------------------------------------------------- /utils/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, 4] numpy arrays representing bounding boxes. 17 | 18 | Example box operations that are supported: 19 | * Areas: compute bounding box areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | import numpy as np 23 | 24 | 25 | def area(boxes): 26 | """Computes area of boxes. 27 | 28 | Args: 29 | boxes: Numpy array with shape [N, 4] holding N boxes 30 | 31 | Returns: 32 | a numpy array with shape [N*1] representing box areas 33 | """ 34 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 35 | 36 | 37 | def intersection(boxes1, boxes2): 38 | """Compute pairwise intersection areas between boxes. 
39 | 40 | Args: 41 | boxes1: a numpy array with shape [N, 4] holding N boxes 42 | boxes2: a numpy array with shape [M, 4] holding M boxes 43 | 44 | Returns: 45 | a numpy array with shape [N*M] representing pairwise intersection area 46 | """ 47 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 48 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 49 | 50 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 51 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 52 | intersect_heights = np.maximum( 53 | np.zeros(all_pairs_max_ymin.shape, dtype='f4'), 54 | all_pairs_min_ymax - all_pairs_max_ymin) 55 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 56 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 57 | intersect_widths = np.maximum( 58 | np.zeros(all_pairs_max_xmin.shape, dtype='f4'), 59 | all_pairs_min_xmax - all_pairs_max_xmin) 60 | return intersect_heights * intersect_widths 61 | 62 | 63 | def iou(boxes1, boxes2): 64 | """Computes pairwise intersection-over-union between box collections. 65 | 66 | Args: 67 | boxes1: a numpy array with shape [N, 4] holding N boxes. 68 | boxes2: a numpy array with shape [M, 4] holding M boxes. 69 | 70 | Returns: 71 | a numpy array with shape [N, M] representing pairwise iou scores. 72 | """ 73 | intersect = intersection(boxes1, boxes2) 74 | area1 = area(boxes1) 75 | area2 = area(boxes2) 76 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 77 | area2, axis=0) - intersect 78 | return intersect / union 79 | 80 | 81 | def ioa(boxes1, boxes2): 82 | """Computes pairwise intersection-over-area between box collections. 83 | 84 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 85 | their intersection area over box2's area. Note that ioa is not symmetric, 86 | that is, IOA(box1, box2) != IOA(box2, box1). 87 | 88 | Args: 89 | boxes1: a numpy array with shape [N, 4] holding N boxes. 90 | boxes2: a numpy array with shape [M, 4] holding N boxes. 91 | 92 | Returns: 93 | a numpy array with shape [N, M] representing pairwise ioa scores. 94 | """ 95 | intersect = intersection(boxes1, boxes2) 96 | inv_areas = np.expand_dims(1.0 / area(boxes2), axis=0) 97 | return intersect * inv_areas 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Siam R-CNN: Visual Tracking by Re-Detection 2 | ### [Paul Voigtlaender](https://www.vision.rwth-aachen.de/person/197/), [Jonathon Luiten](https://www.vision.rwth-aachen.de/person/216/), [Philip H.S. Torr](https://www.robots.ox.ac.uk/~tvg/), [Bastian Leibe](https://www.vision.rwth-aachen.de/) 3 | The corresponding project page can be found here: https://www.vision.rwth-aachen.de/page/siamrcnn 4 | 5 | This software is written in Python3 and powered by TensorFlow 1. 
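Programmatically the tracker is driven through `ThreeStageTracker` (see `main_trax.py` and `tracking/three_stage_tracker.py`). Below is a minimal sketch of that flow, assuming the `hard_mining3` checkpoint from the download step further down; the frame paths and box values are placeholders:
```
import numpy as np
import PIL.Image
from tracking.three_stage_tracker import ThreeStageTracker

# Placeholder frame list and first-frame bounding box in (x, y, w, h) format.
frame_paths = ["frames/00000.jpg", "frames/00001.jpg"]
init_box = np.array([100.0, 150.0, 80.0, 60.0])

tracker = ThreeStageTracker(model="checkpoint:train_log/hard_mining3/model-1360500")
tracker.init(np.array(PIL.Image.open(frame_paths[0])), init_box)

for path in frame_paths[1:]:
    frame = np.array(PIL.Image.open(path))
    # update() returns the new (x, y, w, h) box and a confidence score.
    box, score = tracker.update(frame, use_confidences=True)
    print(path, box, score)
```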
6 | 7 | We borrow a lot of code from TensorPack's Faster R-CNN example: https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN 8 | 9 | ## Installation 10 | 11 | ### Download necessary libraries 12 | Here we will put all external libraries and this repository into /home/${USERNAME}/vision and use 13 | pip to install common libraries 14 | ``` 15 | mkdir /home/${USERNAME}/vision 16 | cd /home/${USERNAME}/vision 17 | 18 | git clone https://github.com/VisualComputingInstitute/SiamR-CNN.git 19 | git clone https://github.com/pvoigtlaender/got10k-toolkit.git 20 | git clone https://github.com/tensorpack/tensorpack.git 21 | 22 | cd tensorpack 23 | git checkout d24a9230d50b1dea1712a4c2765a11876f1e193c 24 | cd .. 25 | 26 | pip3 install cython 27 | pip3 install tensorflow-gpu==1.15 28 | pip3 install wget shapely msgpack msgpack_numpy tabulate xmltodict pycocotools opencv-python tqdm zmq annoy 29 | ``` 30 | ### Add libraries to your PYTHONPATH 31 | ``` 32 | export PYTHONPATH=${PYTHONPATH}:/home/${USERNAME}/vision/got10k-toolkit/:/home/${USERNAME}/vision/tensorpack/ 33 | ``` 34 | 35 | ### Make Folder for models and logs and download pre-trained model 36 | ``` 37 | cd SiamR-CNN/ 38 | mkdir train_log 39 | cd train_log 40 | wget --no-check-certificate -r -nH --cut-dirs=2 --no-parent --reject="index.html*" https://omnomnom.vision.rwth-aachen.de/data/siamrcnn/hard_mining3/ 41 | cd .. 42 | ``` 43 | ## Evaluation 44 | For evaluation, first set the path to the dataset on which you want to evaluate in tracking/do_tracking.py, e.g. 45 | ``` 46 | OTB_2015_ROOT_DIR = '/data/otb2015/' 47 | ``` 48 | 49 | Then run tracking/do_tracking.py and specify the dataset you want to evaluate on using the main function for this dataset using e.g. --main main_otb 50 | 51 | ``` 52 | python3 tracking/do_tracking.py --main main_otb 53 | ``` 54 | 55 | The result will then be written to tracking_data/results/ 56 | 57 | ## Training 58 | Download the pre-trained Mask R-CNN model from http://models.tensorpack.com/FasterRCNN/COCO-MaskRCNN-R101FPN9xGNCasAugScratch.npz 59 | 60 | Now change the paths to the training datasets in config.py, e.g. 61 | ``` 62 | _C.DATA.IMAGENET_VID_ROOT = "/globalwork/data/ILSVRC_VID/ILSVRC/" 63 | ``` 64 | there you can also enable and disable different datasets, e.g. 65 | ``` 66 | _C.DATA.IMAGENET_VID = True 67 | ``` 68 | 69 | To run the main training (without hard example mining): 70 | ``` 71 | python3 train.py --load /path/to/COCO-R101FPN-MaskRCNN-ScratchGN.npz 72 | ``` 73 | 74 | ## Hints about the code 75 | In the code, we sometimes use the terminology "ThreeStageTracker" or three stages. This refers to the Tracklet Dynamic Programming Algorithm (TDPA). 76 | 77 | In order to make the code more readable, we removed some parts before publishing. If there's an important feature which you are missing, please write us an email at voigtlaender@vision.rwth-aachen.de 78 | 79 | In the current version of the code, the functions to pre-compute the features for hard example mining are not available, but we can share the pre-computed data on request. 80 | 81 | ## References 82 | If you find this code useful, please cite 83 | ``` 84 | Siam R-CNN: Visual Tracking by Re-Detection 85 | Paul Voigtlaender, Jonathon Luiten, Philip H.S. Torr, Bastian Leibe. 86 | IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020. 
87 | ``` 88 | -------------------------------------------------------------------------------- /tracking/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import PIL.Image 3 | 4 | from examples.FasterRCNN.common import clip_boxes 5 | 6 | 7 | def xyxy_to_cxcywh_np(boxes_xyxy): 8 | wh = boxes_xyxy[:, 2:] - boxes_xyxy[:, :2] 9 | c = boxes_xyxy[:, :2] + wh / 2 10 | boxes_cwh = np.concatenate((c, wh), axis=1) 11 | return boxes_cwh 12 | 13 | 14 | def cxcywh_to_xyxy_np(boxes_cxcywh): 15 | boxes_xyxy = boxes_cxcywh.copy() 16 | boxes_xyxy[:, :2] -= 0.5 * boxes_xyxy[:, 2:] 17 | boxes_xyxy[:, 2:] += boxes_xyxy[:, :2] 18 | return boxes_xyxy 19 | 20 | 21 | def resize_and_clip_boxes(img, resized_img, boxes): 22 | scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] * resized_img.shape[1] / img.shape[1]) 23 | orig_shape = img.shape[:2] 24 | boxes = boxes / scale 25 | boxes = clip_boxes(boxes, orig_shape) 26 | return boxes 27 | 28 | 29 | # adapted from https://github.com/matterport/Mask_RCNN/blob/master/mrcnn/visualize.py 30 | def generate_colors(): 31 | """ 32 | Generate random colors. 33 | To get visually distinct colors, generate them in HSV space then 34 | convert to RGB. 35 | """ 36 | N = 30 37 | brightness = 0.7 38 | hsv = [(i / N, 1, brightness) for i in range(N)] 39 | import colorsys 40 | colors = list(map(lambda c: colorsys.hsv_to_rgb(*c), hsv)) 41 | perm = [15, 13, 25, 12, 19, 8, 22, 24, 29, 17, 28, 20, 2, 27, 11, 26, 21, 4, 3, 18, 9, 5, 14, 1, 16, 0, 23, 7, 6, 10] 42 | colors = [colors[idx] for idx in perm] 43 | return colors 44 | 45 | 46 | def postproc_seq_name_otb(seq_name): 47 | if seq_name == "Human4": 48 | seq_name_postproc = "Human4-2" 49 | elif seq_name == "Skating2_1": 50 | seq_name_postproc = "Skating2-1" 51 | elif seq_name == "Skating2_2": 52 | seq_name_postproc = "Skating2-2" 53 | elif seq_name == "Jogging_1": 54 | seq_name_postproc = "Jogging-1" 55 | elif seq_name == "Jogging_2": 56 | seq_name_postproc = "Jogging-2" 57 | else: 58 | seq_name_postproc = seq_name 59 | return seq_name_postproc 60 | 61 | 62 | def read_gt_otb(gt_file): 63 | boxes = [] 64 | with open(gt_file) as f: 65 | for l in f: 66 | l = l.strip() 67 | assert "," in l 68 | sp = l.split(",") 69 | x1, y1, w, h = [float(x) for x in sp] 70 | x2 = x1 + w 71 | y2 = y1 + h 72 | box = [x1, y1, x2, y2] 73 | boxes.append(box) 74 | boxes = np.array(boxes) 75 | return boxes 76 | 77 | 78 | pascal_colormap = [ 79 | 0, 0, 0, 80 | 0.5020, 0, 0, 81 | 0, 0.5020, 0, 82 | 0.5020, 0.5020, 0, 83 | 0, 0, 0.5020, 84 | 0.5020, 0, 0.5020, 85 | 0, 0.5020, 0.5020, 86 | 0.5020, 0.5020, 0.5020, 87 | 0.2510, 0, 0, 88 | 0.7529, 0, 0, 89 | 0.2510, 0.5020, 0, 90 | 0.7529, 0.5020, 0, 91 | 0.2510, 0, 0.5020, 92 | 0.7529, 0, 0.5020, 93 | 0.2510, 0.5020, 0.5020, 94 | 0.7529, 0.5020, 0.5020, 95 | 0, 0.2510, 0, 96 | 0.5020, 0.2510, 0, 97 | 0, 0.7529, 0, 98 | 0.5020, 0.7529, 0, 99 | 0, 0.2510, 0.5020, 100 | 0.5020, 0.2510, 0.5020, 101 | 0, 0.7529, 0.5020, 102 | 0.5020, 0.7529, 0.5020, 103 | 0.2510, 0.2510, 0] 104 | 105 | 106 | def save_segmentation_with_colormap(filename, img): 107 | """Saves a segmentation with the pascal colormap as expected for DAVIS eval. 108 | Args: 109 | filename: Where to store the segmentation. 110 | img: A numpy array of the segmentation to be saved. 111 | """ 112 | if img.shape[-1] == 1: 113 | img = img[..., 0] 114 | 115 | # Save with colormap. 
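# A palette ('P'-mode) PIL image is built from the PASCAL colormap scaled to 0-255;
# quantizing the label image against it writes an indexed PNG in the format the DAVIS evaluation tooling expects.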
116 | colormap = (np.array(pascal_colormap) * 255).round().astype('uint8') 117 | colormap_image = PIL.Image.new('P', (16, 16)) 118 | colormap_image.putpalette(colormap) 119 | pil_image = PIL.Image.fromarray(img.astype('uint8')) 120 | pil_image_with_colormap = pil_image.quantize(palette=colormap_image) 121 | pil_image_with_colormap.save(filename) 122 | 123 | -------------------------------------------------------------------------------- /vot_helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | \file vot.py 3 | 4 | @brief Python utility functions for VOT integration 5 | 6 | @author Luka Cehovin, Alessio Dore 7 | 8 | @date 2016 9 | 10 | """ 11 | 12 | import sys 13 | import copy 14 | import collections 15 | import numpy as np 16 | 17 | try: 18 | import trax 19 | except ImportError: 20 | raise Exception('TraX support not found. Please add trax module to Python path.') 21 | 22 | Rectangle = collections.namedtuple('Rectangle', ['x', 'y', 'width', 'height']) 23 | Point = collections.namedtuple('Point', ['x', 'y']) 24 | Polygon = collections.namedtuple('Polygon', ['points']) 25 | 26 | class VOT(object): 27 | """ Base class for Python VOT integration """ 28 | def __init__(self, region_format, channels=None): 29 | """ Constructor 30 | 31 | Args: 32 | region_format: Region format options 33 | """ 34 | assert(region_format in [trax.Region.RECTANGLE, trax.Region.POLYGON, trax.Region.MASK]) 35 | 36 | if channels is None: 37 | channels = ['color'] 38 | elif channels == 'rgbd': 39 | channels = ['color', 'depth'] 40 | elif channels == 'rgbt': 41 | channels = ['color', 'ir'] 42 | elif channels == 'ir': 43 | channels = ['ir'] 44 | else: 45 | raise Exception('Illegal configuration {}.'.format(channels)) 46 | 47 | self._trax = trax.Server([region_format], [trax.Image.PATH], channels, customMetadata=dict(vot="python")) 48 | 49 | request = self._trax.wait() 50 | assert(request.type == 'initialize') 51 | if isinstance(request.region, trax.Polygon): 52 | self._region = Polygon([Point(x[0], x[1]) for x in request.region]) 53 | elif isinstance(request.region, trax.Mask): 54 | self._region = request.region.array(True) 55 | else: 56 | self._region = Rectangle(*request.region.bounds()) 57 | self._image = [x.path() for k, x in request.image.items()] 58 | if len(self._image) == 1: 59 | self._image = self._image[0] 60 | 61 | self._trax.status(request.region) 62 | 63 | def region(self): 64 | """ 65 | Send configuration message to the client and receive the initialization 66 | region and the path of the first image 67 | 68 | Returns: 69 | initialization region 70 | """ 71 | 72 | return self._region 73 | 74 | def report(self, region, confidence = None): 75 | """ 76 | Report the tracking results to the client 77 | 78 | Arguments: 79 | region: region for the frame 80 | """ 81 | assert(isinstance(region, (Rectangle, Polygon, np.ndarray))) 82 | if isinstance(region, Polygon): 83 | tregion = trax.Polygon.create([(x.x, x.y) for x in region.points]) 84 | elif isinstance(region, np.ndarray): 85 | tregion = trax.Mask.create(region) 86 | else: 87 | tregion = trax.Rectangle.create(region.x, region.y, region.width, region.height) 88 | properties = {} 89 | if not confidence is None: 90 | properties['confidence'] = confidence 91 | self._trax.status(tregion, properties) 92 | 93 | def frame(self): 94 | """ 95 | Get a frame (image path) from client 96 | 97 | Returns: 98 | absolute path of the image 99 | """ 100 | if hasattr(self, "_image"): 101 | image = self._image 102 | del self._image 103 
| return image 104 | 105 | request = self._trax.wait() 106 | 107 | if request.type == 'frame': 108 | image = [x.path() for k, x in request.image.items()] 109 | if len(image) == 1: 110 | return image[0] 111 | return image 112 | else: 113 | return None 114 | 115 | 116 | def quit(self): 117 | if hasattr(self, '_trax'): 118 | self._trax.quit() 119 | 120 | def __del__(self): 121 | self.quit() 122 | 123 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: common.py 3 | 4 | import numpy as np 5 | import cv2 6 | 7 | from tensorpack.dataflow import RNGDataFlow 8 | from tensorpack.dataflow.imgaug import transform 9 | 10 | 11 | class DataFromListOfDict(RNGDataFlow): 12 | def __init__(self, lst, keys, shuffle=False): 13 | self._lst = lst 14 | self._keys = keys 15 | self._shuffle = shuffle 16 | self._size = len(lst) 17 | 18 | def __len__(self): 19 | return self._size 20 | 21 | def __iter__(self): 22 | if self._shuffle: 23 | self.rng.shuffle(self._lst) 24 | for dic in self._lst: 25 | dp = [dic[k] for k in self._keys] 26 | yield dp 27 | 28 | 29 | class CustomResize(transform.TransformAugmentorBase): 30 | """ 31 | Try resizing the shortest edge to a certain number 32 | while avoiding the longest edge to exceed max_size. 33 | """ 34 | 35 | def __init__(self, short_edge_length, max_size, interp=cv2.INTER_LINEAR): 36 | """ 37 | Args: 38 | short_edge_length ([int, int]): a [min, max] interval from which to sample the 39 | shortest edge length. 40 | max_size (int): maximum allowed longest edge length. 41 | """ 42 | super(CustomResize, self).__init__() 43 | if isinstance(short_edge_length, int): 44 | short_edge_length = (short_edge_length, short_edge_length) 45 | self._init(locals()) 46 | 47 | def _get_augment_params(self, img): 48 | h, w = img.shape[:2] 49 | size = self.rng.randint( 50 | self.short_edge_length[0], self.short_edge_length[1] + 1) 51 | scale = size * 1.0 / min(h, w) 52 | if h < w: 53 | newh, neww = size, scale * w 54 | else: 55 | newh, neww = scale * h, size 56 | if max(newh, neww) > self.max_size: 57 | scale = self.max_size * 1.0 / max(newh, neww) 58 | newh = newh * scale 59 | neww = neww * scale 60 | neww = int(neww + 0.5) 61 | newh = int(newh + 0.5) 62 | return transform.ResizeTransform(h, w, newh, neww, self.interp) 63 | 64 | 65 | def box_to_point8(boxes): 66 | """ 67 | Args: 68 | boxes: nx4 69 | 70 | Returns: 71 | (nx4)x2 72 | """ 73 | b = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]] 74 | b = b.reshape((-1, 2)) 75 | return b 76 | 77 | 78 | def point8_to_box(points): 79 | """ 80 | Args: 81 | points: (nx4)x2 82 | Returns: 83 | nx4 boxes (x1y1x2y2) 84 | """ 85 | p = points.reshape((-1, 4, 2)) 86 | minxy = p.min(axis=1) # nx2 87 | maxxy = p.max(axis=1) # nx2 88 | return np.concatenate((minxy, maxxy), axis=1) 89 | 90 | 91 | def segmentation_to_mask(polys, height, width): 92 | """ 93 | Convert polygons to binary masks. 94 | 95 | Args: 96 | polys: a list of nx2 float array. Each array contains many (x, y) coordinates. 97 | 98 | Returns: 99 | a binary matrix of (height, width) 100 | """ 101 | polys = [p.flatten().tolist() for p in polys] 102 | assert len(polys) > 0, "Polygons are empty!" 
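# pycocotools encodes each polygon as a COCO run-length mask (RLE), merges them into a single RLE,
# and decodes that back into a dense (height, width) binary mask.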
103 | 104 | import pycocotools.mask as cocomask 105 | rles = cocomask.frPyObjects(polys, height, width) 106 | rle = cocomask.merge(rles) 107 | return cocomask.decode(rle) 108 | 109 | 110 | def clip_boxes(boxes, shape): 111 | """ 112 | Args: 113 | boxes: (...)x4, float 114 | shape: h, w 115 | """ 116 | orig_shape = boxes.shape 117 | boxes = boxes.reshape([-1, 4]) 118 | h, w = shape 119 | boxes[:, [0, 1]] = np.maximum(boxes[:, [0, 1]], 0) 120 | boxes[:, 2] = np.minimum(boxes[:, 2], w) 121 | boxes[:, 3] = np.minimum(boxes[:, 3], h) 122 | return boxes.reshape(orig_shape) 123 | 124 | 125 | def filter_boxes_inside_shape(boxes, shape): 126 | """ 127 | Args: 128 | boxes: (nx4), float 129 | shape: (h, w) 130 | 131 | Returns: 132 | indices: (k, ) 133 | selection: (kx4) 134 | """ 135 | assert boxes.ndim == 2, boxes.shape 136 | assert len(shape) == 2, shape 137 | h, w = shape 138 | indices = np.where( 139 | (boxes[:, 0] >= 0) & 140 | (boxes[:, 1] >= 0) & 141 | (boxes[:, 2] <= w) & 142 | (boxes[:, 3] <= h))[0] 143 | return indices, boxes[indices, :] 144 | 145 | 146 | try: 147 | import pycocotools.mask as cocomask 148 | 149 | # Much faster than utils/np_box_ops 150 | def np_iou(A, B): 151 | def to_xywh(box): 152 | box = box.copy() 153 | box[:, 2] -= box[:, 0] 154 | box[:, 3] -= box[:, 1] 155 | return box 156 | 157 | ret = cocomask.iou( 158 | to_xywh(A), to_xywh(B), 159 | np.zeros((len(B),), dtype=np.bool)) 160 | # can accelerate even more, if using float32 161 | return ret.astype('float32') 162 | 163 | except ImportError: 164 | from utils.np_box_ops import iou as np_iou # noqa 165 | -------------------------------------------------------------------------------- /model_rpn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | from tensorpack.models import Conv2D, layer_register 6 | from tensorpack.tfutils.argscope import argscope 7 | from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope, under_name_scope 8 | from tensorpack.tfutils.summary import add_moving_summary 9 | 10 | from config import config as cfg 11 | from model_box import clip_boxes 12 | 13 | 14 | @layer_register(log_shape=True) 15 | @auto_reuse_variable_scope 16 | def rpn_head(featuremap, channel, num_anchors): 17 | """ 18 | Returns: 19 | label_logits: fHxfWxNA 20 | box_logits: fHxfWxNAx4 21 | """ 22 | with argscope(Conv2D, data_format='channels_first', 23 | kernel_initializer=tf.random_normal_initializer(stddev=0.01)): 24 | hidden = Conv2D('conv0', featuremap, channel, 3, activation=tf.nn.relu) 25 | 26 | label_logits = Conv2D('class', hidden, num_anchors, 1) 27 | box_logits = Conv2D('box', hidden, 4 * num_anchors, 1) 28 | # 1, NA(*4), im/16, im/16 (NCHW) 29 | 30 | label_logits = tf.transpose(label_logits, [0, 2, 3, 1]) # 1xfHxfWxNA 31 | label_logits = tf.squeeze(label_logits, 0) # fHxfWxNA 32 | 33 | shp = tf.shape(box_logits) # 1x(NAx4)xfHxfW 34 | box_logits = tf.transpose(box_logits, [0, 2, 3, 1]) # 1xfHxfWx(NAx4) 35 | box_logits = tf.reshape(box_logits, tf.stack([shp[2], shp[3], num_anchors, 4])) # fHxfWxNAx4 36 | return label_logits, box_logits 37 | 38 | 39 | @under_name_scope() 40 | def rpn_losses(anchor_labels, anchor_boxes, label_logits, box_logits): 41 | """ 42 | Args: 43 | anchor_labels: fHxfWxNA 44 | anchor_boxes: fHxfWxNAx4, encoded 45 | label_logits: fHxfWxNA 46 | box_logits: fHxfWxNAx4 47 | 48 | Returns: 49 | label_loss, box_loss 50 | """ 51 | with tf.device('/cpu:0'): 52 | valid_mask = 
tf.stop_gradient(tf.not_equal(anchor_labels, -1)) 53 | pos_mask = tf.stop_gradient(tf.equal(anchor_labels, 1)) 54 | nr_valid = tf.stop_gradient(tf.count_nonzero(valid_mask, dtype=tf.int32), name='num_valid_anchor') 55 | nr_pos = tf.identity(tf.count_nonzero(pos_mask, dtype=tf.int32), name='num_pos_anchor') 56 | # nr_pos is guaranteed >0 in C4. But in FPN. even nr_valid could be 0. 57 | 58 | valid_anchor_labels = tf.boolean_mask(anchor_labels, valid_mask) 59 | valid_label_logits = tf.boolean_mask(label_logits, valid_mask) 60 | 61 | with tf.name_scope('label_metrics'): 62 | valid_label_prob = tf.nn.sigmoid(valid_label_logits) 63 | summaries = [] 64 | with tf.device('/cpu:0'): 65 | for th in [0.5, 0.2, 0.1]: 66 | valid_prediction = tf.cast(valid_label_prob > th, tf.int32) 67 | nr_pos_prediction = tf.reduce_sum(valid_prediction, name='num_pos_prediction') 68 | pos_prediction_corr = tf.count_nonzero( 69 | tf.logical_and( 70 | valid_label_prob > th, 71 | tf.equal(valid_prediction, valid_anchor_labels)), 72 | dtype=tf.int32) 73 | placeholder = 0.5 # A small value will make summaries appear lower. 74 | recall = tf.cast(tf.truediv(pos_prediction_corr, nr_pos), tf.float32) 75 | recall = tf.where(tf.equal(nr_pos, 0), placeholder, recall, name='recall_th{}'.format(th)) 76 | precision = tf.cast(tf.truediv(pos_prediction_corr, nr_pos_prediction), tf.float32) 77 | precision = tf.where(tf.equal(nr_pos_prediction, 0), 78 | placeholder, precision, name='precision_th{}'.format(th)) 79 | summaries.extend([precision, recall]) 80 | add_moving_summary(*summaries) 81 | 82 | # Per-level loss summaries in FPN may appear lower due to the use of a small placeholder. 83 | # But the total RPN loss will be fine. TODO make the summary op smarter 84 | placeholder = 0. 85 | label_loss = tf.nn.sigmoid_cross_entropy_with_logits( 86 | labels=tf.cast(valid_anchor_labels, tf.float32), logits=valid_label_logits) 87 | label_loss = tf.reduce_sum(label_loss) * (1. / cfg.RPN.BATCH_PER_IM) 88 | label_loss = tf.where(tf.equal(nr_valid, 0), placeholder, label_loss, name='label_loss') 89 | 90 | pos_anchor_boxes = tf.boolean_mask(anchor_boxes, pos_mask) 91 | pos_box_logits = tf.boolean_mask(box_logits, pos_mask) 92 | delta = 1.0 / 9 93 | box_loss = tf.losses.huber_loss( 94 | pos_anchor_boxes, pos_box_logits, delta=delta, 95 | reduction=tf.losses.Reduction.SUM) / delta 96 | box_loss = box_loss * (1. / cfg.RPN.BATCH_PER_IM) 97 | box_loss = tf.where(tf.equal(nr_pos, 0), placeholder, box_loss, name='box_loss') 98 | 99 | add_moving_summary(label_loss, box_loss, nr_valid, nr_pos) 100 | return [label_loss, box_loss] 101 | 102 | 103 | @under_name_scope() 104 | def generate_rpn_proposals(boxes, scores, img_shape, 105 | pre_nms_topk, post_nms_topk=None): 106 | """ 107 | Sample RPN proposals by the following steps: 108 | 1. Pick top k1 by scores 109 | 2. NMS them 110 | 3. Pick top k2 by scores. Default k2 == k1, i.e. does not filter the NMS output. 111 | 112 | Args: 113 | boxes: nx4 float dtype, the proposal boxes. Decoded to floatbox already 114 | scores: n float, the logits 115 | img_shape: [h, w] 116 | pre_nms_topk, post_nms_topk (int): See above. 
117 | 118 | Returns: 119 | boxes: kx4 float 120 | scores: k logits 121 | """ 122 | assert boxes.shape.ndims == 2, boxes.shape 123 | if post_nms_topk is None: 124 | post_nms_topk = pre_nms_topk 125 | 126 | topk = tf.minimum(pre_nms_topk, tf.size(scores)) 127 | topk_scores, topk_indices = tf.nn.top_k(scores, k=topk, sorted=False) 128 | topk_boxes = tf.gather(boxes, topk_indices) 129 | topk_boxes = clip_boxes(topk_boxes, img_shape) 130 | 131 | topk_boxes_x1y1x2y2 = tf.reshape(topk_boxes, (-1, 2, 2)) 132 | topk_boxes_x1y1, topk_boxes_x2y2 = tf.split(topk_boxes_x1y1x2y2, 2, axis=1) 133 | # nx1x2 each 134 | wbhb = tf.squeeze(topk_boxes_x2y2 - topk_boxes_x1y1, axis=1) 135 | valid = tf.reduce_all(wbhb > cfg.RPN.MIN_SIZE, axis=1) # n, 136 | topk_valid_boxes_x1y1x2y2 = tf.boolean_mask(topk_boxes_x1y1x2y2, valid) 137 | topk_valid_scores = tf.boolean_mask(topk_scores, valid) 138 | 139 | # TODO not needed 140 | topk_valid_boxes_y1x1y2x2 = tf.reshape( 141 | tf.reverse(topk_valid_boxes_x1y1x2y2, axis=[2]), 142 | (-1, 4), name='nms_input_boxes') 143 | nms_indices = tf.image.non_max_suppression( 144 | topk_valid_boxes_y1x1y2x2, 145 | topk_valid_scores, 146 | max_output_size=post_nms_topk, 147 | iou_threshold=cfg.RPN.PROPOSAL_NMS_THRESH) 148 | 149 | topk_valid_boxes = tf.reshape(topk_valid_boxes_x1y1x2y2, (-1, 4)) 150 | proposal_boxes = tf.gather(topk_valid_boxes, nms_indices) 151 | proposal_scores = tf.gather(topk_valid_scores, nms_indices) 152 | tf.sigmoid(proposal_scores, name='probs') # for visualization 153 | return tf.stop_gradient(proposal_boxes, name='boxes'), tf.stop_gradient(proposal_scores, name='scores') 154 | -------------------------------------------------------------------------------- /model_box.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: model_box.py 3 | 4 | import numpy as np 5 | from collections import namedtuple 6 | import tensorflow as tf 7 | 8 | from tensorpack.tfutils.scope_utils import under_name_scope 9 | 10 | from config import config 11 | 12 | 13 | @under_name_scope() 14 | def clip_boxes(boxes, window, name=None): 15 | """ 16 | Args: 17 | boxes: nx4, xyxy 18 | window: [h, w] 19 | """ 20 | boxes = tf.maximum(boxes, 0.0) 21 | m = tf.tile(tf.reverse(window, [0]), [2]) # (4,) 22 | boxes = tf.minimum(boxes, tf.cast(m, tf.float32), name=name) 23 | return boxes 24 | 25 | 26 | @under_name_scope() 27 | def decode_bbox_target(box_predictions, anchors): 28 | """ 29 | Args: 30 | box_predictions: (..., 4), logits 31 | anchors: (..., 4), floatbox. Must have the same shape 32 | 33 | Returns: 34 | box_decoded: (..., 4), float32. With the same shape. 35 | """ 36 | orig_shape = tf.shape(anchors) 37 | box_pred_txtytwth = tf.reshape(box_predictions, (-1, 2, 2)) 38 | box_pred_txty, box_pred_twth = tf.split(box_pred_txtytwth, 2, axis=1) 39 | # each is (...)x1x2 40 | anchors_x1y1x2y2 = tf.reshape(anchors, (-1, 2, 2)) 41 | anchors_x1y1, anchors_x2y2 = tf.split(anchors_x1y1x2y2, 2, axis=1) 42 | 43 | waha = anchors_x2y2 - anchors_x1y1 44 | xaya = (anchors_x2y2 + anchors_x1y1) * 0.5 45 | 46 | clip = np.log(config.PREPROC.MAX_SIZE / 16.) 
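# The predicted log width/height is clamped to log(MAX_SIZE / 16) before exponentiating,
# so a spuriously large regression output cannot decode into an absurdly large box.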
47 | wbhb = tf.exp(tf.minimum(box_pred_twth, clip)) * waha 48 | xbyb = box_pred_txty * waha + xaya 49 | x1y1 = xbyb - wbhb * 0.5 50 | x2y2 = xbyb + wbhb * 0.5 # (...)x1x2 51 | out = tf.concat([x1y1, x2y2], axis=-2) 52 | return tf.reshape(out, orig_shape) 53 | 54 | 55 | @under_name_scope() 56 | def encode_bbox_target(boxes, anchors): 57 | """ 58 | Args: 59 | boxes: (..., 4), float32 60 | anchors: (..., 4), float32 61 | 62 | Returns: 63 | box_encoded: (..., 4), float32 with the same shape. 64 | """ 65 | anchors_x1y1x2y2 = tf.reshape(anchors, (-1, 2, 2)) 66 | anchors_x1y1, anchors_x2y2 = tf.split(anchors_x1y1x2y2, 2, axis=1) 67 | waha = anchors_x2y2 - anchors_x1y1 68 | xaya = (anchors_x2y2 + anchors_x1y1) * 0.5 69 | 70 | boxes_x1y1x2y2 = tf.reshape(boxes, (-1, 2, 2)) 71 | boxes_x1y1, boxes_x2y2 = tf.split(boxes_x1y1x2y2, 2, axis=1) 72 | wbhb = boxes_x2y2 - boxes_x1y1 73 | xbyb = (boxes_x2y2 + boxes_x1y1) * 0.5 74 | 75 | # Note that here not all boxes are valid. Some may be zero 76 | txty = (xbyb - xaya) / waha 77 | twth = tf.log(wbhb / waha) # may contain -inf for invalid boxes 78 | encoded = tf.concat([txty, twth], axis=1) # (-1x2x2) 79 | return tf.reshape(encoded, tf.shape(boxes)) 80 | 81 | 82 | @under_name_scope() 83 | def crop_and_resize(image, boxes, box_ind, crop_size, pad_border=True): 84 | """ 85 | Aligned version of tf.image.crop_and_resize, following our definition of floating point boxes. 86 | 87 | Args: 88 | image: NCHW 89 | boxes: nx4, x1y1x2y2 90 | box_ind: (n,) 91 | crop_size (int): 92 | Returns: 93 | n,C,size,size 94 | """ 95 | assert isinstance(crop_size, int), crop_size 96 | boxes = tf.stop_gradient(boxes) 97 | 98 | # TF's crop_and_resize produces zeros on border 99 | if pad_border: 100 | # this can be quite slow 101 | image = tf.pad(image, [[0, 0], [0, 0], [1, 1], [1, 1]], mode='SYMMETRIC') 102 | boxes = boxes + 1 103 | 104 | @under_name_scope() 105 | def transform_fpcoor_for_tf(boxes, image_shape, crop_shape): 106 | """ 107 | The way tf.image.crop_and_resize works (with normalized box): 108 | Initial point (the value of output[0]): x0_box * (W_img - 1) 109 | Spacing: w_box * (W_img - 1) / (W_crop - 1) 110 | Use the above grid to bilinear sample. 111 | 112 | However, what we want is (with fpcoor box): 113 | Spacing: w_box / W_crop 114 | Initial point: x0_box + spacing/2 - 0.5 115 | (-0.5 because bilinear sample (in my definition) assumes floating point coordinate 116 | (0.0, 0.0) is the same as pixel value (0, 0)) 117 | 118 | This function transform fpcoor boxes to a format to be used by tf.image.crop_and_resize 119 | 120 | Returns: 121 | y1x1y2x2 122 | """ 123 | x0, y0, x1, y1 = tf.split(boxes, 4, axis=1) 124 | 125 | spacing_w = (x1 - x0) / tf.cast(crop_shape[1], tf.float32) 126 | spacing_h = (y1 - y0) / tf.cast(crop_shape[0], tf.float32) 127 | 128 | imshape = [tf.cast(image_shape[0] - 1, tf.float32), tf.cast(image_shape[1] - 1, tf.float32)] 129 | nx0 = (x0 + spacing_w / 2 - 0.5) / imshape[1] 130 | ny0 = (y0 + spacing_h / 2 - 0.5) / imshape[0] 131 | 132 | nw = spacing_w * tf.cast(crop_shape[1] - 1, tf.float32) / imshape[1] 133 | nh = spacing_h * tf.cast(crop_shape[0] - 1, tf.float32) / imshape[0] 134 | 135 | return tf.concat([ny0, nx0, ny0 + nh, nx0 + nw], axis=1) 136 | 137 | # Expand bbox to a minium size of 1 138 | # boxes_x1y1, boxes_x2y2 = tf.split(boxes, 2, axis=1) 139 | # boxes_wh = boxes_x2y2 - boxes_x1y1 140 | # boxes_center = tf.reshape((boxes_x2y2 + boxes_x1y1) * 0.5, [-1, 2]) 141 | # boxes_newwh = tf.maximum(boxes_wh, 1.) 
142 | # boxes_x1y1new = boxes_center - boxes_newwh * 0.5 143 | # boxes_x2y2new = boxes_center + boxes_newwh * 0.5 144 | # boxes = tf.concat([boxes_x1y1new, boxes_x2y2new], axis=1) 145 | 146 | image_shape = tf.shape(image)[2:] 147 | boxes = transform_fpcoor_for_tf(boxes, image_shape, [crop_size, crop_size]) 148 | image = tf.transpose(image, [0, 2, 3, 1]) # nhwc 149 | ret = tf.image.crop_and_resize( 150 | image, boxes, tf.cast(box_ind, tf.int32), 151 | crop_size=[crop_size, crop_size]) 152 | ret = tf.transpose(ret, [0, 3, 1, 2]) # ncss 153 | return ret 154 | 155 | 156 | @under_name_scope() 157 | def roi_align(featuremap, boxes, resolution): 158 | """ 159 | Args: 160 | featuremap: 1xCxHxW 161 | boxes: Nx4 floatbox 162 | resolution: output spatial resolution 163 | 164 | Returns: 165 | NxCx res x res 166 | """ 167 | # sample 4 locations per roi bin 168 | ret = crop_and_resize( 169 | featuremap, boxes, 170 | tf.zeros([tf.shape(boxes)[0]], dtype=tf.int32), 171 | resolution * 2) 172 | ret = tf.nn.avg_pool(ret, [1, 1, 2, 2], [1, 1, 2, 2], padding='SAME', data_format='NCHW') 173 | return ret 174 | 175 | 176 | class RPNAnchors(namedtuple('_RPNAnchors', ['boxes', 'gt_labels', 'gt_boxes'])): 177 | """ 178 | boxes (FS x FS x NA x 4): The anchor boxes. 179 | gt_labels (FS x FS x NA): 180 | gt_boxes (FS x FS x NA x 4): Groundtruth boxes corresponding to each anchor. 181 | """ 182 | def encoded_gt_boxes(self): 183 | return encode_bbox_target(self.gt_boxes, self.boxes) 184 | 185 | def decode_logits(self, logits): 186 | return decode_bbox_target(logits, self.boxes) 187 | 188 | @under_name_scope() 189 | def narrow_to(self, featuremap): 190 | """ 191 | Slice anchors to the spatial size of this featuremap. 192 | """ 193 | shape2d = tf.shape(featuremap)[2:] # h,w 194 | slice3d = tf.concat([shape2d, [-1]], axis=0) 195 | slice4d = tf.concat([shape2d, [-1, -1]], axis=0) 196 | boxes = tf.slice(self.boxes, [0, 0, 0, 0], slice4d) 197 | gt_labels = tf.slice(self.gt_labels, [0, 0, 0], slice3d) 198 | gt_boxes = tf.slice(self.gt_boxes, [0, 0, 0, 0], slice4d) 199 | return RPNAnchors(boxes, gt_labels, gt_boxes) 200 | 201 | 202 | if __name__ == '__main__': 203 | """ 204 | Demonstrate what's wrong with tf.image.crop_and_resize: 205 | """ 206 | import tensorflow.contrib.eager as tfe 207 | tfe.enable_eager_execution() 208 | 209 | # want to crop 2x2 out of a 5x5 image, and resize to 4x4 210 | image = np.arange(25).astype('float32').reshape(5, 5) 211 | boxes = np.asarray([[1, 1, 3, 3]], dtype='float32') 212 | target = 4 213 | 214 | print(crop_and_resize( 215 | image[None, None, :, :], boxes, [0], target)[0][0]) 216 | """ 217 | Expected values: 218 | 4.5 5 5.5 6 219 | 7 7.5 8 8.5 220 | 9.5 10 10.5 11 221 | 12 12.5 13 13.5 222 | 223 | You cannot easily get the above results with tf.image.crop_and_resize. 
224 | Try out yourself here: 225 | """ 226 | print(tf.image.crop_and_resize( 227 | image[None, :, :, None], 228 | np.asarray([[1, 1, 2, 2]]) / 4.0, [0], [target, target])[0][:, :, 0]) 229 | -------------------------------------------------------------------------------- /tracking/argmax_tracker.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import random 3 | import numpy as np 4 | from got10k.trackers import Tracker 5 | from config import config as cfg, finalize_configs 6 | from tensorpack import PredictConfig, get_model_loader, OfflinePredictor, logger 7 | 8 | from train import ResNetFPNModel 9 | from common import CustomResize, box_to_point8, point8_to_box 10 | 11 | 12 | class PrecomputingReferenceTracker(Tracker): 13 | def __init__(self, name, need_network=True, need_img=True, model="best"): 14 | super().__init__(name=name, is_deterministic=True) 15 | self._resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE) 16 | self._prev_box = None 17 | self._ff_gt_feats = None 18 | self._need_network = need_network 19 | self._need_img = need_img 20 | self._rotated_bbox = None 21 | 22 | if need_network: 23 | logger.set_logger_dir("/tmp/test_log_/" + str(random.randint(0, 10000)), 'd') 24 | if model == "best": 25 | load = "train_log/hard_mining3/model-1360500" 26 | elif model == "nohardexamples": 27 | load = "train_log/condrcnn_all_2gpu_lrreduce2/model-1200500" 28 | elif model == "newrpn": 29 | load = "train_log/newrpn1/model" 30 | elif model =="resnet50_nohardexamples": 31 | load = "train_log/condrcnn_all_resnet50/model-1200500" 32 | cfg.BACKBONE.RESNET_NUM_BLOCKS = [3, 4, 6, 3] 33 | elif model =="resnet50": 34 | load = "train_log/hard_mining3_resnet50/model-1360500" 35 | cfg.BACKBONE.RESNET_NUM_BLOCKS = [3, 4, 6, 3] 36 | elif model == "gotonly": 37 | load = "train_log/hard_mining3_onlygot/model-1361000" 38 | elif model.startswith("checkpoint:"): 39 | load = model.replace("checkpoint:", "") 40 | else: 41 | assert False, ("unknown model", model) 42 | from dataset import DetectionDataset 43 | # init tensorpack model 44 | # cfg.freeze(False) 45 | DetectionDataset() # initialize the config with information from our dataset 46 | 47 | cfg.EXTRACT_GT_FEATURES = True 48 | cfg.MODE_TRACK = False 49 | extract_model = ResNetFPNModel() 50 | extract_ff_feats_cfg = PredictConfig( 51 | model=extract_model, 52 | session_init=get_model_loader(load), 53 | input_names=['image', 'roi_boxes'], 54 | output_names=['rpn/feature']) 55 | finalize_configs(is_training=False) 56 | self._extract_func = OfflinePredictor(extract_ff_feats_cfg) 57 | 58 | cfg.EXTRACT_GT_FEATURES = False 59 | cfg.MODE_TRACK = True 60 | cfg.USE_PRECOMPUTED_REF_FEATURES = True 61 | self._pred_func = self._make_pred_func(load) 62 | 63 | def _resize_image_together_with_boxes(self, img, *list_of_box_or_boxes): 64 | resized_img, params = self._resizer.augment_return_params(img) 65 | res_boxes = [] 66 | for box_or_boxes in list_of_box_or_boxes: 67 | expand = len(box_or_boxes.shape) == 1 68 | if expand: 69 | boxes = box_or_boxes[np.newaxis] 70 | else: 71 | boxes = box_or_boxes 72 | points = box_to_point8(boxes) 73 | points = self._resizer.augment_coords(points, params) 74 | resized_boxes = point8_to_box(points) 75 | if expand: 76 | resized_boxes = np.squeeze(resized_boxes, axis=0) 77 | res_boxes.append(resized_boxes) 78 | if len(res_boxes) == 1: 79 | res_boxes = res_boxes[0] 80 | return resized_img, res_boxes 81 | 82 | def _make_pred_func(self, load): 83 | from train import 
ResNetFPNTrackModel 84 | pred_model = ResNetFPNTrackModel() 85 | predcfg = PredictConfig( 86 | model=pred_model, 87 | session_init=get_model_loader(load), 88 | input_names=pred_model.get_inference_tensor_names()[0], 89 | output_names=pred_model.get_inference_tensor_names()[1]) 90 | return OfflinePredictor(predcfg) 91 | 92 | def init(self, image, box): 93 | ref_img = np.array(image)[..., ::-1] 94 | if ref_img is None: 95 | raise ValueError("failed to load img" + image.filename) 96 | box[2] += box[0] 97 | box[3] += box[1] 98 | ref_bbox = box 99 | self._prev_box = box 100 | if self._need_network: 101 | resized_ref_img, resized_ref_box = self._resize_image_together_with_boxes(ref_img, ref_bbox) 102 | feats, = self._extract_func(resized_ref_img, resized_ref_box[np.newaxis]) 103 | self._ff_gt_feats = feats[0] 104 | 105 | def update(self, image, use_confidences=False): 106 | if self._need_img: 107 | target_img = np.array(image)[..., ::-1] 108 | if target_img is None: 109 | raise ValueError("failed to load img" + str(target_img)) 110 | else: 111 | target_img = None 112 | 113 | new_box, score = self._update(target_img) 114 | if new_box is not None: 115 | self._prev_box = new_box 116 | 117 | ret_box = self._prev_box.copy() 118 | ret_box[2] -= ret_box[0] 119 | ret_box[3] -= ret_box[1] 120 | if self._rotated_bbox is not None: 121 | ret_box = self._rotated_bbox 122 | if use_confidences: 123 | return ret_box, score 124 | else: 125 | return ret_box 126 | 127 | 128 | class ArgmaxTracker(PrecomputingReferenceTracker): 129 | def __init__(self): 130 | super().__init__("ArgmaxTracker") 131 | 132 | def _update(self, img): 133 | from eval import predict_image_track_with_precomputed_ref_features 134 | results = predict_image_track_with_precomputed_ref_features(img, self._ff_gt_feats, self._pred_func) 135 | det_boxes = np.array([r.box for r in results]) 136 | det_scores = np.array([r.score for r in results]) 137 | if len(det_boxes) > 0: 138 | return det_boxes[0], det_scores[0] 139 | else: 140 | return None, None 141 | 142 | 143 | # just there to test the precomputing on against 144 | # not intended to be used anymore 145 | class NonPrecomputingArgmaxTracker(Tracker): 146 | def __init__(self): 147 | super().__init__(name='ArgmaxTracker', is_deterministic=True) 148 | self._ref_img = None 149 | self._ref_bbox = None 150 | self._prev_box = None 151 | model = self._init_model() 152 | load = "train_log/condrcnn_onlygot/model-460000" 153 | predcfg = PredictConfig( 154 | model=model, 155 | session_init=get_model_loader(load), 156 | input_names=model.get_inference_tensor_names()[0], 157 | output_names=model.get_inference_tensor_names()[1]) 158 | self._pred_func = OfflinePredictor(predcfg) 159 | 160 | def _init_model(self): 161 | logger.set_logger_dir("/tmp/test_log/", 'd') 162 | from dataset import DetectionDataset 163 | from train import ResNetFPNTrackModel 164 | # init tensorpack model 165 | cfg.freeze(False) 166 | model = ResNetFPNTrackModel() 167 | DetectionDataset() # initialize the config with information from our dataset 168 | finalize_configs(is_training=False) 169 | return model 170 | 171 | def init(self, image, box): 172 | self._ref_img = cv2.imread(image.filename, cv2.IMREAD_COLOR) 173 | if self._ref_img is None: 174 | raise ValueError("failed to load img" + str(self._ref_img)) 175 | box[2] += box[0] 176 | box[3] += box[1] 177 | self._ref_bbox = box 178 | self._prev_box = box 179 | 180 | def update(self, image): 181 | target_img = cv2.imread(image.filename, cv2.IMREAD_COLOR) 182 | # assert target_img is not 
None 183 | if target_img is None: 184 | raise ValueError("failed to load img" + str(target_img)) 185 | from eval import predict_image_track 186 | results = predict_image_track(target_img, self._ref_img, self._ref_bbox, self._pred_func) 187 | det_boxes = np.array([r.box for r in results]) 188 | det_scores = np.array([r.score for r in results]) 189 | if len(det_boxes) > 0: 190 | self._prev_box = det_boxes[0] 191 | 192 | ret_box = self._prev_box.copy() 193 | ret_box[2] -= ret_box[0] 194 | ret_box[3] -= ret_box[1] 195 | return ret_box 196 | -------------------------------------------------------------------------------- /basemodel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: basemodel.py 3 | 4 | import numpy as np 5 | from contextlib import ExitStack, contextmanager 6 | import tensorflow as tf 7 | 8 | from tensorpack.models import BatchNorm, Conv2D, MaxPooling, layer_register 9 | from tensorpack.tfutils import argscope 10 | from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope 11 | from tensorpack.tfutils.varreplace import custom_getter_scope, freeze_variables 12 | 13 | from config import config as cfg 14 | 15 | 16 | @layer_register(log_shape=True) 17 | def GroupNorm(x, group=32, gamma_initializer=tf.constant_initializer(1.)): 18 | shape = x.get_shape().as_list() 19 | ndims = len(shape) 20 | assert ndims == 4, shape 21 | chan = shape[1] 22 | assert chan % group == 0, chan 23 | group_size = chan // group 24 | 25 | orig_shape = tf.shape(x) 26 | h, w = orig_shape[2], orig_shape[3] 27 | 28 | x = tf.reshape(x, tf.stack([-1, group, group_size, h, w])) 29 | 30 | mean, var = tf.nn.moments(x, [2, 3, 4], keep_dims=True) 31 | 32 | new_shape = [1, group, group_size, 1, 1] 33 | 34 | beta = tf.get_variable('beta', [chan], initializer=tf.constant_initializer()) 35 | beta = tf.reshape(beta, new_shape) 36 | 37 | gamma = tf.get_variable('gamma', [chan], initializer=gamma_initializer) 38 | gamma = tf.reshape(gamma, new_shape) 39 | 40 | out = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-5, name='output') 41 | return tf.reshape(out, orig_shape, name='output') 42 | 43 | 44 | def freeze_affine_getter(getter, *args, **kwargs): 45 | # custom getter to freeze affine params inside bn 46 | name = args[0] if len(args) else kwargs.get('name') 47 | if name.endswith('/gamma') or name.endswith('/beta'): 48 | kwargs['trainable'] = False 49 | ret = getter(*args, **kwargs) 50 | tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, ret) 51 | else: 52 | ret = getter(*args, **kwargs) 53 | return ret 54 | 55 | 56 | def maybe_reverse_pad(topleft, bottomright): 57 | if cfg.BACKBONE.TF_PAD_MODE: 58 | return [topleft, bottomright] 59 | return [bottomright, topleft] 60 | 61 | 62 | @contextmanager 63 | def backbone_scope(freeze): 64 | """ 65 | Args: 66 | freeze (bool): whether to freeze all the variables under the scope 67 | """ 68 | def nonlin(x): 69 | x = get_norm()(x) 70 | return tf.nn.relu(x) 71 | 72 | with argscope([Conv2D, MaxPooling, BatchNorm], data_format='channels_first'), \ 73 | argscope(Conv2D, use_bias=False, activation=nonlin, 74 | kernel_initializer=tf.variance_scaling_initializer( 75 | scale=2.0, mode='fan_out')), \ 76 | ExitStack() as stack: 77 | if cfg.BACKBONE.NORM in ['FreezeBN', 'SyncBN']: 78 | if freeze or cfg.BACKBONE.NORM == 'FreezeBN': 79 | stack.enter_context(argscope(BatchNorm, training=False)) 80 | else: 81 | stack.enter_context(argscope( 82 | BatchNorm, sync_statistics='nccl' if cfg.TRAINER == 'replicated' 
else 'horovod')) 83 | 84 | if freeze: 85 | stack.enter_context(freeze_variables(stop_gradient=False, skip_collection=True)) 86 | else: 87 | # the layers are not completely freezed, but we may want to only freeze the affine 88 | if cfg.BACKBONE.FREEZE_AFFINE: 89 | stack.enter_context(custom_getter_scope(freeze_affine_getter)) 90 | yield 91 | 92 | 93 | def image_preprocess(image, bgr=True): 94 | with tf.name_scope('image_preprocess'): 95 | if image.dtype.base_dtype != tf.float32: 96 | image = tf.cast(image, tf.float32) 97 | 98 | mean = cfg.PREPROC.PIXEL_MEAN 99 | std = np.asarray(cfg.PREPROC.PIXEL_STD) 100 | if bgr: 101 | mean = mean[::-1] 102 | std = std[::-1] 103 | image_mean = tf.constant(mean, dtype=tf.float32) 104 | image_invstd = tf.constant(1.0 / std, dtype=tf.float32) 105 | image = (image - image_mean) * image_invstd 106 | return image 107 | 108 | 109 | def get_norm(zero_init=False): 110 | if cfg.BACKBONE.NORM == 'None': 111 | return lambda x: x 112 | if cfg.BACKBONE.NORM == 'GN': 113 | Norm = GroupNorm 114 | layer_name = 'gn' 115 | else: 116 | Norm = BatchNorm 117 | layer_name = 'bn' 118 | return lambda x: Norm(layer_name, x, gamma_initializer=tf.zeros_initializer() if zero_init else None) 119 | 120 | 121 | def resnet_shortcut(l, n_out, stride, activation=tf.identity): 122 | n_in = l.shape[1] 123 | if n_in != n_out: # change dimension when channel is not the same 124 | # TF's SAME mode output ceil(x/stride), which is NOT what we want when x is odd and stride is 2 125 | # In FPN mode, the images are pre-padded already. 126 | if not cfg.MODE_FPN and stride == 2: 127 | l = l[:, :, :-1, :-1] 128 | return Conv2D('convshortcut', l, n_out, 1, 129 | strides=stride, activation=activation) 130 | else: 131 | return l 132 | 133 | 134 | def resnet_bottleneck(l, ch_out, stride): 135 | shortcut = l 136 | if cfg.BACKBONE.STRIDE_1X1: 137 | if stride == 2: 138 | l = l[:, :, :-1, :-1] 139 | l = Conv2D('conv1', l, ch_out, 1, strides=stride) 140 | l = Conv2D('conv2', l, ch_out, 3, strides=1) 141 | else: 142 | l = Conv2D('conv1', l, ch_out, 1, strides=1) 143 | if stride == 2: 144 | l = tf.pad(l, [[0, 0], [0, 0], maybe_reverse_pad(0, 1), maybe_reverse_pad(0, 1)]) 145 | l = Conv2D('conv2', l, ch_out, 3, strides=2, padding='VALID') 146 | else: 147 | l = Conv2D('conv2', l, ch_out, 3, strides=stride) 148 | if cfg.BACKBONE.NORM != 'None': 149 | l = Conv2D('conv3', l, ch_out * 4, 1, activation=get_norm(zero_init=True)) 150 | else: 151 | l = Conv2D('conv3', l, ch_out * 4, 1, activation=tf.identity, 152 | kernel_initializer=tf.constant_initializer()) 153 | ret = l + resnet_shortcut(shortcut, ch_out * 4, stride, activation=get_norm(zero_init=False)) 154 | return tf.nn.relu(ret, name='output') 155 | 156 | 157 | def resnet_group(name, l, block_func, features, count, stride): 158 | with tf.variable_scope(name): 159 | for i in range(0, count): 160 | with tf.variable_scope('block{}'.format(i)): 161 | l = block_func(l, features, stride if i == 0 else 1) 162 | return l 163 | 164 | 165 | def resnet_c4_backbone(image, num_blocks): 166 | assert len(num_blocks) == 3 167 | freeze_at = cfg.BACKBONE.FREEZE_AT 168 | with backbone_scope(freeze=freeze_at > 0): 169 | l = tf.pad(image, [[0, 0], [0, 0], maybe_reverse_pad(2, 3), maybe_reverse_pad(2, 3)]) 170 | l = Conv2D('conv0', l, 64, 7, strides=2, padding='VALID') 171 | l = tf.pad(l, [[0, 0], [0, 0], maybe_reverse_pad(0, 1), maybe_reverse_pad(0, 1)]) 172 | l = MaxPooling('pool0', l, 3, strides=2, padding='VALID') 173 | 174 | with backbone_scope(freeze=freeze_at > 1): 175 | c2 = 
resnet_group('group0', l, resnet_bottleneck, 64, num_blocks[0], 1) 176 | with backbone_scope(freeze=False): 177 | c3 = resnet_group('group1', c2, resnet_bottleneck, 128, num_blocks[1], 2) 178 | c4 = resnet_group('group2', c3, resnet_bottleneck, 256, num_blocks[2], 2) 179 | # 16x downsampling up to now 180 | return c4 181 | 182 | 183 | @auto_reuse_variable_scope 184 | def resnet_conv5(image, num_block): 185 | with backbone_scope(freeze=False): 186 | l = resnet_group('group3', image, resnet_bottleneck, 512, num_block, 2) 187 | return l 188 | 189 | 190 | def resnet_fpn_backbone(image, num_blocks): 191 | freeze_at = cfg.BACKBONE.FREEZE_AT 192 | shape2d = tf.shape(image)[2:] 193 | mult = float(cfg.FPN.RESOLUTION_REQUIREMENT) 194 | new_shape2d = tf.cast(tf.ceil(tf.cast(shape2d, tf.float32) / mult) * mult, tf.int32) 195 | pad_shape2d = new_shape2d - shape2d 196 | assert len(num_blocks) == 4, num_blocks 197 | with backbone_scope(freeze=freeze_at > 0): 198 | chan = image.shape[1] 199 | pad_base = maybe_reverse_pad(2, 3) 200 | l = tf.pad(image, tf.stack( 201 | [[0, 0], [0, 0], 202 | [pad_base[0], pad_base[1] + pad_shape2d[0]], 203 | [pad_base[0], pad_base[1] + pad_shape2d[1]]])) 204 | l.set_shape([None, chan, None, None]) 205 | l = Conv2D('conv0', l, 64, 7, strides=2, padding='VALID') 206 | l = tf.pad(l, [[0, 0], [0, 0], maybe_reverse_pad(0, 1), maybe_reverse_pad(0, 1)]) 207 | l = MaxPooling('pool0', l, 3, strides=2, padding='VALID') 208 | with backbone_scope(freeze=freeze_at > 1): 209 | c2 = resnet_group('group0', l, resnet_bottleneck, 64, num_blocks[0], 1) 210 | with backbone_scope(freeze=freeze_at > 2): 211 | c3 = resnet_group('group1', c2, resnet_bottleneck, 128, num_blocks[1], 2) 212 | c4 = resnet_group('group2', c3, resnet_bottleneck, 256, num_blocks[2], 2) 213 | c5 = resnet_group('group3', c4, resnet_bottleneck, 512, num_blocks[3], 2) 214 | # 32x downsampling up to now 215 | # size of c5: ceil(input/32) 216 | return c2, c3, c4, c5 217 | -------------------------------------------------------------------------------- /model_fpn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from tensorpack.models import Conv2D, FixedUnPooling, MaxPooling, layer_register 8 | from tensorpack.tfutils.argscope import argscope 9 | from tensorpack.tfutils.scope_utils import under_name_scope 10 | from tensorpack.tfutils.summary import add_moving_summary 11 | from tensorpack.tfutils.tower import get_current_tower_context 12 | 13 | from basemodel import GroupNorm 14 | from config import config as cfg 15 | from model_box import roi_align 16 | from model_rpn import generate_rpn_proposals, rpn_losses 17 | from utils.box_ops import area as tf_area 18 | 19 | 20 | @layer_register(log_shape=True) 21 | def fpn_model(features): 22 | """ 23 | Args: 24 | features ([tf.Tensor]): ResNet features c2-c5 25 | 26 | Returns: 27 | [tf.Tensor]: FPN features p2-p6 28 | """ 29 | assert len(features) == 4, features 30 | num_channel = cfg.FPN.NUM_CHANNEL 31 | 32 | use_gn = cfg.FPN.NORM == 'GN' 33 | 34 | def upsample2x(name, x): 35 | return FixedUnPooling( 36 | name, x, 2, unpool_mat=np.ones((2, 2), dtype='float32'), 37 | data_format='channels_first') 38 | 39 | # tf.image.resize is, again, not aligned. 
40 | # with tf.name_scope(name): 41 | # shape2d = tf.shape(x)[2:] 42 | # x = tf.transpose(x, [0, 2, 3, 1]) 43 | # x = tf.image.resize_nearest_neighbor(x, shape2d * 2, align_corners=True) 44 | # x = tf.transpose(x, [0, 3, 1, 2]) 45 | # return x 46 | 47 | with argscope(Conv2D, data_format='channels_first', 48 | activation=tf.identity, use_bias=True, 49 | kernel_initializer=tf.variance_scaling_initializer(scale=1.)): 50 | lat_2345 = [Conv2D('lateral_1x1_c{}'.format(i + 2), c, num_channel, 1) 51 | for i, c in enumerate(features)] 52 | if use_gn: 53 | lat_2345 = [GroupNorm('gn_c{}'.format(i + 2), c) for i, c in enumerate(lat_2345)] 54 | lat_sum_5432 = [] 55 | for idx, lat in enumerate(lat_2345[::-1]): 56 | if idx == 0: 57 | lat_sum_5432.append(lat) 58 | else: 59 | lat = lat + upsample2x('upsample_lat{}'.format(6 - idx), lat_sum_5432[-1]) 60 | lat_sum_5432.append(lat) 61 | p2345 = [Conv2D('posthoc_3x3_p{}'.format(i + 2), c, num_channel, 3) 62 | for i, c in enumerate(lat_sum_5432[::-1])] 63 | if use_gn: 64 | p2345 = [GroupNorm('gn_p{}'.format(i + 2), c) for i, c in enumerate(p2345)] 65 | p6 = MaxPooling('maxpool_p6', p2345[-1], pool_size=1, strides=2, data_format='channels_first', padding='VALID') 66 | return p2345 + [p6] 67 | 68 | 69 | @under_name_scope() 70 | def fpn_map_rois_to_levels(boxes): 71 | """ 72 | Assign boxes to level 2~5. 73 | 74 | Args: 75 | boxes (nx4): 76 | 77 | Returns: 78 | [tf.Tensor]: 4 tensors for level 2-5. Each tensor is a vector of indices of boxes in its level. 79 | [tf.Tensor]: 4 tensors, the gathered boxes in each level. 80 | 81 | Be careful that the returned tensor could be empty. 82 | """ 83 | sqrtarea = tf.sqrt(tf_area(boxes)) 84 | level = tf.cast(tf.floor( 85 | 4 + tf.log(sqrtarea * (1. / 224) + 1e-6) * (1.0 / np.log(2))), tf.int32) 86 | 87 | # RoI levels range from 2~5 (not 6) 88 | level_ids = [ 89 | tf.where(level <= 2), 90 | tf.where(tf.equal(level, 3)), # == is not supported 91 | tf.where(tf.equal(level, 4)), 92 | tf.where(level >= 5)] 93 | level_ids = [tf.reshape(x, [-1], name='roi_level{}_id'.format(i + 2)) 94 | for i, x in enumerate(level_ids)] 95 | num_in_levels = [tf.size(x, name='num_roi_level{}'.format(i + 2)) 96 | for i, x in enumerate(level_ids)] 97 | add_moving_summary(*num_in_levels) 98 | 99 | level_boxes = [tf.gather(boxes, ids) for ids in level_ids] 100 | return level_ids, level_boxes 101 | 102 | 103 | @under_name_scope() 104 | def multilevel_roi_align(features, rcnn_boxes, resolution): 105 | """ 106 | Args: 107 | features ([tf.Tensor]): 4 FPN feature level 2-5 108 | rcnn_boxes (tf.Tensor): nx4 boxes 109 | resolution (int): output spatial resolution 110 | Returns: 111 | NxC x res x res 112 | """ 113 | assert len(features) == 4, features 114 | # Reassign rcnn_boxes to levels 115 | level_ids, level_boxes = fpn_map_rois_to_levels(rcnn_boxes) 116 | all_rois = [] 117 | 118 | # Crop patches from corresponding levels 119 | for i, boxes, featuremap in zip(itertools.count(), level_boxes, features): 120 | with tf.name_scope('roi_level{}'.format(i + 2)): 121 | boxes_on_featuremap = boxes * (1.0 / cfg.FPN.ANCHOR_STRIDES[i]) 122 | all_rois.append(roi_align(featuremap, boxes_on_featuremap, resolution)) 123 | 124 | # this can fail if using TF<=1.8 with MKL build 125 | all_rois = tf.concat(all_rois, axis=0) # NCHW 126 | # Unshuffle to the original order, to match the original samples 127 | level_id_perm = tf.concat(level_ids, axis=0) # A permutation of 1~N 128 | level_id_invert_perm = tf.invert_permutation(level_id_perm) 129 | all_rois = tf.gather(all_rois, 
level_id_invert_perm) 130 | return all_rois 131 | 132 | 133 | @under_name_scope() 134 | def neck_roi_align(features, rcnn_boxes, resolution): 135 | """ 136 | Args: 137 | features ([tf.Tensor]): 4 FPN feature level 2-5 138 | rcnn_boxes (tf.Tensor): nx4 boxes 139 | resolution (int): output spatial resolution 140 | Returns: 141 | NxC x res x res 142 | """ 143 | assert len(features) == 4, features 144 | aligned_features = None 145 | for i in range(4): 146 | with tf.name_scope('roi_level{}'.format(i + 2)): 147 | boxes_on_featuremap = rcnn_boxes * (1.0 / cfg.FPN.ANCHOR_STRIDES[i]) 148 | level_features = roi_align(features[i], boxes_on_featuremap, resolution) 149 | if aligned_features is None: 150 | aligned_features = level_features 151 | else: 152 | aligned_features += level_features 153 | return aligned_features 154 | 155 | 156 | def multilevel_rpn_losses( 157 | multilevel_anchors, multilevel_label_logits, multilevel_box_logits): 158 | """ 159 | Args: 160 | multilevel_anchors: #lvl RPNAnchors 161 | multilevel_label_logits: #lvl tensors of shape HxWxA 162 | multilevel_box_logits: #lvl tensors of shape HxWxAx4 163 | 164 | Returns: 165 | label_loss, box_loss 166 | """ 167 | num_lvl = len(cfg.FPN.ANCHOR_STRIDES) 168 | assert len(multilevel_anchors) == num_lvl 169 | assert len(multilevel_label_logits) == num_lvl 170 | assert len(multilevel_box_logits) == num_lvl 171 | 172 | losses = [] 173 | with tf.name_scope('rpn_losses'): 174 | for lvl in range(num_lvl): 175 | anchors = multilevel_anchors[lvl] 176 | label_loss, box_loss = rpn_losses( 177 | anchors.gt_labels, anchors.encoded_gt_boxes(), 178 | multilevel_label_logits[lvl], multilevel_box_logits[lvl], 179 | name_scope='level{}'.format(lvl + 2)) 180 | losses.extend([label_loss, box_loss]) 181 | 182 | total_label_loss = tf.add_n(losses[::2], name='label_loss') 183 | total_box_loss = tf.add_n(losses[1::2], name='box_loss') 184 | add_moving_summary(total_label_loss, total_box_loss) 185 | return [total_label_loss, total_box_loss] 186 | 187 | 188 | @under_name_scope() 189 | def generate_fpn_proposals( 190 | multilevel_pred_boxes, multilevel_label_logits, image_shape2d): 191 | """ 192 | Args: 193 | multilevel_pred_boxes: #lvl HxWxAx4 boxes 194 | multilevel_label_logits: #lvl tensors of shape HxWxA 195 | 196 | Returns: 197 | boxes: kx4 float 198 | scores: k logits 199 | """ 200 | num_lvl = len(cfg.FPN.ANCHOR_STRIDES) 201 | assert len(multilevel_pred_boxes) == num_lvl 202 | assert len(multilevel_label_logits) == num_lvl 203 | 204 | training = get_current_tower_context().is_training 205 | all_boxes = [] 206 | all_scores = [] 207 | if cfg.FPN.PROPOSAL_MODE == 'Level': 208 | fpn_nms_topk = cfg.RPN.TRAIN_PER_LEVEL_NMS_TOPK if training else cfg.RPN.TEST_PER_LEVEL_NMS_TOPK 209 | for lvl in range(num_lvl): 210 | with tf.name_scope('Lvl{}'.format(lvl + 2)): 211 | pred_boxes_decoded = multilevel_pred_boxes[lvl] 212 | proposal_boxes, proposal_scores = generate_rpn_proposals( 213 | tf.reshape(pred_boxes_decoded, [-1, 4]), 214 | tf.reshape(multilevel_label_logits[lvl], [-1]), 215 | image_shape2d, fpn_nms_topk) 216 | all_boxes.append(proposal_boxes) 217 | all_scores.append(proposal_scores) 218 | 219 | proposal_boxes = tf.concat(all_boxes, axis=0) # nx4 220 | proposal_scores = tf.concat(all_scores, axis=0) # n 221 | # Here we are different from Detectron. 222 | # Detectron picks top-k within the batch, rather than within an image. However we do not have a batch. 
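# For example, with the defaults in config.py ('Level' mode, 5 FPN levels,
# cfg.RPN.TEST_PER_LEVEL_NMS_TOPK = 1000), the concat above can hold up to
# 5 * 1000 candidate boxes for this single image; the top-k below then keeps
# only the 1000 with the highest RPN logits.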
223 | proposal_topk = tf.minimum(tf.size(proposal_scores), fpn_nms_topk) 224 | proposal_scores, topk_indices = tf.nn.top_k(proposal_scores, k=proposal_topk, sorted=False) 225 | proposal_boxes = tf.gather(proposal_boxes, topk_indices) 226 | else: 227 | for lvl in range(num_lvl): 228 | with tf.name_scope('Lvl{}'.format(lvl + 2)): 229 | pred_boxes_decoded = multilevel_pred_boxes[lvl] 230 | all_boxes.append(tf.reshape(pred_boxes_decoded, [-1, 4])) 231 | all_scores.append(tf.reshape(multilevel_label_logits[lvl], [-1])) 232 | all_boxes = tf.concat(all_boxes, axis=0) 233 | all_scores = tf.concat(all_scores, axis=0) 234 | proposal_boxes, proposal_scores = generate_rpn_proposals( 235 | all_boxes, all_scores, image_shape2d, 236 | cfg.RPN.TRAIN_PRE_NMS_TOPK if training else cfg.RPN.TEST_PRE_NMS_TOPK, 237 | cfg.RPN.TRAIN_POST_NMS_TOPK if training else cfg.RPN.TEST_POST_NMS_TOPK) 238 | 239 | tf.sigmoid(proposal_scores, name='probs') # for visualization 240 | return tf.stop_gradient(proposal_boxes, name='boxes'), \ 241 | tf.stop_gradient(proposal_scores, name='scores') 242 | -------------------------------------------------------------------------------- /model_cascade.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tensorpack.tfutils import get_current_tower_context 4 | from tensorpack.tfutils.summary import add_moving_summary 5 | 6 | from config import config as cfg 7 | from model_box import clip_boxes 8 | from model_frcnn import BoxProposals, FastRCNNHead, fastrcnn_outputs 9 | from utils.box_ops import pairwise_iou 10 | 11 | 12 | class CascadeRCNNHead(object): 13 | def __init__(self, proposals, 14 | roi_func, fastrcnn_head_func, gt_targets, image_shape2d, num_classes): 15 | """ 16 | Args: 17 | proposals: BoxProposals 18 | roi_func (boxes -> features): a function to crop features with rois 19 | fastrcnn_head_func (features -> features): the fastrcnn head to apply on the cropped features 20 | gt_targets (gt_boxes, gt_labels): 21 | """ 22 | for k, v in locals().items(): 23 | if k != 'self': 24 | setattr(self, k, v) 25 | self.gt_boxes, self.gt_labels = gt_targets 26 | del self.gt_targets 27 | 28 | self.num_cascade_stages = len(cfg.CASCADE.IOUS) 29 | 30 | self.is_training = get_current_tower_context().is_training 31 | if self.is_training: 32 | @tf.custom_gradient 33 | def scale_gradient(x): 34 | return x, lambda dy: dy * (1.0 / self.num_cascade_stages) 35 | self.scale_gradient = scale_gradient 36 | else: 37 | self.scale_gradient = tf.identity 38 | 39 | ious = cfg.CASCADE.IOUS 40 | # It's unclear how to do >3 stages, so it does not make sense to implement them 41 | assert self.num_cascade_stages == 3, "Only 3-stage cascade was implemented!" 
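# Rough wiring of the three stages (cfg.CASCADE.IOUS = [0.5, 0.6, 0.7]):
#   stage 1 scores/regresses the sampled RPN proposals (matched at IoU 0.5),
#   its refined boxes B1 are re-matched to the GT at IoU 0.6 and fed to stage 2,
#   and B2 is re-matched at IoU 0.7 for stage 3.
# scale_gradient divides the gradient flowing into the shared RoI features by 3,
# so no stage dominates the backbone update.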
42 | with tf.variable_scope('cascade_rcnn_stage1'): 43 | H1, B1 = self.run_head(self.proposals, 0) 44 | 45 | with tf.variable_scope('cascade_rcnn_stage2'): 46 | B1_proposal = self.match_box_with_gt(B1, ious[1]) 47 | H2, B2 = self.run_head(B1_proposal, 1) 48 | 49 | with tf.variable_scope('cascade_rcnn_stage3'): 50 | B2_proposal = self.match_box_with_gt(B2, ious[2]) 51 | H3, B3 = self.run_head(B2_proposal, 2) 52 | self._cascade_boxes = [B1, B2, B3] 53 | self._heads = [H1, H2, H3] 54 | 55 | def run_head(self, proposals, stage): 56 | """ 57 | Args: 58 | proposals: BoxProposals 59 | stage: 0, 1, 2 60 | 61 | Returns: 62 | FastRCNNHead 63 | Nx4, updated boxes 64 | """ 65 | reg_weights = tf.constant(cfg.CASCADE.BBOX_REG_WEIGHTS[stage], dtype=tf.float32) 66 | pooled_feature = self.roi_func(proposals.boxes) # N,C,S,S 67 | pooled_feature = self.scale_gradient(pooled_feature) 68 | head_feature = self.fastrcnn_head_func('head', pooled_feature) 69 | # changed by Paul 70 | label_logits, box_logits = fastrcnn_outputs( 71 | 'outputs_new', head_feature, self.num_classes, class_agnostic_regression=True) 72 | head = FastRCNNHead(proposals, box_logits, label_logits, self.gt_boxes, reg_weights) 73 | 74 | refined_boxes = head.decoded_output_boxes_class_agnostic() 75 | refined_boxes = clip_boxes(refined_boxes, self.image_shape2d) 76 | return head, tf.stop_gradient(refined_boxes, name='output_boxes') 77 | 78 | def match_box_with_gt(self, boxes, iou_threshold): 79 | """ 80 | Args: 81 | boxes: Nx4 82 | Returns: 83 | BoxProposals 84 | """ 85 | if self.is_training: 86 | with tf.name_scope('match_box_with_gt_{}'.format(iou_threshold)): 87 | iou = pairwise_iou(boxes, self.gt_boxes) # NxM 88 | max_iou_per_box = tf.reduce_max(iou, axis=1) # N 89 | best_iou_ind = tf.argmax(iou, axis=1) # N 90 | labels_per_box = tf.gather(self.gt_labels, best_iou_ind) 91 | fg_mask = max_iou_per_box >= iou_threshold 92 | fg_inds_wrt_gt = tf.boolean_mask(best_iou_ind, fg_mask) 93 | labels_per_box = tf.stop_gradient(labels_per_box * tf.cast(fg_mask, tf.int64)) 94 | return BoxProposals(boxes, labels_per_box, fg_inds_wrt_gt) 95 | else: 96 | return BoxProposals(boxes) 97 | 98 | def losses(self): 99 | ret = [] 100 | for idx, head in enumerate(self._heads): 101 | with tf.name_scope('cascade_loss_stage{}'.format(idx + 1)): 102 | ret.extend(head.losses()) 103 | return ret 104 | 105 | def decoded_output_boxes(self): 106 | """ 107 | Returns: 108 | Nx#classx4 109 | """ 110 | ret = self._cascade_boxes[-1] 111 | ret = tf.expand_dims(ret, 1) # class-agnostic 112 | return tf.tile(ret, [1, self.num_classes, 1]) 113 | 114 | def output_scores(self, name=None): 115 | """ 116 | Returns: 117 | Nx#class 118 | """ 119 | scores = [head.output_scores('cascade_scores_stage{}'.format(idx + 1)) 120 | for idx, head in enumerate(self._heads)] 121 | return tf.multiply(tf.add_n(scores), (1.0 / self.num_cascade_stages), name=name) 122 | 123 | 124 | class CascadeRCNNHeadWithHardExamples(CascadeRCNNHead): 125 | def __init__(self, proposals, roi_func, fastrcnn_head_func, gt_targets, image_shape2d, num_classes, 126 | hard_negative_features, hard_positive_features, hard_negative_loss_scaling_factor, 127 | hard_positive_loss_scaling_factor, hard_positive_ious, hard_positive_gt_boxes, 128 | hard_positive_jitter_boxes): 129 | super().__init__(proposals, roi_func, fastrcnn_head_func, gt_targets, image_shape2d, num_classes) 130 | self._hard_negative_features = hard_negative_features 131 | self._hard_positive_features = hard_positive_features 132 | 
self._hard_negative_loss_scaling_factor = hard_negative_loss_scaling_factor 133 | self._hard_positive_loss_scaling_factor = hard_positive_loss_scaling_factor 134 | self._hard_positive_ious = hard_positive_ious 135 | self._hard_positive_gt_boxes = hard_positive_gt_boxes 136 | self._hard_positive_jitter_boxes = hard_positive_jitter_boxes 137 | 138 | def _hard_losses(self, negative=True): 139 | if negative: 140 | hard_features = self._hard_negative_features 141 | desc = "neg" 142 | else: 143 | hard_features = self._hard_positive_features 144 | desc = "pos" 145 | losses = [] 146 | for cascade_idx, iou_thres in enumerate(cfg.CASCADE.IOUS): 147 | with tf.name_scope('cascade_loss_{}_stage{}'.format(desc, cascade_idx + 1)): 148 | with tf.variable_scope('cascade_rcnn_stage' + str(cascade_idx + 1), reuse=True): 149 | pooled_feature = self.roi_func(None, hard_features[:, cascade_idx]) 150 | pooled_feature = self.scale_gradient(pooled_feature) 151 | head_feature = self.fastrcnn_head_func('head', pooled_feature) 152 | # changed by Paul 153 | label_logits, box_logits = fastrcnn_outputs( 154 | 'outputs_new', head_feature, self.num_classes, class_agnostic_regression=True) 155 | mean_label = None 156 | box_loss = None 157 | if negative: 158 | labels = tf.zeros((tf.shape(label_logits)[0],), dtype=tf.int64) 159 | else: 160 | labels = tf.cast(tf.greater_equal(self._hard_positive_ious[:, cascade_idx], iou_thres), 161 | tf.int64) 162 | mean_label = tf.reduce_mean(tf.cast(labels, tf.float32), 163 | name='hard_{}_label_mean{}'.format(desc, cascade_idx + 1)) 164 | if cfg.USE_REGRESSION_LOSS_ON_HARD_POSITIVES: 165 | labels_bool = tf.cast(labels, tf.bool) 166 | valid = tf.reduce_any(labels_bool) 167 | 168 | def make_box_loss(): 169 | gt_boxes = tf.boolean_mask(self._hard_positive_gt_boxes, labels_bool) 170 | inp_boxes = tf.boolean_mask(self._hard_positive_jitter_boxes[:, cascade_idx], 171 | labels_bool) 172 | box_logits_masked = tf.boolean_mask(box_logits, labels_bool) 173 | from examples.FasterRCNN.model_box import encode_bbox_target 174 | reg_targets = encode_bbox_target(gt_boxes, 175 | inp_boxes) * cfg.CASCADE.BBOX_REG_WEIGHTS[cascade_idx] 176 | _box_loss = tf.losses.huber_loss( 177 | reg_targets, tf.squeeze(box_logits_masked, axis=1), 178 | reduction=tf.losses.Reduction.SUM) 179 | _box_loss = tf.truediv( 180 | _box_loss, tf.cast(tf.shape(reg_targets)[0], tf.float32)) 181 | return _box_loss 182 | 183 | box_loss = tf.cond(valid, make_box_loss, lambda: tf.constant(0, dtype=tf.float32)) 184 | box_loss = tf.multiply(box_loss, cfg.HARD_POSITIVE_BOX_LOSS_SCALING_FACTOR, 185 | name='hard_{}_box_loss{}'.format(desc, cascade_idx + 1)) 186 | losses.append(box_loss) 187 | label_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( 188 | labels=labels, logits=label_logits) 189 | if negative: 190 | label_loss *= self._hard_negative_loss_scaling_factor 191 | else: 192 | label_loss *= self._hard_positive_loss_scaling_factor 193 | label_loss = tf.reduce_mean(label_loss, name='hard_{}_label_loss{}'.format(desc, cascade_idx + 1)) 194 | prediction = tf.argmax(label_logits, axis=1, name='label_prediction_hard_{}'.format(desc)) 195 | correct = tf.cast(tf.equal(prediction, labels), tf.float32) 196 | accuracy = tf.reduce_mean(correct, name='hard_{}_label_accuracy{}'.format(desc, cascade_idx + 1)) 197 | losses.append(label_loss) 198 | if mean_label is not None: 199 | add_moving_summary(mean_label) 200 | if box_loss is not None: 201 | add_moving_summary(box_loss) 202 | add_moving_summary(accuracy) 203 | add_moving_summary(label_loss) 
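# To recap the labels used above: for hard negatives every box is labelled
# background (0); for hard positives the label is 1 only if the stored IoU of
# the jittered box with its GT exceeds this stage's cascade threshold, and,
# when cfg.USE_REGRESSION_LOSS_ON_HARD_POSITIVES is set, an additional Huber
# loss regresses those positives onto their GT boxes. The list returned below
# therefore holds the scaled per-stage classification losses plus any box losses.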
204 | return losses 205 | 206 | def losses(self): 207 | normal_losses = super().losses() 208 | if self.is_training: 209 | hnl = self._hard_losses(negative=True) 210 | if self._hard_positive_features is not None: 211 | hpl = self._hard_losses(negative=False) 212 | else: 213 | hpl = [] 214 | return normal_losses + hnl + hpl 215 | else: 216 | return normal_losses 217 | -------------------------------------------------------------------------------- /eval_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: eval.py 3 | 4 | import itertools 5 | import random 6 | import sys 7 | import os 8 | import json 9 | import PIL 10 | import numpy as np 11 | import glob 12 | from collections import namedtuple 13 | from concurrent.futures import ThreadPoolExecutor 14 | from contextlib import ExitStack 15 | import cv2 16 | import pycocotools.mask as cocomask 17 | import tqdm 18 | import tensorflow as tf 19 | import xmltodict 20 | 21 | from tensorpack.callbacks import Callback 22 | from tensorpack.tfutils.common import get_tf_version_tuple 23 | from tensorpack.utils import logger 24 | from tensorpack.utils.utils import get_tqdm 25 | 26 | from common import CustomResize, clip_boxes, box_to_point8, point8_to_box 27 | from data import get_eval_dataflow 28 | from dataset import DetectionDataset 29 | from config import config as cfg 30 | 31 | try: 32 | import horovod.tensorflow as hvd 33 | except ImportError: 34 | pass 35 | 36 | 37 | DetectionResult = namedtuple( 38 | 'DetectionResult', 39 | ['box', 'score', 'class_id', 'mask']) 40 | """ 41 | box: 4 float 42 | score: float 43 | class_id: int, 1~NUM_CLASS 44 | mask: None, or a binary image of the original image shape 45 | """ 46 | 47 | 48 | def _paste_mask(box, mask, shape): 49 | """ 50 | Args: 51 | box: 4 float 52 | mask: MxM floats 53 | shape: h,w 54 | Returns: 55 | A uint8 binary image of hxw. 56 | """ 57 | # int() is floor 58 | # box fpcoor=0.0 -> intcoor=0.0 59 | x0, y0 = list(map(int, box[:2] + 0.5)) 60 | # box fpcoor=h -> intcoor=h-1, inclusive 61 | x1, y1 = list(map(int, box[2:] - 0.5)) # inclusive 62 | x1 = max(x0, x1) # require at least 1x1 63 | y1 = max(y0, y1) 64 | 65 | w = x1 + 1 - x0 66 | h = y1 + 1 - y0 67 | 68 | # rounding errors could happen here, because masks were not originally computed for this shape. 69 | # but it's hard to do better, because the network does not know the "original" scale 70 | mask = (cv2.resize(mask, (w, h)) > 0.5).astype('uint8') 71 | ret = np.zeros(shape, dtype='uint8') 72 | ret[y0:y1 + 1, x0:x1 + 1] = mask 73 | return ret 74 | 75 | 76 | def predict_image(img, model_func): 77 | """ 78 | Run detection on one image, using the TF callable. 79 | This function should handle the preprocessing internally. 80 | 81 | Args: 82 | img: an image 83 | model_func: a callable from the TF model. 84 | It takes image and returns (boxes, probs, labels, [masks]) 85 | 86 | Returns: 87 | [DetectionResult] 88 | """ 89 | 90 | orig_shape = img.shape[:2] 91 | resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE) 92 | resized_img = resizer.augment(img) 93 | scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] * resized_img.shape[1] / img.shape[1]) 94 | boxes, probs, labels, *masks = model_func(resized_img) 95 | boxes = boxes / scale 96 | # boxes are already clipped inside the graph, but after the floating point scaling, this may not be true any more. 
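# Example of the scale factor above: a 480x640 frame resized so the short edge
# becomes 800 ends up at roughly 800x1067, giving scale ~= sqrt(800/480 * 1067/640)
# ~= 1.67; dividing by it maps the boxes back to the original resolution, and
# clip_boxes below removes the few pixels of overshoot this can introduce.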
97 | boxes = clip_boxes(boxes, orig_shape) 98 | 99 | if masks: 100 | # has mask 101 | full_masks = [_paste_mask(box, mask, orig_shape) 102 | for box, mask in zip(boxes, masks[0])] 103 | masks = full_masks 104 | else: 105 | # fill with none 106 | masks = [None] * len(boxes) 107 | 108 | results = [DetectionResult(*args) for args in zip(boxes, probs, labels, masks)] 109 | return results 110 | 111 | 112 | def predict_image_track_with_precomputed_ref_features(img, ref_features, model_func): 113 | orig_shape = img.shape[:2] 114 | resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE) 115 | resized_img = resizer.augment(img) 116 | scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] * resized_img.shape[1] / img.shape[1]) 117 | boxes, probs, labels, *masks = model_func(resized_img, ref_features) 118 | boxes = boxes / scale 119 | # boxes are already clipped inside the graph, but after the floating point scaling, this may not be true any more. 120 | boxes = clip_boxes(boxes, orig_shape) 121 | 122 | if masks: 123 | # has mask 124 | full_masks = [_paste_mask(box, mask, orig_shape) 125 | for box, mask in zip(boxes, masks[0])] 126 | masks = full_masks 127 | else: 128 | # fill with none 129 | masks = [None] * len(boxes) 130 | 131 | results = [DetectionResult(*args) for args in zip(boxes, probs, labels, masks)] 132 | return results 133 | 134 | 135 | def predict_image_track(img, ref_img, ref_bbox, model_func): 136 | """ 137 | Run detection on one image, using the TF callable. 138 | This function should handle the preprocessing internally. 139 | 140 | Args: 141 | img: an image 142 | model_func: a callable from the TF model. 143 | It takes image and returns (boxes, probs, labels, [masks]) 144 | 145 | Returns: 146 | [DetectionResult] 147 | """ 148 | 149 | orig_shape = img.shape[:2] 150 | resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE) 151 | resized_img = resizer.augment(img) 152 | resized_ref_img, params = resizer.augment_return_params(ref_img) 153 | 154 | ref_points = box_to_point8(ref_bbox[np.newaxis]) 155 | ref_points = resizer.augment_coords(ref_points, params) 156 | resized_ref_boxes = point8_to_box(ref_points) 157 | resized_ref_bbox = resized_ref_boxes[0] 158 | 159 | scale = np.sqrt(resized_img.shape[0] * 1.0 / img.shape[0] * resized_img.shape[1] / img.shape[1]) 160 | boxes, probs, labels, *masks = model_func(resized_img, resized_ref_img, resized_ref_bbox) 161 | boxes = boxes / scale 162 | # boxes are already clipped inside the graph, but after the floating point scaling, this may not be true any more. 163 | boxes = clip_boxes(boxes, orig_shape) 164 | 165 | if masks: 166 | # has mask 167 | full_masks = [_paste_mask(box, mask, orig_shape) 168 | for box, mask in zip(boxes, masks[0])] 169 | masks = full_masks 170 | else: 171 | # fill with none 172 | masks = [None] * len(boxes) 173 | 174 | results = [DetectionResult(*args) for args in zip(boxes, probs, labels, masks)] 175 | return results 176 | 177 | 178 | def predict_dataflow(df, model_func, tqdm_bar=None): 179 | """ 180 | Args: 181 | df: a DataFlow which produces (image, image_id) 182 | model_func: a callable from the TF model. 183 | It takes image and returns (boxes, probs, labels, [masks]) 184 | tqdm_bar: a tqdm object to be shared among multiple evaluation instances. If None, 185 | will create a new one. 
186 | 187 | Returns: 188 | list of dict, in the format used by 189 | `DetectionDataset.eval_or_save_inference_results` 190 | """ 191 | df.reset_state() 192 | all_results = [] 193 | with ExitStack() as stack: 194 | # tqdm is not quite thread-safe: https://github.com/tqdm/tqdm/issues/323 195 | if tqdm_bar is None: 196 | tqdm_bar = stack.enter_context(get_tqdm(total=df.size())) 197 | for ref_img, ref_bbox, target_img, target_bbox, gt_file in df: 198 | results = predict_image_track(target_img, ref_img, ref_bbox, model_func) 199 | all_results.append((gt_file, results, target_bbox)) 200 | tqdm_bar.update(1) 201 | return all_results 202 | 203 | 204 | def multithread_predict_dataflow(dataflows, model_funcs): 205 | """ 206 | Running multiple `predict_dataflow` in multiple threads, and aggregate the results. 207 | 208 | Args: 209 | dataflows: a list of DataFlow to be used in :func:`predict_dataflow` 210 | model_funcs: a list of callable to be used in :func:`predict_dataflow` 211 | 212 | Returns: 213 | list of dict, in the format used by 214 | `DetectionDataset.eval_or_save_inference_results` 215 | """ 216 | num_worker = len(model_funcs) 217 | assert len(dataflows) == num_worker 218 | if num_worker == 1: 219 | return predict_dataflow(dataflows[0], model_funcs[0]) 220 | kwargs = {'thread_name_prefix': 'EvalWorker'} if sys.version_info.minor >= 6 else {} 221 | with ThreadPoolExecutor(max_workers=num_worker, **kwargs) as executor, \ 222 | tqdm.tqdm(total=sum([df.size() for df in dataflows])) as pbar: 223 | futures = [] 224 | for dataflow, pred in zip(dataflows, model_funcs): 225 | futures.append(executor.submit(predict_dataflow, dataflow, pred, pbar)) 226 | all_results = list(itertools.chain(*[fut.result() for fut in futures])) 227 | return all_results 228 | 229 | 230 | class EvalCallback(Callback): 231 | """ 232 | A callback that runs evaluation once a while. 233 | It supports multi-gpu evaluation. 234 | """ 235 | 236 | _chief_only = False 237 | 238 | def __init__(self, eval_dataset, in_names, out_names, output_dir): 239 | self._eval_dataset = eval_dataset 240 | self._in_names, self._out_names = in_names, out_names 241 | self._output_dir = output_dir 242 | 243 | def _setup_graph(self): 244 | num_gpu = cfg.TRAIN.NUM_GPUS 245 | if cfg.TRAINER == 'replicated': 246 | # TF bug in version 1.11, 1.12: https://github.com/tensorflow/tensorflow/issues/22750 247 | buggy_tf = get_tf_version_tuple() in [(1, 11), (1, 12)] 248 | 249 | # Use two predictor threads per GPU to get better throughput 250 | self.num_predictor = num_gpu if buggy_tf else num_gpu * 2 251 | self.predictors = [self._build_predictor(k % num_gpu) for k in range(self.num_predictor)] 252 | self.dataflows = [get_eval_dataflow(self._eval_dataset, 253 | shard=k, num_shards=self.num_predictor) 254 | for k in range(self.num_predictor)] 255 | else: 256 | # Only eval on the first machine. 
257 | # Alternatively, can eval on all ranks and use allgather, but allgather sometimes hangs 258 | self._horovod_run_eval = hvd.rank() == hvd.local_rank() 259 | if self._horovod_run_eval: 260 | self.predictor = self._build_predictor(0) 261 | self.dataflow = get_eval_dataflow(self._eval_dataset, 262 | shard=hvd.local_rank(), num_shards=hvd.local_size()) 263 | 264 | self.barrier = hvd.allreduce(tf.random_normal(shape=[1])) 265 | 266 | def _build_predictor(self, idx): 267 | return self.trainer.get_predictor(self._in_names, self._out_names, device=idx) 268 | 269 | def _before_train(self): 270 | eval_period = cfg.TRAIN.EVAL_PERIOD 271 | self.epochs_to_eval = set() 272 | for k in itertools.count(1): 273 | if k * eval_period > self.trainer.max_epoch: 274 | break 275 | self.epochs_to_eval.add(k * eval_period) 276 | self.epochs_to_eval.add(self.trainer.max_epoch) 277 | logger.info("[EvalCallback] Will evaluate every {} epochs".format(eval_period)) 278 | 279 | def _eval(self): 280 | logdir = self._output_dir 281 | if cfg.TRAINER == 'replicated': 282 | all_results = multithread_predict_dataflow(self.dataflows, self.predictors) 283 | else: 284 | filenames = [os.path.join( 285 | logdir, 'outputs{}-part{}.json'.format(self.global_step, rank) 286 | ) for rank in range(hvd.local_size())] 287 | 288 | if self._horovod_run_eval: 289 | local_results = predict_dataflow(self.dataflow, self.predictor) 290 | fname = filenames[hvd.local_rank()] 291 | with open(fname, 'w') as f: 292 | json.dump(local_results, f) 293 | self.barrier.eval() 294 | if hvd.rank() > 0: 295 | return 296 | all_results = [] 297 | for fname in filenames: 298 | with open(fname, 'r') as f: 299 | obj = json.load(f) 300 | all_results.extend(obj) 301 | os.unlink(fname) 302 | 303 | output_file = os.path.join( 304 | logdir, '{}-outputs{}.json'.format(self._eval_dataset, self.global_step)) 305 | 306 | scores = DetectionDataset().eval_or_save_inference_results( 307 | all_results, self._eval_dataset, output_file) 308 | for k, v in scores.items(): 309 | self.trainer.monitors.put_scalar(k, v) 310 | 311 | def _trigger_epoch(self): 312 | if self.epoch_num in self.epochs_to_eval: 313 | logger.info("Running evaluation ...") 314 | self._eval() 315 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: config.py 3 | 4 | import numpy as np 5 | import os 6 | import six 7 | import pprint 8 | 9 | from tensorpack.utils import logger 10 | from tensorpack.utils.gpu import get_num_gpu 11 | 12 | __all__ = ['config', 'finalize_configs'] 13 | 14 | 15 | class AttrDict(): 16 | 17 | _freezed = False 18 | """ Avoid accidental creation of new hierarchies. """ 19 | 20 | def __getattr__(self, name): 21 | if self._freezed: 22 | raise AttributeError(name) 23 | ret = AttrDict() 24 | setattr(self, name, ret) 25 | return ret 26 | 27 | def __setattr__(self, name, value): 28 | if self._freezed and name not in self.__dict__: 29 | raise AttributeError( 30 | "Config was freezed! Unknown config: {}".format(name)) 31 | super().__setattr__(name, value) 32 | 33 | def __str__(self): 34 | return pprint.pformat(self.to_dict(), indent=1, width=100, compact=True) 35 | 36 | __repr__ = __str__ 37 | 38 | def to_dict(self): 39 | """Convert to a nested dict. 
""" 40 | return {k: v.to_dict() if isinstance(v, AttrDict) else v 41 | for k, v in self.__dict__.items() if not k.startswith('_')} 42 | 43 | def update_args(self, args): 44 | """Update from command line args. """ 45 | for cfg in args: 46 | keys, v = cfg.split('=', maxsplit=1) 47 | keylist = keys.split('.') 48 | 49 | dic = self 50 | for i, k in enumerate(keylist[:-1]): 51 | assert k in dir(dic), "Unknown config key: {}".format(keys) 52 | dic = getattr(dic, k) 53 | key = keylist[-1] 54 | 55 | oldv = getattr(dic, key) 56 | if not isinstance(oldv, str): 57 | v = eval(v) 58 | setattr(dic, key, v) 59 | 60 | def freeze(self, freezed=True): 61 | self._freezed = freezed 62 | for v in self.__dict__.values(): 63 | if isinstance(v, AttrDict): 64 | v.freeze(freezed) 65 | 66 | # avoid silent bugs 67 | def __eq__(self, _): 68 | raise NotImplementedError() 69 | 70 | def __ne__(self, _): 71 | raise NotImplementedError() 72 | 73 | 74 | config = AttrDict() 75 | _C = config # short alias to avoid coding 76 | 77 | # paths to datasets! changes these! 78 | _C.DATA.IMAGENET_VID_ROOT = "/globalwork/data/ILSVRC_VID/ILSVRC/" 79 | _C.DATA.GOT10K_ROOT = "/globalwork/data/GOT10k/" 80 | _C.DATA.LASOT_ROOT = "/globalwork/data/LaSOTBenchmark/" 81 | _C.DATA.YOUTUBE_VOS_ROOT = "/globalwork/data/youtube-vos/" 82 | _C.DATA.DAVIS2017_ROOT = "/globalwork/data/DAVIS2017/" 83 | _C.DATA.YOUTUBE_BB_ROOT = "/globalwork/data/youtube-bb/yt_bb_detection_train/" 84 | _C.DATA.TRACKINGNET_ROOT = "/globalwork/data/TrackingNet/" 85 | _C.HARD_MINING_DATA_PATH = "/globalwork/data/hard_example_mining_index/" 86 | 87 | _C.DATA.IMAGENET_VID = True 88 | _C.DATA.GOT10K = True 89 | _C.DATA.LASOT = True 90 | _C.DATA.YOUTUBE_VOS = True 91 | _C.DATA.YOUTUBE_BB = False 92 | _C.DATA.DAVIS2017 = False 93 | _C.DATA.TRACKINGNET = False 94 | 95 | # mode flags --------------------- 96 | _C.TRAINER = 'replicated' # options: 'horovod', 'replicated' 97 | _C.MODE_MASK = False # FasterRCNN or MaskRCNN 98 | _C.MODE_FPN = True 99 | _C.MODE_TRACK = True 100 | _C.TRACK_VIDEO_ID = None 101 | 102 | # new flags by us 103 | _C.MODE_SHARED_CONV_REDUCE = False 104 | _C.USE_PRECOMPUTED_REF_FEATURES = False 105 | _C.EXTRACT_GT_FEATURES = False 106 | _C.MODE_THIRD_STAGE = False 107 | _C.EXTEND_PROPOSALS_BY_ACTIVE_TRACKLETS = True 108 | 109 | # hard mining stuff 110 | _C.MODE_HARD_MINING = False 111 | _C.MODE_IF_HARD_MINING_THEN_ALSO_POSITIVES = True 112 | _C.MODE_HARD_NEGATIVES_ONLY_CROSSOVER = False 113 | _C.MODE_HARD_NEGATIVES_ONLY_CROSSOVER_YOUTUBEVOS = False 114 | _C.USE_REGRESSION_LOSS_ON_HARD_POSITIVES = False 115 | _C.HARD_NEGATIVE_LOSS_SCALING_FACTOR = 0.12 116 | _C.HARD_POSITIVE_LOSS_SCALING_FACTOR = 0.1 117 | _C.HARD_POSITIVE_BOX_LOSS_SCALING_FACTOR = 0.1 118 | _C.N_HARD_NEGATIVES_TO_SAMPLE = 100 119 | _C.N_HARD_POS_TO_SAMPLE = 30 120 | _C.HARD_MINING_KNN = 10000 121 | _C.HARD_MINING_KNN_LASOT = 50000 122 | 123 | _C.FORWARD_VIDEO_RANGE_START = None 124 | _C.FORWARD_VIDEO_RANGE_END = None 125 | 126 | # might lead to minor slowdown, but gives useful information 127 | _C.MEASURE_IOU_DURING_TRAINING = True 128 | 129 | # dataset ----------------------- 130 | _C.DATA.BASEDIR = '/path/to/your/DATA/DIR' 131 | # All TRAIN dataset will be concatenated for training. 132 | _C.DATA.TRAIN = ['train2014', 'valminusminival2014'] # i.e. 
trainval35k, AKA train2017 133 | # Each VAL dataset will be evaluated separately (instead of concatenated) 134 | #_C.DATA.VAL = ('minival2014', ) # AKA val2017 135 | _C.DATA.VAL = () 136 | # This two config will be populated later by the dataset loader: 137 | _C.DATA.NUM_CATEGORY = 1 # without the background class (e.g., 80 for COCO) 138 | _C.DATA.CLASS_NAMES = [] # NUM_CLASS (NUM_CATEGORY+1) strings, the first is "BG". 139 | 140 | _C.DATA.DEBUG_VIS = False 141 | _C.DATA.MULTITHREAD = True 142 | 143 | _C.DATA.GRAYSCALE_AUGMENTATIONS = True 144 | _C.DATA.MOTION_BLUR_AUGMENTATIONS = True 145 | 146 | # basemodel ---------------------- 147 | _C.BACKBONE.WEIGHTS = '' # /path/to/weights.npz 148 | _C.BACKBONE.RESNET_NUM_BLOCKS = [3, 4, 23, 3] # for resnet50 149 | # RESNET_NUM_BLOCKS = [3, 4, 23, 3] # for resnet101 150 | _C.BACKBONE.FREEZE_AFFINE = False # do not train affine parameters inside norm layers 151 | _C.BACKBONE.NORM = 'GN' # options: FreezeBN, SyncBN, GN, None 152 | _C.BACKBONE.FREEZE_AT = 4 # options: 0, 1, 2 153 | 154 | # Use a base model with TF-preferred padding mode, 155 | # which may pad more pixels on right/bottom than top/left. 156 | # See https://github.com/tensorflow/tensorflow/issues/18213 157 | # In tensorpack model zoo, ResNet models with TF_PAD_MODE=False are marked with "-AlignPadding". 158 | # All other models under `ResNet/` in the model zoo are using TF_PAD_MODE=True. 159 | # Using either one should probably give the same performance. 160 | # We use the "AlignPadding" one just to be consistent with caffe2. 161 | _C.BACKBONE.TF_PAD_MODE = False 162 | _C.BACKBONE.STRIDE_1X1 = False # True for MSRA models 163 | 164 | # schedule ----------------------- 165 | _C.TRAIN.NUM_GPUS = None # by default, will be set from code 166 | _C.TRAIN.WEIGHT_DECAY = 1e-4 167 | _C.TRAIN.BASE_LR = 1e-2 # defined for total batch size=8. Otherwise it will be adjusted automatically 168 | _C.TRAIN.WARMUP = 1000 # in terms of iterations. This is not affected by #GPUs 169 | _C.TRAIN.WARMUP_INIT_LR = 1e-2 * 0.33 # defined for total batch size=8. Otherwise it will be adjusted automatically 170 | _C.TRAIN.STEPS_PER_EPOCH = 500 171 | _C.TRAIN.STARTING_EPOCH = 1 # the first epoch to start with, useful to continue a training 172 | _C.TRAIN.MAX_NUM_EPOCHS = 1000000000000 173 | 174 | # LR_SCHEDULE means equivalent steps when the total batch size is 8. 175 | # When the total bs!=8, the actual iterations to decrease learning rate, and 176 | # the base learning rate are computed from BASE_LR and LR_SCHEDULE. 177 | # Therefore, there is *no need* to modify the config if you only change the number of GPUs. 
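# For example, assuming the usual linear-scaling rule is applied by the training
# script: with 4 GPUs (total batch size 4) the effective base LR would be halved
# to 5e-3 and every step in LR_SCHEDULE doubled, so the run still sees the same
# amount of data before each decay.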
178 | 179 | # _C.TRAIN.LR_SCHEDULE = [120000, 160000, 180000 ] # "1x" schedule in detectron 180 | #_C.TRAIN.LR_SCHEDULE = [240000, 320000, 360000] # "2x" schedule in detectron 181 | # Longer schedules for from-scratch training (https://arxiv.org/abs/1811.08883): 182 | # _C.TRAIN.LR_SCHEDULE = [960000, 1040000, 1080000] # "6x" schedule in detectron 183 | # _C.TRAIN.LR_SCHEDULE = [1500000, 1580000, 1620000] # "9x" schedule in detectron 184 | #_C.TRAIN.LR_SCHEDULE = [1500000, 1580000, 1620000] 185 | _C.TRAIN.LR_SCHEDULE = [250000, 280000, 300000] # for main training, afterwards we can do hard example training 186 | _C.TRAIN.EVAL_PERIOD = 20 # period (epochs) to run evaluation 187 | 188 | # preprocessing -------------------- 189 | # Alternative old (worse & faster) setting: 600 190 | _C.PREPROC.TRAIN_SHORT_EDGE_SIZE = [640, 800] # [min, max] to sample from 191 | _C.PREPROC.TEST_SHORT_EDGE_SIZE = 800 192 | _C.PREPROC.MAX_SIZE = 1333 193 | # mean and std in RGB order. 194 | # Un-scaled version: [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] 195 | _C.PREPROC.PIXEL_MEAN = [123.675, 116.28, 103.53] 196 | _C.PREPROC.PIXEL_STD = [58.395, 57.12, 57.375] 197 | 198 | # anchors ------------------------- 199 | _C.RPN.ANCHOR_STRIDE = 16 200 | _C.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512) # sqrtarea of the anchor box 201 | _C.RPN.ANCHOR_RATIOS = (0.5, 1., 2.) 202 | _C.RPN.POSITIVE_ANCHOR_THRESH = 0.7 203 | _C.RPN.NEGATIVE_ANCHOR_THRESH = 0.3 204 | 205 | # rpn training ------------------------- 206 | _C.RPN.FG_RATIO = 0.5 # fg ratio among selected RPN anchors 207 | _C.RPN.BATCH_PER_IM = 256 # total (across FPN levels) number of anchors that are marked valid 208 | _C.RPN.MIN_SIZE = 0 209 | _C.RPN.PROPOSAL_NMS_THRESH = 0.7 210 | # Anchors which overlap with a crowd box (IOA larger than threshold) will be ignored. 211 | # Setting this to a value larger than 1.0 will disable the feature. 212 | # It is disabled by default because Detectron does not do this. 213 | _C.RPN.CROWD_OVERLAP_THRESH = 9.99 214 | _C.RPN.HEAD_DIM = 1024 # used in C4 only 215 | 216 | # RPN proposal selection ------------------------------- 217 | # for C4 218 | _C.RPN.TRAIN_PRE_NMS_TOPK = 12000 219 | _C.RPN.TRAIN_POST_NMS_TOPK = 2000 220 | _C.RPN.TEST_PRE_NMS_TOPK = 6000 221 | _C.RPN.TEST_POST_NMS_TOPK = 1000 # if you encounter OOM in inference, set this to a smaller number 222 | # for FPN, #proposals per-level and #proposals after merging are (for now) the same 223 | # if FPN.PROPOSAL_MODE = 'Joint', these options have no effect 224 | _C.RPN.TRAIN_PER_LEVEL_NMS_TOPK = 2000 225 | #_C.RPN.TEST_PER_LEVEL_NMS_TOPK = 1000 226 | # seems we need a lot of proposals for tracking with fixed RPN. Note that this makes it quite slow! 227 | # to prevent OOM let's do 8k for validating during training 228 | # for actual forwarding, we could do 15k 229 | _C.RPN.TEST_PER_LEVEL_NMS_TOPK = 1000 230 | _C.RPN.TEST_ALTERNATIVE_ANCHOR_SAMPLING = False 231 | 232 | # fastrcnn training --------------------- 233 | _C.FRCNN.BATCH_PER_IM = 512 234 | _C.FRCNN.BBOX_REG_WEIGHTS = [10., 10., 5., 5.] # Better but non-standard setting: [20, 20, 10, 10] 235 | _C.FRCNN.FG_THRESH = 0.5 236 | _C.FRCNN.FG_RATIO = 0.25 # fg ratio in a ROI batch 237 | 238 | _C.FRCNN.USE_FOCAL_LOSS = False 239 | _C.FRCNN.FG_LOSS_WEIGHTING_FACTOR = 3.0 240 | _C.FRCNN.BOX_LOSS_WEIGHTING_FACTOR = 1.0 241 | 242 | # FPN ------------------------- 243 | _C.FPN.ANCHOR_STRIDES = (4, 8, 16, 32, 64) # strides for each FPN level. 
Must be the same length as ANCHOR_SIZES 244 | _C.FPN.PROPOSAL_MODE = 'Level' # 'Level', 'Joint' 245 | _C.FPN.NUM_CHANNEL = 256 246 | _C.FPN.NORM = 'GN' # 'None', 'GN' 247 | # The head option is only used in FPN. For C4 models, the head is C5 248 | _C.FPN.FRCNN_HEAD_FUNC = 'fastrcnn_4conv1fc_gn_head' 249 | # choices: fastrcnn_2fc_head, fastrcnn_4conv1fc_{,gn_}head 250 | _C.FPN.FRCNN_CONV_HEAD_DIM = 256 251 | _C.FPN.FRCNN_FC_HEAD_DIM = 1024 252 | _C.FPN.MRCNN_HEAD_FUNC = 'maskrcnn_up4conv_gn_head' # choices: maskrcnn_up4conv_{,gn_}head 253 | 254 | # Mask-RCNN 255 | _C.MRCNN.HEAD_DIM = 256 256 | 257 | # Cascade-RCNN, only available in FPN mode 258 | _C.FPN.CASCADE = True 259 | _C.CASCADE.IOUS = [0.5, 0.6, 0.7] 260 | _C.CASCADE.BBOX_REG_WEIGHTS = [[10., 10., 5., 5.], [20., 20., 10., 10.], [30., 30., 15., 15.]] 261 | 262 | # testing ----------------------- 263 | _C.TEST.FRCNN_NMS_THRESH = 0.5 264 | 265 | # Smaller threshold value gives significantly better mAP. But we use 0.05 for consistency with Detectron. 266 | # mAP with 1e-4 threshold can be found at https://github.com/tensorpack/tensorpack/commit/26321ae58120af2568bdbf2269f32aa708d425a8#diff-61085c48abee915b584027e1085e1043 # noqa 267 | _C.TEST.RESULT_SCORE_THRESH = 0.005 268 | _C.TEST.RESULT_SCORE_THRESH_VIS = 0.005 # only visualize confident results 269 | _C.TEST.RESULTS_PER_IM = 100 270 | 271 | _C.freeze() # avoid typo / wrong config keys 272 | 273 | 274 | def finalize_configs(is_training): 275 | """ 276 | Run some sanity checks, and populate some configs from others 277 | """ 278 | _C.freeze(False) # populate new keys now 279 | _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1 # +1 background 280 | _C.DATA.BASEDIR = os.path.expanduser(_C.DATA.BASEDIR) 281 | if isinstance(_C.DATA.VAL, six.string_types): # support single string (the typical case) as well 282 | _C.DATA.VAL = (_C.DATA.VAL, ) 283 | 284 | assert _C.BACKBONE.NORM in ['FreezeBN', 'SyncBN', 'GN', 'None'], _C.BACKBONE.NORM 285 | if _C.BACKBONE.NORM != 'FreezeBN': 286 | assert not _C.BACKBONE.FREEZE_AFFINE 287 | assert _C.BACKBONE.FREEZE_AT in [0, 1, 2, 3, 4] 288 | 289 | _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS) 290 | assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES) 291 | # image size into the backbone has to be multiple of this number 292 | _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[3] # [3] because we build FPN with features r2,r3,r4,r5 293 | 294 | if _C.MODE_FPN: 295 | size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1. 
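# e.g. with the defaults PREPROC.MAX_SIZE = 1333 and ANCHOR_STRIDES[3] = 32,
# this rounds the maximum edge up to ceil(1333 / 32) * 32 = 1344, so the padded
# input divides evenly down to the coarsest backbone feature map.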
296 | _C.PREPROC.MAX_SIZE = np.ceil(_C.PREPROC.MAX_SIZE / size_mult) * size_mult 297 | assert _C.FPN.PROPOSAL_MODE in ['Level', 'Joint'] 298 | assert _C.FPN.FRCNN_HEAD_FUNC.endswith('_head') 299 | assert _C.FPN.MRCNN_HEAD_FUNC.endswith('_head') 300 | assert _C.FPN.NORM in ['None', 'GN'] 301 | 302 | if _C.FPN.CASCADE: 303 | # the first threshold is the proposal sampling threshold 304 | assert _C.CASCADE.IOUS[0] == _C.FRCNN.FG_THRESH 305 | assert len(_C.CASCADE.BBOX_REG_WEIGHTS) == len(_C.CASCADE.IOUS) 306 | 307 | if is_training: 308 | train_scales = _C.PREPROC.TRAIN_SHORT_EDGE_SIZE 309 | if isinstance(train_scales, (list, tuple)) and train_scales[1] - train_scales[0] > 100: 310 | # don't autotune if augmentation is on 311 | os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0' 312 | os.environ['TF_AUTOTUNE_THRESHOLD'] = '1' 313 | assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER 314 | 315 | # setup NUM_GPUS 316 | if _C.TRAINER == 'horovod': 317 | import horovod.tensorflow as hvd 318 | ngpu = hvd.size() 319 | 320 | if ngpu == hvd.local_size(): 321 | logger.warn("It's not recommended to use horovod for single-machine training. " 322 | "Replicated trainer is more stable and has the same efficiency.") 323 | else: 324 | assert 'OMPI_COMM_WORLD_SIZE' not in os.environ 325 | ngpu = get_num_gpu() 326 | assert ngpu % 8 == 0 or 8 % ngpu == 0, "Can only train with 1,2,4 or >=8 GPUs, but found {} GPUs".format(ngpu) 327 | else: 328 | # autotune is too slow for inference 329 | os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0' 330 | ngpu = get_num_gpu() 331 | 332 | assert ngpu > 0, "Has to run with GPU!" 333 | if _C.TRAIN.NUM_GPUS is None: 334 | _C.TRAIN.NUM_GPUS = ngpu 335 | else: 336 | if _C.TRAINER == 'horovod': 337 | assert _C.TRAIN.NUM_GPUS == ngpu 338 | else: 339 | assert _C.TRAIN.NUM_GPUS <= ngpu 340 | 341 | _C.freeze() 342 | logger.info("Config: ------------------------------------------\n" + str(_C)) 343 | -------------------------------------------------------------------------------- /tracking/three_stage_tracker.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import time 3 | import numpy as np 4 | import scipy.sparse 5 | from tensorpack import PredictConfig, get_model_loader, OfflinePredictor 6 | from config import config as cfg 7 | 8 | from tracking.argmax_tracker import PrecomputingReferenceTracker 9 | from tracking.util import resize_and_clip_boxes, generate_colors, xyxy_to_cxcywh_np 10 | 11 | VIZ_WITH_OPENCV = True 12 | 13 | 14 | class Tracklet: 15 | def __init__(self, start_time): 16 | self.start_time = start_time 17 | self.end_time = start_time 18 | self.feats = [] 19 | self.boxes = [] 20 | self.ff_gt_scores = [] 21 | self.ff_gt_tracklet_scores = [] 22 | 23 | def add_detection(self, feat, box, ff_gt_score, ff_gt_tracklet_score): 24 | self.feats = [feat] 25 | self.boxes.append(box) 26 | self.ff_gt_scores.append(ff_gt_score) 27 | self.ff_gt_tracklet_scores.append(ff_gt_tracklet_score) 28 | self.end_time += 1 29 | 30 | 31 | class ThreeStageTracker(PrecomputingReferenceTracker): 32 | def __init__(self, tracklet_distance_threshold=0.06, tracklet_merging_threshold=0.3, 33 | tracklet_merging_second_best_relative_threshold=0.3, ff_gt_score_weight=0.1, 34 | ff_gt_tracklet_score_weight=0.9, location_score_weight=7.0, do_viz=False, 35 | name="ThreeStageTracker", model="best", n_proposals=None, resolution=None): 36 | """ 37 | :param tracklet_merging_threshold: minimum score required to merge a detection into tracklet 38 | :param 
tracklet_merging_second_best_relative_threshold: minimum score gap to second best match allowed to merge the best detection into tracklet 39 | """ 40 | if n_proposals is not None: 41 | cfg.RPN.TEST_PER_LEVEL_NMS_TOPK = n_proposals 42 | if resolution is not None: 43 | if resolution == "full": 44 | # nothing do do... 45 | pass 46 | elif resolution == "half": 47 | cfg.PREPROC.TEST_SHORT_EDGE_SIZE = 400 48 | cfg.PREPROC.MAX_SIZE = 667 49 | else: 50 | assert False, ("unknown resolution", resolution) 51 | super().__init__(name=name, need_network=True, need_img=True, model=model) 52 | self._n_proposals = n_proposals 53 | self._resolution = resolution 54 | self._ff_box = None 55 | self._ff_gt_tracklet = None 56 | self._all_tracklets = None 57 | self._time_idx = None 58 | self._imgs_for_viz = None 59 | self._ff_img_noresize = None 60 | self._ax = None 61 | self._cv_img = None 62 | self._do_viz = do_viz 63 | self._video_idx = -1 64 | self._video_name = None 65 | 66 | self._dynprog_scores = None 67 | self._tracklet_merging_threshold = tracklet_merging_threshold 68 | self._tracklet_merging_second_best_relative_threshold = tracklet_merging_second_best_relative_threshold 69 | self._tracklet_distance_threshold = tracklet_distance_threshold 70 | 71 | self._ff_gt_score_weight = ff_gt_score_weight 72 | self._ff_gt_tracklet_score_weight = ff_gt_tracklet_score_weight 73 | self._location_score_weight = location_score_weight 74 | 75 | def set_video_name(self, vid_name): 76 | self._video_name = vid_name 77 | 78 | def init(self, image, box): 79 | self._ff_box = None 80 | self._ff_gt_tracklet = None 81 | self._all_tracklets = None 82 | self._time_idx = 0 83 | self._ff_img_noresize = np.array(image)[..., ::-1] 84 | if self._do_viz: 85 | self._imgs_for_viz = [self._ff_img_noresize] 86 | self._video_idx += 1 87 | self._dynprog_scores = None 88 | 89 | super().init(image, box) 90 | self._ff_box = self._prev_box.copy() 91 | self._ff_gt_tracklet = Tracklet(start_time=0) 92 | self._ff_gt_tracklet.add_detection(self._ff_gt_feats, self._ff_box, 1.0, 1.0) 93 | self._all_tracklets = [self._ff_gt_tracklet] 94 | 95 | def _make_pred_func(self, load): 96 | cfg.MODE_THIRD_STAGE = True 97 | from train import ResNetFPNTrackModel 98 | pred_model = ResNetFPNTrackModel() 99 | predcfg = PredictConfig( 100 | model=pred_model, 101 | session_init=get_model_loader(load), 102 | input_names=pred_model.get_inference_tensor_names()[0], 103 | output_names=pred_model.get_inference_tensor_names()[1]) 104 | return OfflinePredictor(predcfg) 105 | 106 | def _update(self, img): 107 | if self._do_viz: 108 | # we currently only need the most recent frame for viz 109 | self._imgs_for_viz = [img] 110 | self._time_idx += 1 111 | start = time.time() 112 | self._update_tracklets(img) 113 | best_box, score = self._track() 114 | end = time.time() 115 | # print("tracking step elapsed (with network)", end - start) 116 | if self._do_viz: 117 | self._viz_tracklets() 118 | self._viz_result(best_box) 119 | # save out viz 120 | #import cv2 121 | #cv2.imwrite("/tmp/viz/%05d.jpg" % self._time_idx, self._cv_img) 122 | return best_box, score 123 | 124 | def _update_tracklets(self, img): 125 | active_tracklets = [t for t in self._all_tracklets if t.end_time == self._time_idx] 126 | if len(active_tracklets) == 0: 127 | active_tracklets_boxes_noresize = np.zeros((0, 4), dtype=np.float32) 128 | active_tracklets_feats = np.zeros((0, 256, 7, 7)) 129 | else: 130 | active_tracklets_boxes_noresize = np.stack([t.boxes[-1] for t in active_tracklets], axis=0) 131 | 
active_tracklets_feats = np.stack([t.feats[-1] for t in active_tracklets], axis=0) 132 | resized_img, active_tracklets_boxes = self._resize_image_together_with_boxes(img, 133 | active_tracklets_boxes_noresize) 134 | boxes, scores, third_stage_feats_out, ff_gt_tracklet_scores, sparse_tracklet_scores, \ 135 | tracklet_score_indices = self._pred_func( 136 | resized_img, self._ff_gt_feats, self._ff_gt_tracklet.feats[-1], active_tracklets_feats, 137 | active_tracklets_boxes, self._tracklet_distance_threshold) 138 | boxes = resize_and_clip_boxes(img, resized_img, boxes) 139 | # for simplicity let's just convert it to a dense matrix. If that gets too large, we can still change it. 140 | tracklet_scores = scipy.sparse.coo_matrix((sparse_tracklet_scores, (tracklet_score_indices[:, 0], 141 | tracklet_score_indices[:, 1])), 142 | shape=(len(active_tracklets), scores.size) 143 | ).toarray() 144 | # free memory 145 | for t in self._all_tracklets: 146 | if t.end_time != self._time_idx and t.start_time != 0: 147 | t.feats = None 148 | self._update_tracklets_with_network_outputs(active_tracklets, boxes, scores, third_stage_feats_out, 149 | ff_gt_tracklet_scores, tracklet_scores) 150 | 151 | def _update_tracklets_with_network_outputs(self, active_tracklets, boxes, scores, third_stage_feats_out, 152 | ff_gt_tracklet_scores, tracklet_scores): 153 | n_dets = scores.size 154 | for det_idx in range(n_dets): 155 | merged = False 156 | det_args = (third_stage_feats_out[det_idx], boxes[det_idx], scores[det_idx], 157 | ff_gt_tracklet_scores[det_idx]) 158 | 159 | # try to extend tracklets in active_tracklets 160 | if tracklet_scores.size > 0: 161 | if tracklet_scores[:, det_idx].max() > self._tracklet_merging_threshold: 162 | tracklet_idx = tracklet_scores[:, det_idx].argmax() 163 | max_score = tracklet_scores[tracklet_idx, det_idx] 164 | # there should be no other det which has a high similarity 165 | if (tracklet_scores[tracklet_idx] >= max_score - self._tracklet_merging_second_best_relative_threshold).sum() == 1: 166 | # there should be no other tracklet to which this det is similar... 167 | if (tracklet_scores[:, det_idx] >= max_score - self._tracklet_merging_second_best_relative_threshold).sum() == 1: 168 | active_tracklets[tracklet_idx].add_detection(*det_args) 169 | merged = True 170 | 171 | # otherwise start new tracklet 172 | if not merged: 173 | tracklet = Tracklet(start_time=self._time_idx) 174 | tracklet.add_detection(*det_args) 175 | self._all_tracklets.append(tracklet) 176 | 177 | def _track(self): 178 | # we know that the tracklets are always sorted by time! 
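        # (Added note) Sketch of the dynamic program implemented below, based on the code in this method:
        #   - self._dynprog_scores[i] holds the best accumulated score of any chain of tracklets
        #     ending in tracklet i; index 0 is the first-frame GT tracklet (initialised to 0),
        #     all other entries start at -1e20.
        #   - Each tracklet active in the current frame is linked to its best predecessor among
        #     tracklets that ended no later than its start time (and within TRACKLET_KEEP_ALIVE_TIME),
        #     penalised by the normalised spatial gap between the predecessor's last box and this
        #     tracklet's first box, weighted by self._location_score_weight.
        #   - The tracklet's own contribution is a weighted sum of its first-frame GT scores and
        #     GT-tracklet scores.
        #   - The last box of the highest-scoring tracklet is reported, with a confidence that drops
        #     below zero when that tracklet was not extended in the current frame.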
179 | n_tracklets = len(self._all_tracklets) 180 | last_dynprog_scores = self._dynprog_scores 181 | self._dynprog_scores = np.full(n_tracklets, fill_value=-1e20, dtype=np.float32) 182 | # init gt tracklet score 183 | self._dynprog_scores[0] = 0.0 184 | if last_dynprog_scores is not None: 185 | self._dynprog_scores[:last_dynprog_scores.size] = last_dynprog_scores 186 | end_times = np.array([t.end_time for t in self._all_tracklets]) 187 | im_h, im_w = self._ff_img_noresize.shape[:2] 188 | norm = np.array([im_w, im_h, im_w, im_h], np.float32) 189 | 190 | active_indices, = np.where(end_times >= self._time_idx + 1) 191 | active_tracklets = [self._all_tracklets[idx] for idx in active_indices] 192 | 193 | TRACKLET_KEEP_ALIVE_TIME = 1500 194 | if len(active_tracklets) > 0: 195 | if len(active_tracklets) == n_tracklets: 196 | alive_start_time = 0 197 | else: 198 | # select non-active tracklets: end_times < self._time_idx + 1 199 | alive_start_time = end_times[end_times < self._time_idx + 1].max() 200 | 201 | alive_indices, = np.where(end_times >= alive_start_time + 1 - TRACKLET_KEEP_ALIVE_TIME) 202 | alive_tracklets = [self._all_tracklets[idx] for idx in alive_indices] 203 | alive_end_boxes_cxcywh = xyxy_to_cxcywh_np(np.array([t.boxes[-1] for t in alive_tracklets])) 204 | alive_end_times = end_times[alive_indices] 205 | alive_dynprog_scores = self._dynprog_scores[alive_indices] 206 | active_start_boxes_cxcywh = xyxy_to_cxcywh_np(np.array([t.boxes[0] for t in active_tracklets])) 207 | all_pairwise_diffs = np.abs(active_start_boxes_cxcywh[:, np.newaxis] - alive_end_boxes_cxcywh[np.newaxis]) / norm 208 | all_pairwise_diffs = -all_pairwise_diffs.mean(axis=2) 209 | 210 | for idx, t_idx in enumerate(active_indices): 211 | tracklet = self._all_tracklets[t_idx] 212 | unary = self._ff_gt_score_weight * sum(tracklet.ff_gt_scores) + \ 213 | self._ff_gt_tracklet_score_weight * sum(tracklet.ff_gt_tracklet_scores) 214 | 215 | valid_mask = tracklet.start_time >= alive_end_times 216 | if valid_mask.any(): 217 | pairwise_scores = all_pairwise_diffs[idx] 218 | pred_scores = alive_dynprog_scores + self._location_score_weight * pairwise_scores 219 | pred_scores[np.logical_not(valid_mask)] = -1e20 220 | best_pred_idx = pred_scores.argmax() 221 | best_pred_score = pred_scores[best_pred_idx] 222 | if best_pred_score > -1e20: 223 | self._dynprog_scores[t_idx] = best_pred_score + unary 224 | 225 | t_idx = self._dynprog_scores.argmax() 226 | tracklet = self._all_tracklets[t_idx] 227 | # add current frame score weighted with epsilon to change relative ranking within tracklet 228 | EPSILON = 0.00001 229 | if tracklet.end_time >= self._time_idx + 1: 230 | score = self._ff_gt_score_weight * max(tracklet.ff_gt_scores) + \ 231 | self._ff_gt_tracklet_score_weight * max(tracklet.ff_gt_tracklet_scores) \ 232 | + EPSILON * tracklet.ff_gt_scores[-1] 233 | else: 234 | score = -1.0 + EPSILON * tracklet.ff_gt_scores[-1] 235 | # or we could select the best tracklet in current frame 236 | return tracklet.boxes[-1], score 237 | 238 | if VIZ_WITH_OPENCV: 239 | def _viz_tracklets(self): 240 | print("viz tracklets frame", self._time_idx) 241 | import cv2 242 | self._cv_img = self._imgs_for_viz[-1].copy() 243 | colors = generate_colors() 244 | t = self._time_idx 245 | for idx, tracklet in enumerate(self._all_tracklets): 246 | # probably filter by confidence and tracklet length 247 | #if tracklet.end_time - tracklet.start_time < 2: 248 | # continue 249 | if max(tracklet.ff_gt_scores) < 0.2: 250 | continue 251 | if tracklet.start_time <= t < 
tracklet.end_time: 252 | color = colors[idx % len(colors)] 253 | box = tracklet.boxes[t - tracklet.start_time] 254 | #cv2.rectangle(self._cv_img, (box[0], box[1]), (box[2], box[3]), [255 * x for x in color], 1) 255 | 256 | def _viz_result(self, box): 257 | import cv2 258 | #cv2.rectangle(self._cv_img, (box[0], box[1]), (box[2], box[3]), (0, 0, 255), 6) 259 | cv2.rectangle(self._cv_img, (box[0], box[1]), (box[2], box[3]), (0, 252, 124), 6) 260 | cv2.imshow('SUPERTRACK', self._cv_img) 261 | cv2.waitKey(1) 262 | #cv2.waitKey(0) 263 | else: 264 | def _viz_tracklets(self): 265 | print("viz tracklets frame", self._time_idx) 266 | import matplotlib.pyplot as plt 267 | from matplotlib.patches import Rectangle 268 | if self._ax is None: 269 | fig, self._ax = plt.subplots(1) 270 | colors = generate_colors() 271 | t = self._time_idx 272 | img = self._imgs_for_viz[-1] 273 | self._ax.clear() 274 | self._ax.imshow(img[..., ::-1]) 275 | for idx, tracklet in enumerate(self._all_tracklets): 276 | # probably filter by confidence and tracklet length 277 | if tracklet.end_time - tracklet.start_time < 2: 278 | continue 279 | if max(tracklet.ff_gt_scores) < 0.2: 280 | continue 281 | if tracklet.start_time <= t < tracklet.end_time: 282 | color = colors[idx % len(colors)] 283 | box = tracklet.boxes[t - tracklet.start_time] 284 | width = box[2] - box[0] 285 | height = box[3] - box[1] 286 | rect = Rectangle((box[0], box[1]), width, height, color=color, fill=False) 287 | self._ax.add_patch(rect) 288 | # plt.pause(0.0001) 289 | 290 | def _viz_result(self, box): 291 | width = box[2] - box[0] 292 | height = box[3] - box[1] 293 | import matplotlib.pyplot as plt 294 | from matplotlib.patches import Rectangle 295 | rect = Rectangle((box[0], box[1]), width, height, color="red", fill=False, linewidth=4.0) 296 | self._ax.add_patch(rect) 297 | plt.pause(0.00001) 298 | -------------------------------------------------------------------------------- /tracking/do_tracking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | 5 | from got10k.experiments import ExperimentGOT10k, ExperimentVOT, ExperimentOTB, ExperimentUAV123, ExperimentLaSOT, ExperimentDAVIS, ExperimentYouTubeVOS, ExperimentTrackingNet, ExperimentOxuva, ExperimentNfS, ExperimentTColor128 6 | from got10k.experiments.custom import ExperimentCustom 7 | 8 | from tracking.argmax_tracker import ArgmaxTracker 9 | from tracking.three_stage_tracker import ThreeStageTracker 10 | 11 | # change these data paths to where you have the datasets! 
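# Each dataset is expected as its own subdirectory of DATASET_PREFIX, using the directory
# names given below (e.g. <DATASET_PREFIX>/GOT10k, <DATASET_PREFIX>/LaSOTBenchmark).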
12 | DATASET_PREFIX = "/globalwork/data/" 13 | VOT18_ROOT_DIR = os.path.join(DATASET_PREFIX, 'vot18') 14 | VOT17_ROOT_DIR = os.path.join(DATASET_PREFIX, 'vot17') 15 | VOT16_ROOT_DIR = os.path.join(DATASET_PREFIX, 'vot16') 16 | VOT15_ROOT_DIR = os.path.join(DATASET_PREFIX, 'vot15') 17 | VOT18_LT_ROOT_DIR = os.path.join(DATASET_PREFIX, 'vot18-lt') 18 | OTB_2015_ROOT_DIR = os.path.join(DATASET_PREFIX, 'OTB_new') 19 | OTB_2013_ROOT_DIR = os.path.join(DATASET_PREFIX, 'OTB2013') 20 | DAVIS_2017_ROOT_DIR = os.path.join(DATASET_PREFIX, 'DAVIS2017') 21 | YOUTUBE_VOS_2019_ROOT_DIR = os.path.join(DATASET_PREFIX, "youtube-vos-2019") 22 | GOT10K_ROOT_DIR = os.path.join(DATASET_PREFIX, 'GOT10k') 23 | UAV123_ROOT_DIR = os.path.join(DATASET_PREFIX, 'UAV123') 24 | LASOT_ROOT_DIR = os.path.join(DATASET_PREFIX, 'LaSOTBenchmark') 25 | TRACKINGNET_ROOT_DIR = os.path.join(DATASET_PREFIX, 'TrackingNet') 26 | NFS_ROOT_DIR = os.path.join(DATASET_PREFIX, 'nfs') 27 | TC128_ROOT_DIR = os.path.join(DATASET_PREFIX, 'tc128/Temple-color-128') 28 | OXUVA_ROOT_DIR = os.path.join(DATASET_PREFIX, 'oxuva') 29 | 30 | RESULT_DIR = 'tracking_data/results/' 31 | REPORT_DIR = 'tracking_data/reports/' 32 | 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--start_idx', type=int, help='first video index to process', default=0) 35 | parser.add_argument('--end_idx', type=int, help='last video index to process (exclusive)', default=None) 36 | 37 | # TDPA parameters. You can just leave them at the default values which will work well on a wide range of datasets 38 | parser.add_argument('--tracklet_distance_threshold', type=float, default=0.06) 39 | parser.add_argument('--tracklet_merging_threshold', type=float, default=0.3) 40 | parser.add_argument('--tracklet_merging_second_best_relative_threshold', type=float, default=0.3) 41 | parser.add_argument('--ff_gt_score_weight', type=float, default=0.1) 42 | parser.add_argument('--ff_gt_tracklet_score_weight', type=float, default=0.9) 43 | parser.add_argument('--location_score_weight', type=float, default=7.0) 44 | 45 | parser.add_argument('--model', type=str, default="best", help='one of "best", "nohardexamples", or "gotonly"') 46 | parser.add_argument('--tracker', type=str, default='ThreeStageTracker') 47 | parser.add_argument('--n_proposals', type=int, default=None) 48 | parser.add_argument('--resolution', type=str, default=None) 49 | parser.add_argument('--visualize_tracker', action='store_true', 50 | help='use visualization of tracker (recommended over --visualize_experiment)') 51 | parser.add_argument('--visualize_experiment', action='store_true', 52 | help='use visualization of got experiment (not recommended, usually --visualize_tracker is better)') 53 | parser.add_argument('--custom_dataset_name', type=str, default=None) 54 | parser.add_argument('--custom_dataset_root_dir', type=str, default=None) 55 | parser.add_argument('--main', type=str) 56 | args = parser.parse_args() 57 | 58 | 59 | def build_tracker(): 60 | if args.tracker == "ArgmaxTracker": 61 | return ArgmaxTracker() 62 | elif args.tracker == "ThreeStageTracker": 63 | pass 64 | else: 65 | assert False, ("Unknown tracker", args.tracker) 66 | 67 | tracklet_param_str = str(args.tracklet_distance_threshold) + "_" + str(args.tracklet_merging_threshold) + "_" + \ 68 | str(args.tracklet_merging_second_best_relative_threshold) 69 | if args.n_proposals is not None: 70 | tracklet_param_str += "_proposals" + str(args.n_proposals) 71 | if args.resolution is not None: 72 | tracklet_param_str += "_resolution-" + 
str(args.resolution) 73 | if args.model != "best": 74 | tracklet_param_str = args.model + "_" + tracklet_param_str 75 | if args.visualize_tracker: 76 | tracklet_param_str2 = "viz_" + tracklet_param_str 77 | else: 78 | tracklet_param_str2 = tracklet_param_str 79 | param_str = tracklet_param_str2 + "_" + str(args.ff_gt_score_weight) + "_" + \ 80 | str(args.ff_gt_tracklet_score_weight) + "_" + str(args.location_score_weight) 81 | 82 | name = "ThreeStageTracker_" + param_str 83 | tracker = ThreeStageTracker(tracklet_distance_threshold=args.tracklet_distance_threshold, 84 | tracklet_merging_threshold=args.tracklet_merging_threshold, 85 | tracklet_merging_second_best_relative_threshold= 86 | args.tracklet_merging_second_best_relative_threshold, 87 | ff_gt_score_weight=args.ff_gt_score_weight, 88 | ff_gt_tracklet_score_weight=args.ff_gt_tracklet_score_weight, 89 | location_score_weight=args.location_score_weight, 90 | name=name, 91 | do_viz=args.visualize_tracker, 92 | model=args.model, 93 | n_proposals=args.n_proposals, 94 | resolution=args.resolution) 95 | return tracker 96 | 97 | 98 | def main_vot18(reset=True): 99 | root_dir = VOT18_ROOT_DIR 100 | if reset: 101 | experiments = "supervised" 102 | else: 103 | experiments = "unsupervised" 104 | tracker = build_tracker() 105 | experiment = ExperimentVOT( 106 | root_dir=root_dir, 107 | version=2018, 108 | result_dir=RESULT_DIR, 109 | report_dir=REPORT_DIR, 110 | experiments=experiments, 111 | start_idx=args.start_idx, 112 | end_idx=args.end_idx 113 | ) 114 | experiment.run(tracker, visualize=args.visualize_experiment) 115 | experiment.report([tracker.name]) 116 | 117 | 118 | def main_vot18_noreset(): 119 | main_vot18(reset=False) 120 | 121 | 122 | def main_vot18_threestage(): 123 | tracker = build_tracker() 124 | root_dir = VOT18_ROOT_DIR 125 | experiment = ExperimentVOT( 126 | root_dir=root_dir, 127 | version=2018, 128 | result_dir=RESULT_DIR, 129 | report_dir=REPORT_DIR, 130 | experiments="supervised", 131 | start_idx=args.start_idx, 132 | end_idx=args.end_idx 133 | ) 134 | experiment.run(tracker, visualize=args.visualize_experiment) 135 | experiment.report([tracker.name]) 136 | 137 | 138 | def main_vot17(): 139 | root_dir = VOT17_ROOT_DIR 140 | experiments = "supervised" 141 | tracker = build_tracker() 142 | experiment = ExperimentVOT( 143 | root_dir=root_dir, 144 | version=2017, 145 | result_dir=RESULT_DIR, 146 | report_dir=REPORT_DIR, 147 | experiments=experiments, 148 | start_idx=args.start_idx, 149 | end_idx=args.end_idx 150 | ) 151 | experiment.run(tracker, visualize=args.visualize_experiment) 152 | 153 | 154 | def main_vot16(): 155 | root_dir = VOT16_ROOT_DIR 156 | experiments = "supervised" 157 | tracker = build_tracker() 158 | experiment = ExperimentVOT( 159 | root_dir=root_dir, 160 | version=2016, 161 | result_dir=RESULT_DIR, 162 | report_dir=REPORT_DIR, 163 | experiments=experiments, 164 | start_idx=args.start_idx, 165 | end_idx=args.end_idx 166 | ) 167 | experiment.run(tracker, visualize=args.visualize_experiment) 168 | 169 | 170 | def main_vot15(): 171 | root_dir = VOT15_ROOT_DIR 172 | experiments = "supervised" 173 | tracker = build_tracker() 174 | experiment = ExperimentVOT( 175 | root_dir=root_dir, 176 | version=2015, 177 | result_dir=RESULT_DIR, 178 | report_dir=REPORT_DIR, 179 | experiments=experiments, 180 | start_idx=args.start_idx, 181 | end_idx=args.end_idx 182 | ) 183 | experiment.run(tracker, visualize=args.visualize_experiment) 184 | 185 | 186 | def main_vot18lt(): 187 | tracker = build_tracker() 188 | experiment = 
ExperimentVOT( 189 | root_dir=VOT18_LT_ROOT_DIR, 190 | version='LT2018', 191 | result_dir=RESULT_DIR, 192 | report_dir=REPORT_DIR, 193 | experiments="unsupervised", 194 | start_idx=args.start_idx, 195 | end_idx=args.end_idx 196 | ) 197 | experiment.run(tracker, visualize=args.visualize_experiment) 198 | # this needs to be eval'ed from matlab, so do not call report() 199 | 200 | 201 | def main_otb(): 202 | tracker = build_tracker() 203 | root_dir = OTB_2015_ROOT_DIR 204 | experiment = ExperimentOTB( 205 | root_dir=root_dir, 206 | result_dir=RESULT_DIR, 207 | report_dir=REPORT_DIR, 208 | start_idx=args.start_idx, 209 | end_idx=args.end_idx 210 | ) 211 | experiment.run(tracker, visualize=args.visualize_experiment) 212 | experiment.report([tracker.name]) 213 | 214 | 215 | def main_otb2013(): 216 | tracker = build_tracker() 217 | root_dir = OTB_2013_ROOT_DIR 218 | experiment = ExperimentOTB( 219 | version=2013, 220 | root_dir=root_dir, 221 | result_dir=RESULT_DIR, 222 | report_dir=REPORT_DIR, 223 | start_idx=args.start_idx, 224 | end_idx=args.end_idx 225 | ) 226 | experiment.run(tracker, visualize=args.visualize_experiment) 227 | experiment.report([tracker.name]) 228 | 229 | 230 | def main_otb50(): 231 | tracker = build_tracker() 232 | root_dir = OTB_2015_ROOT_DIR 233 | experiment = ExperimentOTB( 234 | version='tb50', 235 | root_dir=root_dir, 236 | result_dir=RESULT_DIR, 237 | report_dir=REPORT_DIR, 238 | start_idx=args.start_idx, 239 | end_idx=args.end_idx 240 | ) 241 | experiment.run(tracker, visualize=args.visualize_experiment) 242 | experiment.report([tracker.name]) 243 | 244 | 245 | def main_davis(version="2017_val"): 246 | tracker = build_tracker() 247 | root_dir = DAVIS_2017_ROOT_DIR 248 | experiment = ExperimentDAVIS( 249 | root_dir=root_dir, 250 | result_dir=RESULT_DIR, 251 | report_dir=REPORT_DIR, 252 | start_idx=args.start_idx, 253 | end_idx=args.end_idx, 254 | version=version 255 | ) 256 | experiment.run(tracker, visualize=args.visualize_experiment) 257 | experiment.report([tracker.name]) 258 | 259 | 260 | def main_davis2016(): 261 | main_davis(version="2016_val") 262 | 263 | 264 | def main_davis2017(): 265 | main_davis(version="2017_val") 266 | 267 | 268 | def main_davis2017_testdev(): 269 | main_davis(version="2017_testdev") 270 | 271 | 272 | def main_davis2017_train(): 273 | main_davis(version="2017_train") 274 | 275 | 276 | def main_davis2017_train_multiobj(): 277 | main_davis(version="2017_train_multiobj") 278 | 279 | 280 | def main_youtubevos(version="valid"): 281 | tracker = build_tracker() 282 | root_dir = YOUTUBE_VOS_2019_ROOT_DIR 283 | experiment = ExperimentYouTubeVOS( 284 | root_dir=root_dir, 285 | result_dir=RESULT_DIR, 286 | report_dir=REPORT_DIR, 287 | start_idx=args.start_idx, 288 | end_idx=args.end_idx, 289 | version=version 290 | ) 291 | experiment.run(tracker, visualize=args.visualize_experiment) 292 | 293 | 294 | def main_got(subset='val'): 295 | dataset_name = "GOT10k" 296 | if subset != 'val': 297 | dataset_name += "_" + subset 298 | tracker = build_tracker() 299 | experiment = ExperimentGOT10k( 300 | root_dir=GOT10K_ROOT_DIR, # GOT-10k's root directory 301 | subset=subset, # 'train' | 'val' | 'test' 302 | result_dir=RESULT_DIR, # where to store tracking results 303 | report_dir=REPORT_DIR, # where to store evaluation reports 304 | start_idx=args.start_idx, 305 | end_idx=args.end_idx 306 | ) 307 | experiment.run(tracker, visualize=args.visualize_experiment) 308 | experiment.report([tracker.name]) 309 | 310 | 311 | def main_got_test(): 312 | 
main_got(subset='test') 313 | 314 | 315 | def main_uav123(): 316 | tracker = build_tracker() 317 | experiment = ExperimentUAV123( 318 | root_dir=UAV123_ROOT_DIR, 319 | result_dir=RESULT_DIR, 320 | report_dir=REPORT_DIR, 321 | start_idx=args.start_idx, 322 | end_idx=args.end_idx 323 | ) 324 | experiment.run(tracker, visualize=args.visualize_experiment) 325 | experiment.report([tracker.name]) 326 | 327 | 328 | def main_uav20l(): 329 | tracker = build_tracker() 330 | experiment = ExperimentUAV123( 331 | root_dir=UAV123_ROOT_DIR, 332 | version='UAV20L', 333 | result_dir=RESULT_DIR, 334 | report_dir=REPORT_DIR, 335 | start_idx=args.start_idx, 336 | end_idx=args.end_idx 337 | ) 338 | experiment.run(tracker, visualize=args.visualize_experiment) 339 | experiment.report([tracker.name]) 340 | 341 | 342 | def main_lasot(): 343 | tracker = build_tracker() 344 | experiment = ExperimentLaSOT( 345 | root_dir=LASOT_ROOT_DIR, 346 | result_dir=RESULT_DIR, 347 | report_dir=REPORT_DIR, 348 | subset='test', 349 | start_idx=args.start_idx, 350 | end_idx=args.end_idx 351 | ) 352 | experiment.run(tracker, visualize=args.visualize_experiment) 353 | experiment.report([tracker.name]) 354 | 355 | 356 | def main_trackingnet(): 357 | tracker = build_tracker() 358 | experiment = ExperimentTrackingNet( 359 | root_dir=TRACKINGNET_ROOT_DIR, 360 | result_dir=RESULT_DIR, 361 | report_dir=REPORT_DIR, 362 | subset='test', 363 | start_idx=args.start_idx, 364 | end_idx=args.end_idx 365 | ) 366 | experiment.run(tracker, visualize=args.visualize_experiment) 367 | 368 | 369 | def main_nfs(): 370 | tracker = build_tracker() 371 | experiment = ExperimentNfS( 372 | root_dir=NFS_ROOT_DIR, 373 | fps=30, 374 | result_dir=RESULT_DIR, 375 | report_dir=REPORT_DIR, 376 | start_idx=args.start_idx, 377 | end_idx=args.end_idx 378 | ) 379 | experiment.run(tracker, visualize=args.visualize_experiment) 380 | experiment.report([tracker.name]) 381 | 382 | 383 | def main_tc128(): 384 | tracker = build_tracker() 385 | experiment = ExperimentTColor128( 386 | root_dir=TC128_ROOT_DIR, 387 | result_dir=RESULT_DIR, 388 | report_dir=REPORT_DIR, 389 | start_idx=args.start_idx, 390 | end_idx=args.end_idx 391 | ) 392 | experiment.run(tracker, visualize=args.visualize_experiment) 393 | experiment.report([tracker.name]) 394 | 395 | 396 | def main_oxuva(testset=True): 397 | tracker = build_tracker() 398 | experiment = ExperimentOxuva( 399 | root_dir=OXUVA_ROOT_DIR, 400 | result_dir=RESULT_DIR, 401 | report_dir=REPORT_DIR, 402 | subset='test' if testset else 'dev', 403 | start_idx=args.start_idx, 404 | end_idx=args.end_idx 405 | ) 406 | experiment.run(tracker, visualize=args.visualize_experiment) 407 | 408 | 409 | def main_oxuva_dev(): 410 | main_oxuva(testset=False) 411 | 412 | 413 | def main_custom(): 414 | custom_dataset_root_dir = args.custom_dataset_root_dir 415 | assert custom_dataset_root_dir is not None 416 | custom_dataset_name = args.custom_dataset_name 417 | assert custom_dataset_name is not None 418 | tracker = build_tracker() 419 | experiment = ExperimentCustom( 420 | root_dir=custom_dataset_root_dir, 421 | name=custom_dataset_name 422 | ) 423 | experiment.run(tracker, visualize=args.visualize_experiment) 424 | 425 | 426 | if __name__ == "__main__": 427 | assert args.main is not None, "--main not supplied, e.g. 
--main main_otb" 428 | eval(args.main + "()") 429 | -------------------------------------------------------------------------------- /model_frcnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: model.py 3 | 4 | import tensorflow as tf 5 | 6 | from tensorpack.models import Conv2D, FullyConnected, layer_register 7 | from tensorpack.tfutils.argscope import argscope 8 | from tensorpack.tfutils.common import get_tf_version_tuple 9 | from tensorpack.tfutils.scope_utils import under_name_scope 10 | from tensorpack.tfutils.summary import add_moving_summary 11 | from tensorpack.utils.argtools import memoized_method 12 | 13 | from basemodel import GroupNorm 14 | from config import config as cfg 15 | from model_box import decode_bbox_target, encode_bbox_target 16 | from utils.box_ops import pairwise_iou 17 | 18 | 19 | @under_name_scope() 20 | def proposal_metrics(iou): 21 | """ 22 | Add summaries for RPN proposals. 23 | 24 | Args: 25 | iou: nxm, #proposal x #gt 26 | """ 27 | # find best roi for each gt, for summary only 28 | best_iou = tf.reduce_max(iou, axis=0) 29 | mean_best_iou = tf.reduce_mean(best_iou, name='best_iou_per_gt') 30 | summaries = [mean_best_iou] 31 | with tf.device('/cpu:0'): 32 | for th in [0.3, 0.5]: 33 | recall = tf.truediv( 34 | tf.count_nonzero(best_iou >= th), 35 | tf.size(best_iou, out_type=tf.int64), 36 | name='recall_iou{}'.format(th)) 37 | summaries.append(recall) 38 | add_moving_summary(*summaries) 39 | 40 | 41 | @under_name_scope() 42 | def sample_fast_rcnn_targets(boxes, gt_boxes, gt_labels): 43 | """ 44 | Sample some boxes from all proposals for training. 45 | #fg is guaranteed to be > 0, because ground truth boxes will be added as proposals. 46 | 47 | Args: 48 | boxes: nx4 region proposals, floatbox 49 | gt_boxes: mx4, floatbox 50 | gt_labels: m, int32 51 | 52 | Returns: 53 | A BoxProposals instance. 54 | sampled_boxes: tx4 floatbox, the rois 55 | sampled_labels: t int64 labels, in [0, #class). Positive means foreground. 56 | fg_inds_wrt_gt: #fg indices, each in range [0, m-1]. 57 | It contains the matching GT of each foreground roi. 
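        Example: if three foreground rois are sampled and their best-matching ground truth
        boxes are 0, 0 and 2 (in that order), then fg_inds_wrt_gt is [0, 0, 2].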
58 | """ 59 | iou = pairwise_iou(boxes, gt_boxes) # nxm 60 | proposal_metrics(iou) 61 | 62 | # add ground truth as proposals as well 63 | boxes = tf.concat([boxes, gt_boxes], axis=0) # (n+m) x 4 64 | iou = tf.concat([iou, tf.eye(tf.shape(gt_boxes)[0])], axis=0) # (n+m) x m 65 | # #proposal=n+m from now on 66 | 67 | def sample_fg_bg(iou): 68 | fg_mask = tf.reduce_max(iou, axis=1) >= cfg.FRCNN.FG_THRESH 69 | 70 | fg_inds = tf.reshape(tf.where(fg_mask), [-1]) 71 | num_fg = tf.minimum(int( 72 | cfg.FRCNN.BATCH_PER_IM * cfg.FRCNN.FG_RATIO), 73 | tf.size(fg_inds), name='num_fg') 74 | fg_inds = tf.random_shuffle(fg_inds)[:num_fg] 75 | 76 | bg_inds = tf.reshape(tf.where(tf.logical_not(fg_mask)), [-1]) 77 | num_bg = tf.minimum( 78 | cfg.FRCNN.BATCH_PER_IM - num_fg, 79 | tf.size(bg_inds), name='num_bg') 80 | bg_inds = tf.random_shuffle(bg_inds)[:num_bg] 81 | 82 | add_moving_summary(num_fg, num_bg) 83 | return fg_inds, bg_inds 84 | 85 | fg_inds, bg_inds = sample_fg_bg(iou) 86 | # fg,bg indices w.r.t proposals 87 | 88 | best_iou_ind = tf.argmax(iou, axis=1) # #proposal, each in 0~m-1 89 | fg_inds_wrt_gt = tf.gather(best_iou_ind, fg_inds) # num_fg 90 | 91 | all_indices = tf.concat([fg_inds, bg_inds], axis=0) # indices w.r.t all n+m proposal boxes 92 | ret_boxes = tf.gather(boxes, all_indices) 93 | 94 | ret_labels = tf.concat( 95 | [tf.gather(gt_labels, fg_inds_wrt_gt), 96 | tf.zeros_like(bg_inds, dtype=tf.int64)], axis=0) 97 | # stop the gradient -- they are meant to be training targets 98 | return BoxProposals( 99 | tf.stop_gradient(ret_boxes, name='sampled_proposal_boxes'), 100 | tf.stop_gradient(ret_labels, name='sampled_labels'), 101 | tf.stop_gradient(fg_inds_wrt_gt)) 102 | 103 | 104 | @layer_register(log_shape=True) 105 | def fastrcnn_outputs(feature, num_classes, class_agnostic_regression=False): 106 | """ 107 | Args: 108 | feature (any shape): 109 | num_classes(int): num_category + 1 110 | class_agnostic_regression (bool): if True, regression to N x 1 x 4 111 | 112 | Returns: 113 | cls_logits: N x num_class classification logits 114 | reg_logits: N x num_classx4 or Nx2x4 if class agnostic 115 | """ 116 | classification = FullyConnected( 117 | 'class', feature, num_classes, 118 | kernel_initializer=tf.random_normal_initializer(stddev=0.01)) 119 | num_classes_for_box = 1 if class_agnostic_regression else num_classes 120 | box_regression = FullyConnected( 121 | 'box', feature, num_classes_for_box * 4, 122 | kernel_initializer=tf.random_normal_initializer(stddev=0.001)) 123 | box_regression = tf.reshape(box_regression, (-1, num_classes_for_box, 4), name='output_box') 124 | return classification, box_regression 125 | 126 | 127 | @under_name_scope() 128 | def fastrcnn_losses(labels, label_logits, fg_boxes, fg_box_logits): 129 | """ 130 | Args: 131 | labels: n, 132 | label_logits: nxC 133 | fg_boxes: nfgx4, encoded 134 | fg_box_logits: nfgxCx4 or nfgx1x4 if class agnostic 135 | 136 | Returns: 137 | label_loss, box_loss 138 | """ 139 | label_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( 140 | labels=labels, logits=label_logits) 141 | if cfg.FRCNN.USE_FOCAL_LOSS: 142 | indices = tf.stack((tf.cast(tf.range(tf.shape(labels)[0]), tf.int64), labels), axis=1) 143 | posteriors = tf.nn.softmax(label_logits, axis=1) 144 | gathered_posteriors = tf.gather_nd(posteriors, indices) 145 | gamma = 2.0 146 | label_loss = 5 * (1 - gathered_posteriors) ** gamma * label_loss 147 | # possibly upweight the foreground labels for balancing 148 | if cfg.FRCNN.FG_LOSS_WEIGHTING_FACTOR != 1.0: 149 | label_loss *= 
(tf.constant(cfg.FRCNN.FG_LOSS_WEIGHTING_FACTOR - 1.0, dtype=tf.float32) * \ 150 | tf.cast(labels > 0, tf.float32)) + 1.0 151 | 152 | label_loss = tf.reduce_mean(label_loss, name='label_loss') 153 | 154 | fg_inds = tf.where(labels > 0)[:, 0] 155 | fg_labels = tf.gather(labels, fg_inds) 156 | num_fg = tf.size(fg_inds, out_type=tf.int64) 157 | empty_fg = tf.equal(num_fg, 0) 158 | if int(fg_box_logits.shape[1]) > 1: 159 | indices = tf.stack( 160 | [tf.range(num_fg), fg_labels], axis=1) # #fgx2 161 | fg_box_logits = tf.gather_nd(fg_box_logits, indices) 162 | else: 163 | fg_box_logits = tf.reshape(fg_box_logits, [-1, 4]) 164 | 165 | with tf.name_scope('label_metrics'), tf.device('/cpu:0'): 166 | prediction = tf.argmax(label_logits, axis=1, name='label_prediction') 167 | correct = tf.cast(tf.equal(prediction, labels), tf.float32) # boolean/integer gather is unavailable on GPU 168 | accuracy = tf.reduce_mean(correct, name='accuracy') 169 | fg_label_pred = tf.argmax(tf.gather(label_logits, fg_inds), axis=1) 170 | num_zero = tf.reduce_sum(tf.cast(tf.equal(fg_label_pred, 0), tf.int64), name='num_zero') 171 | false_negative = tf.where( 172 | empty_fg, 0., tf.cast(tf.truediv(num_zero, num_fg), tf.float32), name='false_negative') 173 | fg_accuracy = tf.where( 174 | empty_fg, 0., tf.reduce_mean(tf.gather(correct, fg_inds)), name='fg_accuracy') 175 | 176 | box_loss = tf.losses.huber_loss( 177 | fg_boxes, fg_box_logits, reduction=tf.losses.Reduction.SUM) 178 | box_loss *= cfg.FRCNN.BOX_LOSS_WEIGHTING_FACTOR 179 | box_loss = tf.truediv( 180 | box_loss, tf.cast(tf.shape(labels)[0], tf.float32), name='box_loss') 181 | 182 | add_moving_summary(label_loss, box_loss, accuracy, 183 | fg_accuracy, false_negative, tf.cast(num_fg, tf.float32, name='num_fg_label')) 184 | return [label_loss, box_loss] 185 | 186 | 187 | @under_name_scope() 188 | def fastrcnn_predictions(boxes, scores): 189 | """ 190 | Generate final results from predictions of all proposals. 
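    (Per class, detections are first filtered by cfg.TEST.RESULT_SCORE_THRESH and suppressed
    with NMS at cfg.TEST.FRCNN_NMS_THRESH; the survivors from all classes are then reduced to
    the top cfg.TEST.RESULTS_PER_IM by score.)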
191 | 192 | Args: 193 | boxes: n#classx4 floatbox in float32 194 | scores: nx#class 195 | 196 | Returns: 197 | boxes: Kx4 198 | scores: K 199 | labels: K 200 | """ 201 | assert boxes.shape[1] == cfg.DATA.NUM_CLASS 202 | assert scores.shape[1] == cfg.DATA.NUM_CLASS 203 | boxes = tf.transpose(boxes, [1, 0, 2])[1:, :, :] # #catxnx4 204 | scores = tf.transpose(scores[:, 1:], [1, 0]) # #catxn 205 | 206 | def f(X): 207 | """ 208 | prob: n probabilities 209 | box: nx4 boxes 210 | 211 | Returns: n boolean, the selection 212 | """ 213 | prob, box = X 214 | output_shape = tf.shape(prob, out_type=tf.int64) 215 | # filter by score threshold 216 | ids = tf.reshape(tf.where(prob > cfg.TEST.RESULT_SCORE_THRESH), [-1]) 217 | prob = tf.gather(prob, ids) 218 | box = tf.gather(box, ids) 219 | # NMS within each class 220 | nms_thres = cfg.TEST.FRCNN_NMS_THRESH 221 | selection = tf.image.non_max_suppression( 222 | box, prob, cfg.TEST.RESULTS_PER_IM, nms_thres) 223 | selection = tf.gather(ids, selection) 224 | 225 | if get_tf_version_tuple() >= (1, 13): 226 | sorted_selection = tf.sort(selection, direction='ASCENDING') 227 | mask = tf.sparse.SparseTensor(indices=tf.expand_dims(sorted_selection, 1), 228 | values=tf.ones_like(sorted_selection, dtype=tf.bool), 229 | dense_shape=output_shape) 230 | mask = tf.sparse.to_dense(mask, default_value=False) 231 | else: 232 | # this function is deprecated by TF 233 | sorted_selection = -tf.nn.top_k(-selection, k=tf.size(selection))[0] 234 | mask = tf.sparse_to_dense( 235 | sparse_indices=sorted_selection, 236 | output_shape=output_shape, 237 | sparse_values=True, 238 | default_value=False) 239 | return mask 240 | 241 | # TF bug in version 1.11, 1.12: https://github.com/tensorflow/tensorflow/issues/22750 242 | buggy_tf = get_tf_version_tuple() in [(1, 11), (1, 12)] 243 | masks = tf.map_fn(f, (scores, boxes), dtype=tf.bool, 244 | parallel_iterations=1 if buggy_tf else 10) # #cat x N 245 | selected_indices = tf.where(masks) # #selection x 2, each is (cat_id, box_id) 246 | scores = tf.boolean_mask(scores, masks) 247 | 248 | # filter again by sorting scores 249 | topk_scores, topk_indices = tf.nn.top_k( 250 | scores, 251 | tf.minimum(cfg.TEST.RESULTS_PER_IM, tf.size(scores)), 252 | sorted=False) 253 | filtered_selection = tf.gather(selected_indices, topk_indices) 254 | cat_ids, box_ids = tf.unstack(filtered_selection, axis=1) 255 | 256 | final_scores = tf.identity(topk_scores, name='scores') 257 | final_labels = tf.add(cat_ids, 1, name='labels') 258 | final_ids = tf.stack([cat_ids, box_ids], axis=1, name='all_ids') 259 | final_boxes = tf.gather_nd(boxes, final_ids, name='boxes') 260 | return final_boxes, final_scores, final_labels 261 | 262 | 263 | """ 264 | FastRCNN heads for FPN: 265 | """ 266 | 267 | 268 | @layer_register(log_shape=True) 269 | def fastrcnn_2fc_head(feature): 270 | """ 271 | Args: 272 | feature (any shape): 273 | 274 | Returns: 275 | 2D head feature 276 | """ 277 | dim = cfg.FPN.FRCNN_FC_HEAD_DIM 278 | init = tf.variance_scaling_initializer() 279 | hidden = FullyConnected('fc6', feature, dim, kernel_initializer=init, activation=tf.nn.relu) 280 | hidden = FullyConnected('fc7', hidden, dim, kernel_initializer=init, activation=tf.nn.relu) 281 | return hidden 282 | 283 | 284 | @layer_register(log_shape=True) 285 | def fastrcnn_Xconv1fc_head(feature, num_convs, norm=None): 286 | """ 287 | Args: 288 | feature (NCHW): 289 | num_classes(int): num_category + 1 290 | num_convs (int): number of conv layers 291 | norm (str or None): either None or 'GN' 292 | 293 | 
Returns: 294 | 2D head feature 295 | """ 296 | assert norm in [None, 'GN'], norm 297 | l = feature 298 | with argscope(Conv2D, data_format='channels_first', 299 | kernel_initializer=tf.variance_scaling_initializer( 300 | scale=2.0, mode='fan_out', 301 | distribution='untruncated_normal' if get_tf_version_tuple() >= (1, 12) else 'normal')): 302 | for k in range(num_convs): 303 | l = Conv2D('conv{}'.format(k), l, cfg.FPN.FRCNN_CONV_HEAD_DIM, 3, activation=tf.nn.relu) 304 | if norm is not None: 305 | l = GroupNorm('gn{}'.format(k), l) 306 | l = FullyConnected('fc', l, cfg.FPN.FRCNN_FC_HEAD_DIM, 307 | kernel_initializer=tf.variance_scaling_initializer(), activation=tf.nn.relu) 308 | return l 309 | 310 | 311 | def fastrcnn_4conv1fc_head(*args, **kwargs): 312 | return fastrcnn_Xconv1fc_head(*args, num_convs=4, **kwargs) 313 | 314 | 315 | def fastrcnn_4conv1fc_gn_head(*args, **kwargs): 316 | return fastrcnn_Xconv1fc_head(*args, num_convs=4, norm='GN', **kwargs) 317 | 318 | 319 | class BoxProposals(object): 320 | """ 321 | A structure to manage box proposals and their relations with ground truth. 322 | """ 323 | def __init__(self, boxes, labels=None, fg_inds_wrt_gt=None): 324 | """ 325 | Args: 326 | boxes: Nx4 327 | labels: N, each in [0, #class), the true label for each input box 328 | fg_inds_wrt_gt: #fg, each in [0, M) 329 | 330 | The last four arguments could be None when not training. 331 | """ 332 | for k, v in locals().items(): 333 | if k != 'self' and v is not None: 334 | setattr(self, k, v) 335 | 336 | @memoized_method 337 | def fg_inds(self): 338 | """ Returns: #fg indices in [0, N-1] """ 339 | return tf.reshape(tf.where(self.labels > 0), [-1], name='fg_inds') 340 | 341 | @memoized_method 342 | def fg_boxes(self): 343 | """ Returns: #fg x4""" 344 | return tf.gather(self.boxes, self.fg_inds(), name='fg_boxes') 345 | 346 | @memoized_method 347 | def fg_labels(self): 348 | """ Returns: #fg""" 349 | return tf.gather(self.labels, self.fg_inds(), name='fg_labels') 350 | 351 | 352 | class FastRCNNHead(object): 353 | """ 354 | A class to process & decode inputs/outputs of a fastrcnn classification+regression head. 355 | """ 356 | def __init__(self, proposals, box_logits, label_logits, gt_boxes, bbox_regression_weights): 357 | """ 358 | Args: 359 | proposals: BoxProposals 360 | box_logits: Nx#classx4 or Nx1x4, the output of the head 361 | label_logits: Nx#class, the output of the head 362 | gt_boxes: Mx4 363 | bbox_regression_weights: a 4 element tensor 364 | """ 365 | for k, v in locals().items(): 366 | if k != 'self' and v is not None: 367 | setattr(self, k, v) 368 | self._bbox_class_agnostic = int(box_logits.shape[1]) == 1 369 | 370 | @memoized_method 371 | def fg_box_logits(self): 372 | """ Returns: #fg x ? 
x 4 """ 373 | return tf.gather(self.box_logits, self.proposals.fg_inds(), name='fg_box_logits') 374 | 375 | @memoized_method 376 | def losses(self): 377 | encoded_fg_gt_boxes = encode_bbox_target( 378 | tf.gather(self.gt_boxes, self.proposals.fg_inds_wrt_gt), 379 | self.proposals.fg_boxes()) * self.bbox_regression_weights 380 | return fastrcnn_losses( 381 | self.proposals.labels, self.label_logits, 382 | encoded_fg_gt_boxes, self.fg_box_logits() 383 | ) 384 | 385 | @memoized_method 386 | def decoded_output_boxes(self): 387 | """ Returns: N x #class x 4 """ 388 | anchors = tf.tile(tf.expand_dims(self.proposals.boxes, 1), 389 | [1, cfg.DATA.NUM_CLASS, 1]) # N x #class x 4 390 | decoded_boxes = decode_bbox_target( 391 | self.box_logits / self.bbox_regression_weights, 392 | anchors 393 | ) 394 | return decoded_boxes 395 | 396 | @memoized_method 397 | def decoded_output_boxes_for_true_label(self): 398 | """ Returns: Nx4 decoded boxes """ 399 | return self._decoded_output_boxes_for_label(self.proposals.labels) 400 | 401 | @memoized_method 402 | def decoded_output_boxes_for_predicted_label(self): 403 | """ Returns: Nx4 decoded boxes """ 404 | return self._decoded_output_boxes_for_label(self.predicted_labels()) 405 | 406 | @memoized_method 407 | def decoded_output_boxes_for_label(self, labels): 408 | assert not self._bbox_class_agnostic 409 | indices = tf.stack([ 410 | tf.range(tf.size(labels, out_type=tf.int64)), 411 | labels 412 | ]) 413 | needed_logits = tf.gather_nd(self.box_logits, indices) 414 | decoded = decode_bbox_target( 415 | needed_logits / self.bbox_regression_weights, 416 | self.proposals.boxes 417 | ) 418 | return decoded 419 | 420 | @memoized_method 421 | def decoded_output_boxes_class_agnostic(self): 422 | """ Returns: Nx4 """ 423 | assert self._bbox_class_agnostic 424 | box_logits = tf.reshape(self.box_logits, [-1, 4]) 425 | decoded = decode_bbox_target( 426 | box_logits / self.bbox_regression_weights, 427 | self.proposals.boxes 428 | ) 429 | return decoded 430 | 431 | @memoized_method 432 | def output_scores(self, name=None): 433 | """ Returns: N x #class scores, summed to one for each box.""" 434 | return tf.nn.softmax(self.label_logits, name=name) 435 | 436 | @memoized_method 437 | def predicted_labels(self): 438 | """ Returns: N ints """ 439 | return tf.argmax(self.label_logits, axis=1, name='predicted_labels') 440 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: coco.py 3 | 4 | import numpy as np 5 | import random 6 | import os 7 | import tqdm 8 | import json 9 | import glob 10 | 11 | from tensorpack.utils import logger 12 | from tensorpack.utils.timer import timed_operation 13 | 14 | from config import config as cfg 15 | 16 | __all__ = ['COCODetection', 'DetectionDataset'] 17 | 18 | 19 | class COCODetection(object): 20 | # handle the weird (but standard) split of train and val 21 | _INSTANCE_TO_BASEDIR = { 22 | 'valminusminival2014': 'val2014', 23 | 'minival2014': 'val2014', 24 | } 25 | 26 | COCO_id_to_category_id = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 27: 25, 28: 26, 31: 27, 32: 28, 33: 29, 34: 30, 35: 31, 36: 32, 37: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 46: 41, 47: 42, 48: 43, 49: 44, 50: 45, 51: 46, 52: 47, 53: 48, 54: 49, 55: 50, 56: 51, 57: 52, 
58: 53, 59: 54, 60: 55, 61: 56, 62: 57, 63: 58, 64: 59, 65: 60, 67: 61, 70: 62, 72: 63, 73: 64, 74: 65, 75: 66, 76: 67, 77: 68, 78: 69, 79: 70, 80: 71, 81: 72, 82: 73, 84: 74, 85: 75, 86: 76, 87: 77, 88: 78, 89: 79, 90: 80} # noqa 27 | """ 28 | Mapping from the incontinuous COCO category id to an id in [1, #category] 29 | For your own dataset, this should usually be an identity mapping. 30 | """ 31 | 32 | # 80 names for COCO 33 | class_names = [ 34 | "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] # noqa 35 | 36 | def __init__(self, basedir, name): 37 | self.name = name 38 | self._imgdir = os.path.realpath(os.path.join( 39 | basedir, self._INSTANCE_TO_BASEDIR.get(name, name))) 40 | assert os.path.isdir(self._imgdir), self._imgdir 41 | annotation_file = os.path.join( 42 | basedir, 'annotations/instances_{}.json'.format(name)) 43 | assert os.path.isfile(annotation_file), annotation_file 44 | 45 | from pycocotools.coco import COCO 46 | self.coco = COCO(annotation_file) 47 | logger.info("Instances loaded from {}.".format(annotation_file)) 48 | 49 | # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb 50 | def print_coco_metrics(self, json_file): 51 | """ 52 | Args: 53 | json_file (str): path to the results json file in coco format 54 | Returns: 55 | dict: the evaluation metrics 56 | """ 57 | from pycocotools.cocoeval import COCOeval 58 | ret = {} 59 | cocoDt = self.coco.loadRes(json_file) 60 | cocoEval = COCOeval(self.coco, cocoDt, 'bbox') 61 | cocoEval.evaluate() 62 | cocoEval.accumulate() 63 | cocoEval.summarize() 64 | fields = ['IoU=0.5:0.95', 'IoU=0.5', 'IoU=0.75', 'small', 'medium', 'large'] 65 | for k in range(6): 66 | ret['mAP(bbox)/' + fields[k]] = cocoEval.stats[k] 67 | 68 | json_obj = json.load(open(json_file)) 69 | if len(json_obj) > 0 and 'segmentation' in json_obj[0]: 70 | cocoEval = COCOeval(self.coco, cocoDt, 'segm') 71 | cocoEval.evaluate() 72 | cocoEval.accumulate() 73 | cocoEval.summarize() 74 | for k in range(6): 75 | ret['mAP(segm)/' + fields[k]] = cocoEval.stats[k] 76 | return ret 77 | 78 | def load(self, add_gt=True, add_mask=False): 79 | """ 80 | Args: 81 | add_gt: whether to add ground truth bounding box annotations to the dicts 82 | add_mask: whether to also add ground truth mask 83 | 84 | Returns: 85 | a list of dict, each has keys including: 86 | 'height', 'width', 'id', 'file_name', 87 | and (if add_gt is True) 'boxes', 'class', 'is_crowd', and optionally 88 | 'segmentation'. 
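        'boxes' are absolute float32 coordinates in [x1, y1, x2, y2] format and
        'class' labels lie in [1, #category].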
89 | """ 90 | if add_mask: 91 | assert add_gt 92 | with timed_operation('Load Groundtruth Boxes for {}'.format(self.name)): 93 | img_ids = self.coco.getImgIds() 94 | img_ids.sort() 95 | # list of dict, each has keys: height,width,id,file_name 96 | imgs = self.coco.loadImgs(img_ids) 97 | 98 | for img in tqdm.tqdm(imgs): 99 | self._use_absolute_file_name(img) 100 | if add_gt: 101 | self._add_detection_gt(img, add_mask) 102 | return imgs 103 | 104 | def _use_absolute_file_name(self, img): 105 | """ 106 | Change relative filename to abosolute file name. 107 | """ 108 | img['file_name'] = os.path.join( 109 | self._imgdir, img['file_name']) 110 | assert os.path.isfile(img['file_name']), img['file_name'] 111 | 112 | def _add_detection_gt(self, img, add_mask): 113 | """ 114 | Add 'boxes', 'class', 'is_crowd' of this image to the dict, used by detection. 115 | If add_mask is True, also add 'segmentation' in coco poly format. 116 | """ 117 | # ann_ids = self.coco.getAnnIds(imgIds=img['id']) 118 | # objs = self.coco.loadAnns(ann_ids) 119 | objs = self.coco.imgToAnns[img['id']] # equivalent but faster than the above two lines 120 | 121 | # clean-up boxes 122 | valid_objs = [] 123 | width = img['width'] 124 | height = img['height'] 125 | for objid, obj in enumerate(objs): 126 | if obj.get('ignore', 0) == 1: 127 | continue 128 | x1, y1, w, h = obj['bbox'] 129 | # bbox is originally in float 130 | # x1/y1 means upper-left corner and w/h means true w/h. This can be verified by segmentation pixels. 131 | # But we do make an assumption here that (0.0, 0.0) is upper-left corner of the first pixel 132 | 133 | x1 = np.clip(float(x1), 0, width) 134 | y1 = np.clip(float(y1), 0, height) 135 | w = np.clip(float(x1 + w), 0, width) - x1 136 | h = np.clip(float(y1 + h), 0, height) - y1 137 | # Require non-zero seg area and more than 1x1 box size 138 | if obj['area'] > 1 and w > 0 and h > 0 and w * h >= 4: 139 | obj['bbox'] = [x1, y1, x1 + w, y1 + h] 140 | valid_objs.append(obj) 141 | 142 | if add_mask: 143 | segs = obj['segmentation'] 144 | if not isinstance(segs, list): 145 | assert obj['iscrowd'] == 1 146 | obj['segmentation'] = None 147 | else: 148 | valid_segs = [np.asarray(p).reshape(-1, 2).astype('float32') for p in segs if len(p) >= 6] 149 | if len(valid_segs) == 0: 150 | logger.error("Object {} in image {} has no valid polygons!".format(objid, img['file_name'])) 151 | elif len(valid_segs) < len(segs): 152 | logger.warn("Object {} in image {} has invalid polygons!".format(objid, img['file_name'])) 153 | 154 | obj['segmentation'] = valid_segs 155 | 156 | # all geometrically-valid boxes are returned 157 | boxes = np.asarray([obj['bbox'] for obj in valid_objs], dtype='float32') # (n, 4) 158 | cls = np.asarray([ 159 | self.COCO_id_to_category_id[obj['category_id']] 160 | for obj in valid_objs], dtype='int32') # (n,) 161 | is_crowd = np.asarray([obj['iscrowd'] for obj in valid_objs], dtype='int8') 162 | 163 | # add the keys 164 | img['boxes'] = boxes # nx4 165 | img['class'] = cls # n, always >0 166 | img['is_crowd'] = is_crowd # n, 167 | if add_mask: 168 | # also required to be float32 169 | img['segmentation'] = [ 170 | obj['segmentation'] for obj in valid_objs] 171 | 172 | @staticmethod 173 | def load_many(basedir, names, add_gt=True, add_mask=False): 174 | """ 175 | Load and merges several instance files together. 176 | 177 | Returns the same format as :meth:`COCODetection.load`. 
178 | """ 179 | if not isinstance(names, (list, tuple)): 180 | names = [names] 181 | ret = [] 182 | for n in names: 183 | coco = COCODetection(basedir, n) 184 | ret.extend(coco.load(add_gt, add_mask=add_mask)) 185 | return ret 186 | 187 | 188 | if cfg.DATA.IMAGENET_VID or cfg.DATA.DAVIS2017 or cfg.DATA.GOT10K or cfg.DATA.TRACKINGNET or cfg.DATA.COCO \ 189 | or cfg.DATA.YOUTUBE_BB or cfg.DATA.DAVIS_LUCID or cfg.DATA.LASOT: 190 | 191 | def calculate_ious(bboxes1, bboxes2): 192 | # assume layout (x0, y0, x1, y1) 193 | min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :]) 194 | max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :]) 195 | I = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(min_[..., 3] - max_[..., 1], 0) 196 | area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1]) 197 | area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1]) 198 | U = area1[:, np.newaxis] + area2[np.newaxis, :] - I 199 | assert (U > 0).all() 200 | IOUs = I / U 201 | assert (IOUs >= 0).all() 202 | assert (IOUs <= 1).all() 203 | return IOUs 204 | 205 | class DetectionDataset(object): 206 | occluders = None 207 | coco = None 208 | coco_anns = None 209 | 210 | def __init__(self): 211 | """ 212 | This function is responsible for setting the dataset-specific 213 | attributes in both cfg and self. 214 | """ 215 | # we do it category agnostic, so only foreground and background 216 | #self.num_category = cfg.DATA.NUM_CATEGORY = 1 217 | self.num_category = cfg.DATA.NUM_CATEGORY 218 | cfg.DATA.TRAIN = ["train"] 219 | cfg.DATA.VAL = ["val"] 220 | self.num_classes = self.num_category + 1 221 | self.class_names = cfg.DATA.CLASS_NAMES = ["BG", "FG"] 222 | 223 | def _load_roidb_imagenet_vid(self, subset): 224 | imageset_postfix = "ImageSets/VID/" + subset + ".txt" 225 | imagesets_file = os.path.join(cfg.DATA.IMAGENET_VID_ROOT, imageset_postfix) 226 | vid_names = set() 227 | with open(imagesets_file) as f: 228 | for l in f: 229 | sp = l.split("/") 230 | vid_name = sp[0] + "/" + sp[1] 231 | vid_names.add(vid_name) 232 | vid_names = list(vid_names) 233 | return vid_names 234 | 235 | def _load_roidb_davis(self, subset): 236 | imagesets_file = os.path.join(cfg.DATA.DAVIS2017_ROOT, "ImageSets", "2017", subset + ".txt") 237 | vid_names = [] 238 | with open(imagesets_file) as f: 239 | for l in f: 240 | vid_name = l.strip() 241 | vid_names.append(vid_name) 242 | return vid_names 243 | 244 | def _load_roidb_davis_lucid(self, subset): 245 | vid_names = sorted(glob.glob(cfg.DATA.DAVIS_LUCID_ROOT + "*/*/")) 246 | vid_names = ['/'.join(v.split("/")[-3:]) for v in vid_names] 247 | 248 | if cfg.TRACK_VIDEO_ID is not None: 249 | vid_names = sorted(glob.glob(cfg.DATA.DAVIS_LUCID_ROOT + "test-challenge/*/")) 250 | vid_names = ['/'.join(v.split("/")[-3:]) for v in vid_names] 251 | vid_names = [vid_names[cfg.TRACK_VIDEO_ID]] 252 | 253 | print("!!!!!!!!!!!!!!!ONLY DOING: ", vid_names[0], "!!!!!!!!!!!!!!!!!!!!!!!!!") 254 | 255 | # vid_names = ['test-challenge/speed-skating/'] 256 | 257 | return vid_names 258 | 259 | def _load_roidb_youtubevos(self, subset): 260 | meta_file = os.path.join(cfg.DATA.YOUTUBE_VOS_ROOT, subset, "meta.json") 261 | with open(meta_file) as f: 262 | metadata = json.load(f) 263 | vid_names = list(metadata["videos"].keys()) 264 | return vid_names 265 | 266 | def _load_roidb_got10k(self, subset): 267 | vid_names = [] 268 | with open(os.path.join(cfg.DATA.GOT10K_ROOT, 'train/list.txt')) as f: 269 | for l in f: 270 | 
vid_names.append(l.strip()) 271 | assert len(vid_names) > 0 272 | return vid_names 273 | 274 | def _load_roidb_lasot(self, subset): 275 | vid_names = [] 276 | with open(os.path.join(cfg.DATA.LASOT_ROOT, 'training_set.txt')) as f: 277 | for l in f: 278 | vid_names.append(l.strip()) 279 | assert len(vid_names) > 0 280 | return vid_names 281 | 282 | def _load_roidb_youtube_bb(self, subset): 283 | clips_fn = os.path.join(cfg.DATA.YOUTUBE_BB_ROOT, "sets", "clips.txt") 284 | roidbs = [] 285 | with open(clips_fn) as f: 286 | for l in f: 287 | roidbs.append(l.strip()) 288 | return roidbs 289 | 290 | def _load_roidb_trackingnet(self, subset): 291 | gt_files = glob.glob(os.path.join(cfg.DATA.TRACKINGNET_ROOT, "TRAIN*", "anno", "*.txt")) 292 | vid_names = [x.split("/")[-3] + "____" + x.split("/")[-1].replace(".txt", "") for x in gt_files] 293 | return vid_names 294 | 295 | def _load_roidb(self, subset): 296 | vid_names = [] 297 | if cfg.DATA.IMAGENET_VID: 298 | logger.info("using imagenet vid") 299 | vid_names_imgnet = self._load_roidb_imagenet_vid(subset) 300 | vid_names_imgnet = ["VID/" + x for x in vid_names_imgnet] 301 | vid_names += vid_names_imgnet 302 | if cfg.DATA.DAVIS2017: 303 | logger.info("using davis2017") 304 | vid_names_davis = self._load_roidb_davis(subset) 305 | vid_names_davis = ["DAVIS/" + x for x in vid_names_davis] 306 | vid_names += vid_names_davis 307 | if cfg.DATA.YOUTUBE_VOS: 308 | logger.info("using YouTube-VOS") 309 | vid_names_youtubevos = self._load_roidb_youtubevos(subset) 310 | vid_names_youtubevos = ["YouTubeVOS/" + x for x in vid_names_youtubevos] 311 | vid_names += vid_names_youtubevos 312 | if cfg.DATA.GOT10K: 313 | logger.info("using GOT10K") 314 | vid_names_got = self._load_roidb_got10k(subset) 315 | vid_names_got = ["GOT10K/" + x for x in vid_names_got] 316 | vid_names += vid_names_got 317 | if cfg.DATA.LASOT: 318 | logger.info("using LaSOT") 319 | vid_names_lasot = self._load_roidb_lasot(subset) 320 | vid_names_lasot = ["LaSOT/" + x for x in vid_names_lasot] 321 | vid_names += vid_names_lasot 322 | if cfg.DATA.YOUTUBE_BB: 323 | logger.info("using YouTube-BB") 324 | vid_names_youtube_bb = self._load_roidb_youtube_bb(subset) 325 | vid_names_youtube_bb = ["YouTube-BB/" + x for x in vid_names_youtube_bb] 326 | # duplicate all other vid names in order to sample them more often (YouTube-BB is very large, 300k clips) 327 | vid_names *= 60 328 | vid_names += vid_names_youtube_bb 329 | if cfg.DATA.TRACKINGNET: 330 | logger.info("using TrackingNet") 331 | vid_names_trackingnet = self._load_roidb_trackingnet(subset) 332 | vid_names_trackingnet = ["TrackingNet/" + x for x in vid_names_trackingnet] 333 | # duplicate all other vid names in order to sample them more often (trackingnet is very large) 334 | vid_names *= 2 335 | vid_names += vid_names_trackingnet 336 | random.shuffle(vid_names) 337 | return vid_names 338 | 339 | def load_training_roidbs(self, names): 340 | """ 341 | Args: 342 | names (list[str]): name of the training datasets, e.g. ['train2014', 'valminusminival2014'] 343 | 344 | Returns: 345 | roidbs (list[dict]): 346 | 347 | Produce "roidbs" as a list of dict, each dict corresponds to one image with k>=0 instances. 348 | and the following keys are expected for training: 349 | 350 | height, width: integer 351 | file_name: str, full path to the image 352 | boxes: numpy array of kx4 floats, each row is [x1, y1, x2, y2] 353 | category: numpy array of k integers, in the range of [1, #categories] 354 | is_crowd: k booleans. 
Use k False if you don't know what it means. 355 | segmentation: k lists of numpy arrays (one for each instance). 356 | Each list of numpy arrays corresponds to the mask for one instance. 357 | Each numpy array in the list is a polygon of shape Nx2, 358 | because one mask can be represented by N polygons. 359 | 360 | If your segmentation annotations are originally masks rather than polygons, 361 | either convert it, or the augmentation will need to be changed or skipped accordingly. 362 | 363 | Include this field only if training Mask R-CNN. 364 | """ 365 | return self._load_roidb("train") 366 | 367 | def load_inference_roidbs(self, name): 368 | """ 369 | Args: 370 | name (str): name of one inference dataset, e.g. 'minival2014' 371 | 372 | Returns: 373 | roidbs (list[dict]): 374 | 375 | Each dict corresponds to one image to run inference on. The 376 | following keys in the dict are expected: 377 | 378 | file_name (str): full path to the image 379 | id (str): an id for the image. The inference results will be stored with this id. 380 | """ 381 | return self._load_roidb("val") 382 | 383 | def eval_or_save_inference_results(self, results, dataset, output=None): 384 | ious_at_k = [[] for _ in range(10)] 385 | ious_per_obj = {} 386 | # results.sort(key=lambda x: x.gt_file) 387 | for r in results: 388 | gt_file, res, target_box = r 389 | seq, obj_id, timestep = gt_file.split('__') 390 | obj_name = seq + "__" + obj_id 391 | res.sort(key=lambda x: x.score, reverse=True) 392 | max_iou = 0.0 393 | if obj_name not in ious_per_obj.keys(): 394 | ious_per_obj[obj_name] = [[] for _ in range(10)] 395 | 396 | for k in range(10): 397 | if len(res) > k: 398 | det = res[k] 399 | det_box = det.box 400 | iou = calculate_ious(target_box[np.newaxis], det_box[np.newaxis])[0, 0] 401 | max_iou = max(max_iou, iou) 402 | if k == 0: 403 | best_box = det_box 404 | ious_per_obj[obj_name][k].append(max_iou) 405 | # print(seq,obj_id,timestep,target_box, best_box, ious_per_obj[obj_name][0][-1]) 406 | 407 | for obj_name in ious_per_obj.keys(): 408 | for k in range(10): 409 | ious_at_k[k].append(np.mean(ious_per_obj[obj_name][k])) 410 | print(obj_name, np.mean(ious_per_obj[obj_name][0])) 411 | 412 | eval_res = {"miou@" + str(k + 1): np.mean(ious_at_k[k]) for k in range(10)} 413 | print(eval_res) 414 | return eval_res 415 | 416 | # code for singleton: 417 | _instance = None 418 | 419 | def __new__(cls): 420 | if not isinstance(cls._instance, cls): 421 | cls._instance = object.__new__(cls) 422 | return cls._instance 423 | else: 424 | class DetectionDataset(object): 425 | """ 426 | A singleton to load datasets, evaluate results, and provide metadata. 427 | 428 | To use your own dataset that's not in COCO format, rewrite all methods of this class. 429 | """ 430 | def __init__(self): 431 | """ 432 | This function is responsible for setting the dataset-specific 433 | attributes in both cfg and self. 434 | """ 435 | self.num_category = cfg.DATA.NUM_CATEGORY = len(COCODetection.class_names) 436 | self.num_classes = self.num_category + 1 437 | self.class_names = cfg.DATA.CLASS_NAMES = ["BG"] + COCODetection.class_names 438 | 439 | def load_training_roidbs(self, names): 440 | """ 441 | Args: 442 | names (list[str]): name of the training datasets, e.g. ['train2014', 'valminusminival2014'] 443 | 444 | Returns: 445 | roidbs (list[dict]): 446 | 447 | Produce "roidbs" as a list of dict, each dict corresponds to one image with k>=0 instances. 
448 | and the following keys are expected for training: 449 | 450 | height, width: integer 451 | file_name: str, full path to the image 452 | boxes: numpy array of kx4 floats, each row is [x1, y1, x2, y2] 453 | category: numpy array of k integers, in the range of [1, #categories] 454 | is_crowd: k booleans. Use k False if you don't know what it means. 455 | segmentation: k lists of numpy arrays (one for each instance). 456 | Each list of numpy arrays corresponds to the mask for one instance. 457 | Each numpy array in the list is a polygon of shape Nx2, 458 | because one mask can be represented by N polygons. 459 | 460 | If your segmentation annotations are originally masks rather than polygons, 461 | either convert it, or the augmentation will need to be changed or skipped accordingly. 462 | 463 | Include this field only if training Mask R-CNN. 464 | """ 465 | return COCODetection.load_many( 466 | cfg.DATA.BASEDIR, cfg.DATA.TRAIN, add_gt=True, add_mask=cfg.MODE_MASK) 467 | 468 | def load_inference_roidbs(self, name): 469 | """ 470 | Args: 471 | name (str): name of one inference dataset, e.g. 'minival2014' 472 | 473 | Returns: 474 | roidbs (list[dict]): 475 | 476 | Each dict corresponds to one image to run inference on. The 477 | following keys in the dict are expected: 478 | 479 | file_name (str): full path to the image 480 | id (str): an id for the image. The inference results will be stored with this id. 481 | """ 482 | return COCODetection.load_many(cfg.DATA.BASEDIR, name, add_gt=False) 483 | 484 | def eval_or_save_inference_results(self, results, dataset, output=None): 485 | """ 486 | Args: 487 | results (list[dict]): the inference results as dicts. 488 | Each dict corresponds to one __instance__. It contains the following keys: 489 | 490 | image_id (str): the id that matches `load_inference_roidbs`. 491 | category_id (int): the category prediction, in range [1, #category] 492 | bbox (list[float]): x1, y1, x2, y2 493 | score (float): 494 | segmentation: the segmentation mask in COCO's rle format. 495 | 496 | dataset (str): the name of the dataset to evaluate. 497 | output (str): the output file to optionally save the results to. 498 | 499 | Returns: 500 | dict: the evaluation results. 501 | """ 502 | continuous_id_to_COCO_id = {v: k for k, v in COCODetection.COCO_id_to_category_id.items()} 503 | for res in results: 504 | # convert to COCO's incontinuous category id 505 | res['category_id'] = continuous_id_to_COCO_id[res['category_id']] 506 | # COCO expects results in xywh format 507 | box = res['bbox'] 508 | box[2] -= box[0] 509 | box[3] -= box[1] 510 | res['bbox'] = [round(float(x), 3) for x in box] 511 | 512 | assert output is not None, "COCO evaluation requires an output file!" 513 | with open(output, 'w') as f: 514 | json.dump(results, f) 515 | if len(output): 516 | # sometimes may crash if the results are empty? 517 | return COCODetection(cfg.DATA.BASEDIR, dataset).print_coco_metrics(output) 518 | else: 519 | return {} 520 | 521 | # code for singleton: 522 | _instance = None 523 | 524 | def __new__(cls): 525 | if not isinstance(cls._instance, cls): 526 | cls._instance = object.__new__(cls) 527 | return cls._instance 528 | 529 | 530 | if __name__ == '__main__': 531 | c = COCODetection(cfg.DATA.BASEDIR, 'train2014') 532 | gt_boxes = c.load(add_gt=True, add_mask=True) 533 | print("#Images:", len(gt_boxes)) 534 | --------------------------------------------------------------------------------