├── .gitignore
├── README.md
├── _figs
│   ├── overview.png
│   └── plots.png
├── cfgs
│   ├── __init__.py
│   ├── cfg_res18.py
│   ├── cfg_res50.py
│   └── cfg_test.py
├── ckpt
│   └── __init__.py
├── dict
│   └── lasot_dict_test.npy
├── model
│   ├── __init__.py
│   ├── box_utils.py
│   ├── boxes.py
│   ├── build_model.py
│   ├── cbam.py
│   ├── context.py
│   ├── fcos.py
│   ├── focal_loss.py
│   ├── loss.py
│   ├── non_local.py
│   ├── rcnn_module.py
│   ├── resnet.py
│   └── rpn_module.py
├── output
│   └── __init__.py
├── test_tracker.py
├── th_utils.py
├── track_utils.py
├── tracker.py
├── tracker_batch.py
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .temp/
2 | .ipynb_checkpoints/
3 | *.pyc
4 | *.pth
5 | *.txt
6 | *.tar
7 | *.mp4
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Visual Tracking by TridentAlign and Context Embedding (TACT)
2 |
3 | Test code for "Visual Tracking by TridentAlign and Context Embedding"
4 |
5 | #### Janghoon Choi, Junseok Kwon, and Kyoung Mu Lee
6 |
7 | [arXiv paper](https://arxiv.org/pdf/2007.06887.pdf)
8 |
9 | ## Overall Framework
10 |
11 | ![Overall framework](_figs/overview.png)
12 |
13 |
14 | ## Results on LaSOT test set
15 |
16 | ![Success and precision plots on the LaSOT test set](_figs/plots.png)
17 |
18 | - Link to [LaSOT dataset](https://cis.temple.edu/lasot/)
19 | - Raw results available on [Google drive](https://drive.google.com/drive/folders/1ShAPX-ho-b_JjEenPCjzy1m4UN-ooSot?usp=sharing)
20 |
21 |
22 | ## Dependencies
23 |
24 | - Ubuntu 18.04
25 | - Python==2.7.17
26 | - numpy==1.16.5
27 | - pytorch==1.3.0
28 | - matplotlib==2.2.4
29 | - opencv==4.1.0.25
30 | - moviepy==1.0.0
31 | - tqdm==4.32.1
32 |
33 |
34 | ## Usage
35 |
36 | ### Prerequisites
37 |
38 | - Download network weights from [Google drive](https://drive.google.com/drive/folders/1ShAPX-ho-b_JjEenPCjzy1m4UN-ooSot?usp=sharing)
39 | - Copy network weight files `ckpt_res18.tar` and `ckpt_res50.tar` to `ckpt/` folder
40 | - Choose between `TACT-18` and `TACT-50` by modifying the `cfgs/cfg_test.py` file (default: `TACT-50`)
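
For example, switching to `TACT-18` amounts to a two-line edit in `cfgs/cfg_test.py`. This is only a sketch; the exact `CHKPT_CODE` string is an assumption and should match the naming of the downloaded checkpoint file:

``` python
# cfgs/cfg_test.py (sketch): select the ResNet-18 backbone variant (TACT-18)
from cfg_res18 import *   # instead of: from cfg_res50 import *

CHKPT_PATH = 'ckpt/'
CHKPT_CODE = 'res18'      # assumed to correspond to ckpt_res18.tar
```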
41 |
42 | ### To test tracker on LaSOT test set
43 |
44 | - Download LaSOT dataset from [link](https://cis.temple.edu/lasot/)
45 | - Modify the `cfgs/cfg_test.py` file so that it points to your local `LaSOTBenchmark` folder path (see the sketch below)
46 | - Run `python test_tracker.py`
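
Concretely, the `db_info` entry in `cfgs/cfg_test.py` should point to your local copy of the dataset (only the path changes):

``` python
# cfgs/cfg_test.py (sketch): point the LaSOT entry to the local dataset folder
db_info = dict()
db_info['lasot'] = {'size': 280,
                    'path' : '/your/local/path/to/LaSOTBenchmark/',
                    'dict' : 'dict/lasot_dict_test.npy'}
```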
47 |
48 | ### To test tracker on an arbitrary sequence
49 |
50 | - Using the `run_track_seq()` function in `tracker_batch.py`, the tracker can be run on an arbitrary sequence
51 | - Provide the function with the following variables (see the sketch after this list)
52 | - `seq_name` : name of the given sequence
53 | - `seq_path` : path to the given sequence
54 | - `seq_imlist` : list of image file names of the given sequence
55 | - `seq_gt` : ground truth box annotations of the given sequence (may only contain annotation for initial frame, `[x_min,y_min,width,height]` format)
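
A minimal calling sketch is shown below; the import and positional argument order are assumptions, so check `run_track_seq()` in `tracker_batch.py` for the actual signature:

``` python
# Sketch only: assumes run_track_seq() is a module-level function taking these arguments in this order.
import os
import numpy as np
from tracker_batch import run_track_seq

seq_name = 'my_sequence'                     # name of the given sequence
seq_path = '/path/to/my_sequence/img/'       # path to the sequence frames
seq_imlist = sorted(os.listdir(seq_path))    # list of image file names
seq_gt = np.array([[100., 150., 80., 60.]])  # [x_min, y_min, width, height] for the initial frame

run_track_seq(seq_name, seq_path, seq_imlist, seq_gt)
```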
56 |
57 | ### Raw results on other datasets
58 |
59 | - Link to raw results on [Google drive](https://drive.google.com/drive/folders/1ShAPX-ho-b_JjEenPCjzy1m4UN-ooSot?usp=sharing)
60 | - Results for test sets of [LaSOT](https://cis.temple.edu/lasot/), [OxUvA](https://oxuva.github.io/long-term-tracking-benchmark/), [GOT-10k](http://got-10k.aitestunion.com/), [TrackingNet](https://tracking-net.org/)
61 |
62 |
63 | ## Citation
64 |
65 | If you find our work useful for your research, please consider citing the following paper:
66 |
67 | ``` text
68 | @article{choi2020tact,
69 | title={Visual tracking by tridentalign and context embedding},
70 | author={Choi, Janghoon and Kwon, Junseok and Lee, Kyoung Mu},
71 | journal={arXiv preprint arXiv:2007.06887},
72 | year={2020}
73 | }
74 | ```
75 |
76 |
77 |
--------------------------------------------------------------------------------
/_figs/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/_figs/overview.png
--------------------------------------------------------------------------------
/_figs/plots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/_figs/plots.png
--------------------------------------------------------------------------------
/cfgs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/cfgs/__init__.py
--------------------------------------------------------------------------------
/cfgs/cfg_res18.py:
--------------------------------------------------------------------------------
1 | # define training flags
2 | EXP_COMMENT = 'res18_final_model'
3 |
4 | # define training parameters
5 | im_size = (400, 666) # image max sizes (height, width)
6 | batch_size = 4 # batch size for training
7 | batch_size_val = 8 # batch size for validation
8 |
9 | name_bbnet = 'resnet18' # choose backbone : [resnet18, resnet34, resnet50, wide_resnet50_2, resnext50_32x4d]
10 | conv_npool = (2,3,4,4) # numof pooling for each output for backbone network (default:[2,3,4,5])
11 | roip_size = 5 # spatial sizeof roi-aligned features (default:7x7)
12 | head_nconv = 2 # numof conv layers for detection heads
13 | head_nfeat = 256 # channel dim. for feature maps in detection heads
14 | head_nlocl = True # use or not use nonlocal layer (embedded gaussian)
15 | head_dconv = True # use or not use dilated convs
16 | head_negff = False # use or not use negative feats for final scoring
17 | head_oproi = False # use or not use roi overlap prediction branch
18 | head_ctxff = (True, 3) # use or not use context feature fusion + fusion scheme number (0:cat,1:add,2:cbam,3:film)
19 | bbox_thres = (0.5, 0.4) # bbox thresholds for pos/neg samples for training
20 | nms_param = (0.90, 64) # nms params (overlap_threshold, num_candidate_boxes)
21 | nft_param = (0.4, 6) # negative feat param (overlap_threshold, num_negative_boxes)
22 |
23 | num_epochs = int(1e+3) # numof training epochs
24 | training_iter = int(1e+5) # numof training iterations per epoch
25 | lr_start = 1e-4 # learning rate (initial)
26 | lr_decay = 0.50 # learning rate decay rate per loop
27 | lr_decay_step = 2000000 # learning rate decay steps
28 | w_decay = 1e-5 # weight decay rate for optimizer
29 | loss_lambda = 1.00 # balancing term for loss function (cls + lambda*reg)
30 | loss_gamma = 2.00 # focal loss gamma value (penalty on easy examples)
31 | loss_alpha = None # focal loss alpha value (pos/neg example balancing)
32 |
33 |
34 | # ===== PATH variables =====
35 | # checkpoint/init path + experiment number
36 | CHKPT_PATH, INITP_PATH = 'ckpt/', 'init/init_res18_weights.tar'
37 | CHKPT_CODE = ''
38 | # validation set dump path
39 | VALID_PATH = '/home/jhchoi/datasets3/track_valid_set_fcos.npz'
40 |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/cfgs/cfg_res50.py:
--------------------------------------------------------------------------------
1 | # define training flags
2 | EXP_COMMENT = 'res50_final_model'
3 |
4 | # define training parameters
5 | im_size = (400, 666) # image max sizes (height, width)
6 | batch_size = 4 # batch size for training
7 | batch_size_val = 8 # batch size for validation
8 |
9 | name_bbnet = 'resnet50' # choose backbone : [resnet18, resnet34, resnet50, wide_resnet50_2, resnext50_32x4d]
10 | conv_npool = (2,3,4,4) # numof pooling for each output for backbone network (default:[2,3,4,5])
11 | roip_size = 5 # spatial sizeof roi-aligned features (default:7x7)
12 | head_nconv = 2 # numof conv layers for detection heads
13 | head_nfeat = 256 # channel dim. for feature maps in detection heads
14 | head_nlocl = True # use or not use nonlocal layer (embedded gaussian)
15 | head_dconv = True # use or not use dilated convs
16 | head_negff = False # use or not use negative feats for final scoring
17 | head_oproi = False # use or not use roi overlap prediction branch
18 | head_ctxff = (True, 3) # use or not use context feature fusion + fusion scheme number (0:cat,1:add,2:cbam,3:film)
19 | bbox_thres = (0.5, 0.4) # bbox thresholds for pos/neg samples for training
20 | nms_param = (0.90, 64) # nms params (overlap_threshold, num_candidate_boxes)
21 | nft_param = (0.4, 6) # negative feat param (overlap_threshold, num_negative_boxes)
22 |
23 | num_epochs = int(1e+3) # numof training epochs
24 | training_iter = int(1e+5) # numof training iterations per epoch
25 | lr_start = 1e-4 # learning rate (initial)
26 | lr_decay = 0.50 # learning rate decay rate per loop
27 | lr_decay_step = 2000000 # learning rate decay steps
28 | w_decay = 1e-5 # weight decay rate for optimizer
29 | loss_lambda = 1.00 # balancing term for loss function (cls + lambda*reg)
30 | loss_gamma = 2.00 # focal loss gamma value (penalty on easy examples)
31 | loss_alpha = None # focal loss alpha value (pos/neg example balancing)
32 |
33 |
34 | # ===== PATH variables =====
35 | # checkpoint/init path + experiment number
36 | CHKPT_PATH, INITP_PATH = 'ckpt/', 'init/init_res50_weights.tar'
37 | CHKPT_CODE = ''
38 | # validation set dump path
39 | VALID_PATH = '/home/jhchoi/datasets3/track_valid_set_fcos.npz'
40 |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/cfgs/cfg_test.py:
--------------------------------------------------------------------------------
1 | from cfg_res50 import *
2 |
3 |
4 | # ===== PATH variables =====
5 | # checkpoint path + experiment number
6 | CHKPT_PATH = 'ckpt/'
7 | CHKPT_CODE = 'res50'
8 |
9 |
10 | # construct dataset info dict
11 | db_info = dict()
12 | # test sets
13 | db_info['lasot'] = {'size': 280,
14 | 'path' : '/home/jhchoi/datasets5/LaSOTBenchmark/',
15 | 'dict' : 'dict/lasot_dict_test.npy'}
16 |
17 |
--------------------------------------------------------------------------------
/ckpt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/ckpt/__init__.py
--------------------------------------------------------------------------------
/dict/lasot_dict_test.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/dict/lasot_dict_test.npy
--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/model/__init__.py
--------------------------------------------------------------------------------
/model/box_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import torch
3 |
4 | # code from https://github.com/amdegroot/ssd.pytorch
5 |
6 | def point_form(boxes):
7 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax)
8 | representation for comparison to point form ground truth data.
9 | Args:
10 | boxes: (tensor) center-size default boxes from priorbox layers.
11 | Return:
12 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes.
13 | """
14 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin
15 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax
16 |
17 |
18 | def center_size(boxes):
19 | """ Convert prior_boxes to (cx, cy, w, h)
20 | representation for comparison to center-size form ground truth data.
21 | Args:
22 | boxes: (tensor) point_form boxes
23 | Return:
24 | boxes: (tensor) Converted (cx, cy, w, h) form of boxes.
25 | """
26 | return torch.cat(((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy
27 | boxes[:, 2:] - boxes[:, :2]), 1) # w, h
28 |
29 |
30 | def intersect(box_a, box_b):
31 | """ We resize both tensors to [A,B,2] without new malloc:
32 | [A,2] -> [A,1,2] -> [A,B,2]
33 | [B,2] -> [1,B,2] -> [A,B,2]
34 | Then we compute the area of intersect between box_a and box_b.
35 | Args:
36 | box_a: (tensor) bounding boxes, Shape: [A,4].
37 | box_b: (tensor) bounding boxes, Shape: [B,4].
38 | Return:
39 | (tensor) intersection area, Shape: [A,B].
40 | """
41 | A = box_a.size(0)
42 | B = box_b.size(0)
43 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
44 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
45 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
46 | box_b[:, :2].unsqueeze(0).expand(A, B, 2))
47 | inter = torch.clamp((max_xy - min_xy), min=0)
48 | return inter[:, :, 0] * inter[:, :, 1]
49 |
50 |
51 | def jaccard(box_a, box_b, eps=0.):
52 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
53 | is simply the intersection over union of two boxes. Here we operate on
54 | ground truth boxes and default boxes.
55 | E.g.:
56 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
57 | Args:
58 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
59 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
60 | Return:
61 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
62 | """
63 | inter = intersect(box_a, box_b)
64 | area_a = ((box_a[:, 2]-box_a[:, 0]) *
65 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
66 | area_b = ((box_b[:, 2]-box_b[:, 0]) *
67 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
68 | union = area_a + area_b - inter
69 | return (inter + eps) / (union + eps) # [A,B]
70 |
71 |
72 | def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx):
73 | """Match each prior box with the ground truth box of the highest jaccard
74 | overlap, encode the bounding boxes, then return the matched indices
75 | corresponding to both confidence and location preds.
76 | Args:
77 | threshold: (float) The overlap threshold used when matching boxes.
78 | truths: (tensor) Ground truth boxes, Shape: [num_obj, 4].
79 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4].
80 | variances: (tensor) Variances corresponding to each prior coord,
81 | Shape: [num_priors, 4].
82 | labels: (tensor) All the class labels for the image, Shape: [num_obj].
83 | loc_t: (tensor) Tensor to be filled w/ encoded location targets.
84 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds.
85 | idx: (int) current batch index
86 | Return:
87 | The matched indices corresponding to 1)location and 2)confidence preds.
88 | """
89 | # jaccard index
90 | overlaps = jaccard(
91 | truths,
92 | point_form(priors)
93 | )
94 | # (Bipartite Matching)
95 | # [1,num_objects] best prior for each ground truth
96 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True)
97 | # [1,num_priors] best ground truth for each prior
98 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True)
99 | best_truth_idx.squeeze_(0)
100 | best_truth_overlap.squeeze_(0)
101 | best_prior_idx.squeeze_(1)
102 | best_prior_overlap.squeeze_(1)
103 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior
104 | # TODO refactor: index best_prior_idx with long tensor
105 | # ensure every gt matches with its prior of max overlap
106 | for j in range(best_prior_idx.size(0)):
107 | best_truth_idx[best_prior_idx[j]] = j
108 | matches = truths[best_truth_idx] # Shape: [num_priors,4]
109 | conf = labels[best_truth_idx] + 1 # Shape: [num_priors]
110 | conf[best_truth_overlap < threshold] = 0 # label as background
111 | loc = encode(matches, priors, variances)
112 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn
113 | conf_t[idx] = conf # [num_priors] top class label for each prior
114 |
115 |
116 | def encode(matched, priors, variances):
117 | """Encode the variances from the priorbox layers into the ground truth boxes
118 | we have matched (based on jaccard overlap) with the prior boxes.
119 | Args:
120 | matched: (tensor) Coords of ground truth for each prior in point-form
121 | Shape: [num_priors, 4].
122 | priors: (tensor) Prior boxes in center-offset form
123 | Shape: [num_priors,4].
124 | variances: (list[float]) Variances of priorboxes
125 | Return:
126 | encoded boxes (tensor), Shape: [num_priors, 4]
127 | """
128 |
129 | # dist b/t match center and prior's center
130 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2]
131 | # encode variance
132 | g_cxcy /= (variances[0] * priors[:, 2:])
133 | # match wh / prior wh
134 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
135 | g_wh = torch.log(g_wh) / variances[1]
136 | # return target for smooth_l1_loss
137 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4]
138 |
139 |
140 | # Adapted from https://github.com/Hakuyume/chainer-ssd
141 | def decode(loc, priors, variances):
142 | """Decode locations from predictions using priors to undo
143 | the encoding we did for offset regression at train time.
144 | Args:
145 | loc (tensor): location predictions for loc layers,
146 | Shape: [num_priors,4]
147 | priors (tensor): Prior boxes in center-offset form.
148 | Shape: [num_priors,4].
149 | variances: (list[float]) Variances of priorboxes
150 | Return:
151 | decoded bounding box predictions
152 | """
153 |
154 | boxes = torch.cat((
155 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
156 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
157 | boxes[:, :2] -= boxes[:, 2:] / 2
158 | boxes[:, 2:] += boxes[:, :2]
159 | return boxes
160 |
161 |
162 | def log_sum_exp(x):
163 | """Utility function for computing log_sum_exp while determining
164 | This will be used to determine unaveraged confidence loss across
165 | all examples in a batch.
166 | Args:
167 | x (Variable(tensor)): conf_preds from conf layers
168 | """
169 | x_max = x.data.max()
170 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max
171 |
172 |
173 | # Original author: Francisco Massa:
174 | # https://github.com/fmassa/object-detection.torch
175 | # Ported to PyTorch by Max deGroot (02/01/2017)
176 | def nms(boxes, scores, overlap=0.5, top_k=200):
177 | """Apply non-maximum suppression at test time to avoid detecting too many
178 | overlapping bounding boxes for a given object.
179 | Args:
180 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
181 | scores: (tensor) The class predscores for the img, Shape:[num_priors].
182 | overlap: (float) The overlap thresh for suppressing unnecessary boxes.
183 | top_k: (int) The Maximum number of box preds to consider.
184 | Return:
185 | The indices of the kept boxes with respect to num_priors.
186 | """
187 |
188 | keep = scores.new(scores.size(0)).zero_().long()
189 | if boxes.numel() == 0:
190 | return keep
191 | x1 = boxes[:, 0]
192 | y1 = boxes[:, 1]
193 | x2 = boxes[:, 2]
194 | y2 = boxes[:, 3]
195 | area = torch.mul(x2 - x1, y2 - y1)
196 | v, idx = scores.sort(0) # sort in ascending order
197 | # I = I[v >= 0.01]
198 | idx = idx[-top_k:] # indices of the top-k largest vals
199 | xx1 = boxes.new()
200 | yy1 = boxes.new()
201 | xx2 = boxes.new()
202 | yy2 = boxes.new()
203 | w = boxes.new()
204 | h = boxes.new()
205 |
206 | # keep = torch.Tensor()
207 | count = 0
208 | while idx.numel() > 0:
209 | i = idx[-1] # index of current largest val
210 | # keep.append(i)
211 | keep[count] = i
212 | count += 1
213 | if idx.size(0) == 1:
214 | break
215 | idx = idx[:-1] # remove kept element from view
216 | # load bboxes of next highest vals
217 | torch.index_select(x1, 0, idx, out=xx1)
218 | torch.index_select(y1, 0, idx, out=yy1)
219 | torch.index_select(x2, 0, idx, out=xx2)
220 | torch.index_select(y2, 0, idx, out=yy2)
221 | # store element-wise max with next highest score
222 | xx1 = torch.clamp(xx1, min=x1[i])
223 | yy1 = torch.clamp(yy1, min=y1[i])
224 | xx2 = torch.clamp(xx2, max=x2[i])
225 | yy2 = torch.clamp(yy2, max=y2[i])
226 | w.resize_as_(xx2)
227 | h.resize_as_(yy2)
228 | w = xx2 - xx1
229 | h = yy2 - yy1
230 | # check sizes of xx1 and xx2.. after each iteration
231 | w = torch.clamp(w, min=0.0)
232 | h = torch.clamp(h, min=0.0)
233 | inter = w*h
234 | # IoU = i / (area(a) + area(b) - i)
235 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas)
236 | union = (rem_areas - inter) + area[i]
237 | IoU = inter/union # store result in iou
238 | # keep only elements with an IoU <= overlap
239 | idx = idx[IoU.le(overlap)]
240 | return keep, count
241 |
242 |
--------------------------------------------------------------------------------
/model/boxes.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torchvision.ops.boxes import nms
5 | from time import time
6 |
7 | from th_utils import generate_reg_coords
8 |
9 | # box prediction module given response maps
10 | class BoxModule(nn.Module):
11 | def __init__(self, cfg):
12 | super(type(self), self).__init__()
13 | self.im_size = cfg.im_size
14 | # nms parameters
15 | self.bb_thres = cfg.nms_param[0]
16 | self.bb_nums = cfg.nms_param[1]
17 | # default anchor box center coordinates
18 | self.anc = torch.Tensor(generate_reg_coords(cfg)).unsqueeze(0).flatten(1,-2).cuda()
19 |
20 |
21 | def forward(self,cl,re, nms_param=None):
22 | # define nms parameters
23 | if nms_param is not None:
24 | bb_thr = nms_param[0]
25 | bb_num = nms_param[1]
26 | else:
27 | bb_thr = self.bb_thres
28 | bb_num = self.bb_nums
29 |
30 | # softmax class -> obtain scoremap
31 | ff = torch.exp(cl[...,0]) / (torch.exp(cl[...,0])+torch.exp(cl[...,1])) # [bnum, map_h, map_w]
32 | batch_size = ff.shape[0]
33 | # flatten scoremaps and regvals
34 | ff_f = ff.flatten(1) # [bnum,N]
35 | re_f = re.flatten(1,-2) #[bnum,N,ltrb]
36 |
37 | # translate regressed vals to bbox coordinates [bnum, N, x0y0x1y1]
38 | bb_f = self.anc.repeat_interleave(batch_size,dim=0).clone() # anchor coordinates to xyxy [bnum,N,xyxy]
39 | bb_f[...,0] -= re_f[...,0] # x_min = x_anc - left
40 | bb_f[...,1] -= re_f[...,1] # y_min = y_anc - top
41 | bb_f[...,2] += re_f[...,2] # x_max = x_anc + right
42 | bb_f[...,3] += re_f[...,3] # y_max = y_anc + down
43 |
44 | # cutoff boundary values
45 | xmin,ymin,xmax,ymax = bb_f[...,0],bb_f[...,1],bb_f[...,2],bb_f[...,3]
46 | xmin[xmin<0] = 0
47 | ymin[ymin<0] = 0
48 | xmax[xmax>self.im_size[1]-1] = self.im_size[1]-1
49 | ymax[ymax>self.im_size[0]-1] = self.im_size[0]-1
50 |
51 | # per-batch nms
52 | out_bb, out_ff = [], []
53 | for i in range(batch_size):
54 | ffi = ff_f[i]
55 | bbi = bb_f[i]
56 | b_idx = nms(bbi, ffi, bb_thr)
57 | # if numof boxes to choose is larger than obtained numof boxes
58 | b_sel = torch.LongTensor(range(bb_num)).cuda()
59 | b_sel[b_sel>len(b_idx)-1] = len(b_idx)-1
60 | # choose and store boxes
61 | b_box = bbi[b_idx[b_sel]]
62 | out_bb.append(b_box)
63 | out_ff.append(ffi[b_idx[b_sel]])
64 |
65 | # output : list of boxes where len(list)=batch_size, list[i]=[num_box,xyxy]
66 | return out_bb, out_ff
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
/model/build_model.py:
--------------------------------------------------------------------------------
1 | import torch, time
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torchvision import transforms
5 |
6 | from box_utils import jaccard
7 | import resnet as resnet
8 | from rpn_module import RPN_Module
9 | from rcnn_module import RCNN_Module
10 |
11 | class Track_Model(nn.Module):
12 | def __init__(self,cfg):
13 | super(type(self), self).__init__()
14 | # dims and flags
15 | self.head_nfeat = cfg.head_nfeat
16 | self.head_negff = cfg.head_negff
17 | self.head_oproi = cfg.head_oproi
18 | self.head_ctxff = cfg.head_ctxff
19 | self.roip_size = cfg.roip_size
20 | self.nft_param = cfg.nft_param
21 | # backbone convnet
22 | self.backbone = getattr(resnet, cfg.name_bbnet)(cfg=cfg)
23 | # channel dim for backbone output featmap
24 | bb_ch = self.backbone(torch.zeros(1,3,64,64)).shape[1]
25 | # rpn module for proposal generation
26 | self.rpn = RPN_Module(cfg, bb_ch)
27 | # rcnn module for matching and refinement
28 | self.rcnn = RCNN_Module(cfg)
29 |
30 | def normalize_tensor(self, x, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
31 | # x: batch image tensor [bnum, 3, h, w]
32 | x[:,0],x[:,1],x[:,2] = (x[:,0]-mean[0])/std[0], (x[:,1]-mean[1])/std[1], (x[:,2]-mean[2])/std[2]
33 | return x
34 |
35 | def forward(self, x,y, xb, xfa=None, add_box=None):
36 | # x : query image, Tensor, [bnum, 3, img_h, img_w]
37 | # y : search image, Tensor, [bnum, 3, img_h, img_w]
38 | # xb : bbox coordinates for pos samples: list, [list_posbb] where len(list_bb)=bnum, list[i] = Tensor[1,4]
39 | # xfa : xfeats for feature reuse : tuple (xfa_tri, xfa_pos, xfa_neg)
40 |
41 | # pool pos/neg feats from x (if negft:true)
42 | xfa_in = self.get_feats_xfa(x, xb) if xfa is None else xfa
43 | # get feats from backbone (if not available)
44 | xf = self.backbone(self.normalize_tensor(x)) if xfa_in is None else None
45 | yf = self.backbone(self.normalize_tensor(y))
46 | # get roi proposals, pooled feats and rpn outputs
47 | rois, scores, rpn_feats, out_rpn = self.rpn(xf, yf, xb, xfa_in[0], add_box)
48 | # matching confidence scores and bbox refinement
49 | rcnn_feats = (xfa_in[1], xfa_in[2], rpn_feats[2]) #(pos_feat, neg_feat, can_feat)
50 | out_rcnn = self.rcnn(rcnn_feats, rois) #(cf,op,bb,roi)
51 |
52 | return out_rpn, out_rcnn
53 |
54 |
55 | def forward_box(self, x,y, xb, xfa=None, add_box=None, nbox=1):
56 | # params
57 | num_batch = y.shape[0]
58 | # get final outputs
59 | out_rpn, out_rcnn = self.forward(x,y, xb, xfa, add_box)
60 | out_cf, out_op, out_bb, out_br = out_rcnn
61 | # choose single box with max score for each batch - obtain scores, choose max score idxs for each batch
62 | # pos score + mean neg score
63 | out_ff_pos = torch.exp(out_cf[...,0,0]) / (torch.exp(out_cf[...,0,0])+torch.exp(out_cf[...,0,1]))
64 | out_ff_neg = torch.exp(out_cf[...,1:,1]) / (torch.exp(out_cf[...,1:,0])+torch.exp(out_cf[...,1:,1])) if self.head_negff else 1.
65 | # product of negative scores
66 | out_ff_neg = torch.prod(out_ff_neg, dim=-1) if self.head_negff else 1. #torch.mean/torch.sum
67 | # overlap score
68 | out_op = torch.sigmoid(out_op[...,0]) if self.head_oproi else 1.
69 |
70 | # final score = pos_score*overlap_score*neg_score
71 | out_ff = out_ff_pos*out_ff_neg*out_op
72 | sort_idxs = out_ff.argsort(descending=True, dim=1)
73 | # returns bb coordinates for each batch
74 | out_bb_b = []
75 | out_ff_b = []
76 | for i in range(num_batch):
77 | out_bb_b.append(out_bb[i,sort_idxs[i,:nbox]]) # out_bb out_br
78 | out_ff_b.append(out_ff[i,sort_idxs[i,:nbox]])
79 | out_bb_b = torch.stack(out_bb_b)
80 | out_ff_b = torch.stack(out_ff_b)
81 |
82 | return out_bb_b, out_ff_b, (out_rpn, out_rcnn)
83 |
84 |
85 | def get_feats_xfa(self, x, xb):
86 | # params
87 | num_batch = x.shape[0]
88 | thres,nfeat = self.nft_param
89 | nbox_num, nbox_thr = self.rpn.boxes.bb_nums, self.rpn.boxes.bb_thres
90 | # change numof candidate negative boxes
91 | self.rpn.boxes.bb_nums, self.rpn.boxes.bb_thres = 64,0.5
92 | # get pos and neg feats from query img
93 | xf = self.backbone(self.normalize_tensor(x))
94 | # roi proposals and feats
95 | rois, scores, feats, _ = self.rpn(xf, xf, xb, add_box=xb, pool_xf=True)
96 | xfa_tri = feats[0]
97 | xfa_pos = feats[2][:,-1]
98 | yfa = feats[2][:,:-1]
99 | # negative feature mining inside xf
100 | if self.head_negff:
101 | xfa_neg = torch.zeros(num_batch, nfeat, self.head_nfeat, self.roip_size, self.roip_size).cuda()
102 | for i in range(num_batch):
103 | # get ious per batch, choose feature idxs with lower iou < thres
104 | xb_i, roi_i, score_i = xb[i], rois[i][:-1,:], scores[i]
105 | iou_i = jaccard(xb_i, roi_i)[0]
106 | idx_sel = torch.nonzero( iou_i < thres )[:,0]
107 | idx_sel = idx_sel[:nfeat]
108 | # if numof features insufficient: repeat last idx
109 | if len(idx_sel)==0:
110 | continue
111 | if len(idx_sel)
--------------------------------------------------------------------------------
/model/context.py:
--------------------------------------------------------------------------------
11 | # context module (-> output: rois + context-embedded features -> to rcnn module)
12 | class ContextModule(nn.Module):
13 | def __init__(self,cfg):
14 | super(type(self), self).__init__()
15 | # params
16 | self.im_size = cfg.im_size
17 | self.map_size = (down2n(cfg.im_size[0],cfg.conv_npool[-1]),down2n(cfg.im_size[1],cfg.conv_npool[-1]))
18 | self.scale_f = float(self.map_size[0]) / float(self.im_size[0])
19 | self.pool_size = cfg.roip_size
20 | self.head_nfeat = cfg.head_nfeat
21 | self.head_ctxff = cfg.head_ctxff
22 | self.num_ctxff = cfg.nft_param[1]
23 | self.ctx_param = (0.5,self.num_ctxff) #4
24 | # box module
25 | self.boxes = BoxModule(cfg)
26 |
27 | # variables w.r.t. different fusion schemes
28 | if self.head_ctxff[1]==0:
29 | fdim,reduce = (self.head_nfeat+2)*3-2, 1
30 | # simple concat
31 | self.simple = nn.Sequential(*[nn.Conv2d(fdim, self.head_nfeat, 3,1,1), nn.ReLU(),
32 | nn.Conv2d(self.head_nfeat, self.head_nfeat, 3,1,1), nn.ReLU(),
33 | nn.Conv2d(self.head_nfeat, self.head_nfeat, 1,1,0)])
34 |
35 | elif self.head_ctxff[1]==1:
36 | fdim,reduce = (self.head_nfeat+2)*2, 1
37 | # simple addition
38 | self.simple = nn.Sequential(*[nn.Conv2d(fdim, self.head_nfeat, 3,1,1), nn.ReLU(),
39 | nn.Conv2d(self.head_nfeat, self.head_nfeat, 3,1,1), nn.ReLU(),
40 | nn.Conv2d(self.head_nfeat, self.head_nfeat, 1,1,0)])
41 |
42 | elif self.head_ctxff[1]==2:
43 | # attention (cbam) based
44 | fdim,reduce = (self.head_nfeat+2)*2, 1
45 | # channel attention branch
46 | self.avg_pool, self.max_pool = nn.AdaptiveAvgPool2d(1), nn.AdaptiveMaxPool2d(1)
47 | self.conv1, self.conv2 = nn.Conv2d(fdim, self.head_nfeat//reduce, 1), nn.Conv2d(self.head_nfeat//reduce, self.head_nfeat, 1)
48 | self.relu1 = nn.ReLU()
49 | # spatial attention branch
50 | self.conv3 = nn.Conv2d(2, 1, 5, 1, 2)
51 | self.sigmoid = nn.Sigmoid()
52 |
53 | elif self.head_ctxff[1]==3:
54 | # film based
55 | fdim,reduce = (self.head_nfeat+2)*2, 1
56 | # common conv+relu
57 | self.conv1 = nn.Sequential(*[nn.Conv2d(fdim, self.head_nfeat//reduce, 3,1,1), nn.ReLU()])
58 | # channel multiplier gamma
59 | self.mult_g = nn.Parameter(torch.ones(1, self.head_nfeat, self.pool_size, self.pool_size))
60 | self.conv_g = nn.Sequential(*[nn.Conv2d(self.head_nfeat//reduce, self.head_nfeat//reduce, 3,1,1), nn.ReLU(),
61 | nn.Conv2d(self.head_nfeat//reduce, self.head_nfeat, 1,1,0)])
62 | # channel bias beta
63 | self.mult_b = nn.Parameter(torch.zeros(1, self.head_nfeat, self.pool_size, self.pool_size))
64 | self.conv_b = nn.Sequential(*[nn.Conv2d(self.head_nfeat//reduce, self.head_nfeat//reduce, 3,1,1), nn.ReLU(),
65 | nn.Conv2d(self.head_nfeat//reduce, self.head_nfeat, 1,1,0)])
66 |
67 | else:
68 | print 'unknown fusion scheme...'
69 |
70 | # init
71 | for m in self.modules():
72 | if isinstance(m, nn.Conv2d):
73 | nn.init.normal_(m.weight, 0., 1e-3)
74 | nn.init.constant_(m.bias, 0.)
75 |
76 |
77 |
78 | def forward(self, cl,re, cf,yfa,ybb):
79 | # cl,re : for obtaining context boxes
80 | # cf : full context feature map to sample features from
81 | # yfa : input features to be embedded with context; [num_batch, num_boxes, num_ch, pool_sz, pool_sz]
82 | # ybb : bounding box coordinates for input yfa feats; len(list)=num_batch, ybb[i]=[num_boxes,4]
83 | num_batch = yfa.shape[0]
84 | num_boxes = yfa.shape[1]
85 | # obtain candidate context box coordinates and pool feats cfa_all=[num_batch, num_ctx, num_ch, pool_sz, pool_sz]
86 | pred_ctxbb, pred_ctxsc = self.boxes(cl,re, self.ctx_param)
87 | ff = torch.cat((cf,cl.permute(0,3,1,2)),dim=1) # concat feats and cls logits
88 | cfa_all = roi_align(ff, pred_ctxbb, (self.pool_size,self.pool_size), self.scale_f)
89 | cfa_all = cfa_all.view(num_batch, self.ctx_param[1], self.head_nfeat+2, self.pool_size, self.pool_size)
90 | # max/mean pooling along channel dimension
91 | cfa_max,_ = cfa_all.max(dim=1)
92 | cfa_avg = cfa_all.mean(dim=1)
93 | cfa = torch.cat((cfa_max,cfa_avg), dim=1) # [num_batch, num_ch*2, pool_sz, pool_sz]
94 |
95 | # embed context into input feat yfa
96 | if self.head_ctxff[1]==0:
97 | # === simple concat
98 | cfa = cfa.unsqueeze(1).repeat_interleave(num_boxes,dim=1)# [num_batch, num_boxes, num_ch*2, pool_sz, pool_sz]
99 | cfa = torch.cat((yfa,cfa), dim=2) # channel-wise concat # [num_batch, num_boxes, num_ch*3, pool_sz, pool_sz]
100 | cfa = cfa.flatten(0,1) # batch-nbox dim flatten
101 | yfa = self.simple(cfa)
102 | yfa = yfa.view(num_batch, num_boxes, self.head_nfeat, self.pool_size, self.pool_size)
103 |
104 | elif self.head_ctxff[1]==1:
105 | # === simple addition
106 | cfa = self.simple(cfa) # [num_batch, self.head_nfeat, self.pool_size, self.pool_size]
107 | cfa = cfa.unsqueeze(1).repeat_interleave(num_boxes,dim=1)
108 | yfa += cfa
109 |
110 | elif self.head_ctxff[1]==2:
111 | # === channel and spatial attention (cbam) based
112 | # channel attention
113 | avg_out = self.conv2( self.relu1( self.conv1( self.avg_pool(cfa) ) ) )
114 | max_out = self.conv2( self.relu1( self.conv1( self.max_pool(cfa) ) ) )
115 | ca_out = self.sigmoid( avg_out + max_out )
116 | ca_out = ca_out.unsqueeze(1).repeat_interleave(num_boxes,dim=1)
117 | yfa *= ca_out
118 | # spatial attention
119 | avg_out = torch.mean(cfa, dim=1, keepdim=True)
120 | max_out,_ = torch.max(cfa, dim=1, keepdim=True)
121 | sp_out = torch.cat((avg_out,max_out), dim=1)
122 | sp_out = self.sigmoid( self.conv3(sp_out) )
123 | sp_out = sp_out.unsqueeze(1).repeat_interleave(num_boxes,dim=1)
124 | yfa *= sp_out
125 |
126 | elif self.head_ctxff[1]==3:
127 | # === film based affine transform
128 | # common branch
129 | fconv = self.conv1(cfa)
130 | # get channel multipler (mult_g*conv_g)
131 | fm_out = self.mult_g*self.conv_g(fconv)
132 | fm_out = fm_out.unsqueeze(1).repeat_interleave(num_boxes,dim=1)
133 | # get channel bias (mult_b*conv_b)
134 | fb_out = self.mult_b*self.conv_b(fconv)
135 | fb_out = fb_out.unsqueeze(1).repeat_interleave(num_boxes,dim=1)
136 | # apply channel wise linear transform ( (1-gamma)*feat+beta )
137 | yfa = (1+fm_out)*yfa + fb_out
138 |
139 | else:
140 | print 'unknown fusion scheme...'
141 |
142 | return yfa
143 |
144 |
145 |
--------------------------------------------------------------------------------
/model/fcos.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from non_local import NONLocalBlock2D
6 |
7 |
8 | # fcos detection head module
9 | class FCOSHead(nn.Module):
10 | def __init__(self, cfg):
11 | super(type(self), self).__init__()
12 |
13 | # define individual modules
14 | fdim = cfg.head_nfeat
15 | fmul = 3 if cfg.head_dconv else 1
16 |
17 | self.conv0_rdim = nn.Conv2d(fdim*fmul,fdim, 1,1,0)
18 |
19 | if cfg.head_nlocl:
20 | self.nl_feature = NONLocalBlock2D(in_channels=fdim)
21 |
22 | conv1_unit = [nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()]
23 | conv2_unit = [nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()]
24 | for i in range(cfg.head_nconv-1):
25 | conv1_unit.extend([nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()]) #nn.Conv2d(fdim,fdim, 3,1,1)
26 | conv2_unit.extend([nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()])
27 |
28 | self.conv1 = nn.Sequential(*conv1_unit)
29 | self.conv2 = nn.Sequential(*conv2_unit)
30 |
31 | self.conv_cls = nn.Sequential(nn.Conv2d(fdim,2, 3,1,1))
32 | self.conv_reg = nn.Sequential(nn.Conv2d(fdim,4, 3,1,1))
33 |
34 | # define sequential modules
35 | self.cls = nn.Sequential(self.conv1, self.conv_cls)
36 | self.reg = nn.Sequential(self.conv2, self.conv_reg)
37 | self.mul = nn.Parameter(torch.rand(1))
38 |
39 | # init
40 | head_module_list = nn.ModuleList([self.conv0_rdim, self.cls, self.reg])
41 | for m in head_module_list.modules():
42 | if isinstance(m, nn.Conv2d):
43 | nn.init.normal_(m.weight, 0., 1e-3)
44 | nn.init.constant_(m.bias, 0.)
45 |
46 |
47 | def forward(self, x):
48 | # reduce dim
49 | x = self.conv0_rdim(x)
50 | # nonlocal
51 | if hasattr(self, 'nl_feature'):
52 | x = self.nl_feature(x)
53 | # for all branches
54 | cl = self.cls(x)
55 | re = torch.exp(self.mul*self.reg(x))
56 |
57 | return cl, re, x
58 |
59 |
60 |
61 | # standard detection head module (cls, olp, reg)
62 | class DETHead(nn.Module):
63 | def __init__(self, cfg):
64 | super(type(self), self).__init__()
65 |
66 | # define individual modules
67 | self.head_oproi = cfg.head_oproi
68 | fdim = cfg.head_nfeat
69 | conv1_unit = [nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()]
70 | conv2_unit = [nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()]
71 | for i in range(cfg.head_nconv-1):
72 | conv1_unit.extend([nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()]) #nn.Conv2d(fdim,fdim, 3,1,1)
73 | conv2_unit.extend([nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()])
74 |
75 | self.conv1 = nn.Sequential(*conv1_unit)
76 | self.conv2 = nn.Sequential(*conv2_unit)
77 |
78 | self.conv_cls = nn.Sequential(nn.Conv2d(fdim,2, cfg.roip_size,1,0))
79 | self.conv_reg = nn.Sequential(nn.Conv2d(fdim,4, cfg.roip_size,1,0))
80 | if self.head_oproi:
81 | self.conv_olp = nn.Sequential(nn.Conv2d(fdim,1, cfg.roip_size,1,0))
82 |
83 | # define sequential modules
84 | self.cls = nn.Sequential(self.conv1, self.conv_cls)
85 | self.reg = nn.Sequential(self.conv2, self.conv_reg)
86 | if self.head_oproi:
87 | self.olp = nn.Sequential(self.conv2, self.conv_olp)
88 |
89 | # init
90 | head_module_list = nn.ModuleList([self.cls, self.olp, self.reg]) if self.head_oproi else nn.ModuleList([self.cls, self.reg])
91 | for m in head_module_list.modules():
92 | if isinstance(m, nn.Conv2d):
93 | nn.init.normal_(m.weight, 0., 1e-3)
94 | nn.init.constant_(m.bias, 0.)
95 |
96 |
97 | def forward(self, x, out_re=True):
98 | # for all 3 branches
99 | cl = self.cls(x)
100 | op = self.olp(x) if (out_re and self.head_oproi) else None
101 | re = self.reg(x) if out_re else None
102 |
103 | return cl, op, re
104 |
105 |
106 |
--------------------------------------------------------------------------------
/model/focal_loss.py:
--------------------------------------------------------------------------------
1 | # https://github.com/clcarwin/focal_loss_pytorch/blob/master/focalloss.py
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from torch.autograd import Variable
7 |
8 | class FocalLoss(nn.Module):
9 | def __init__(self, gamma=0, alpha=None, size_average=True):
10 | super(FocalLoss, self).__init__()
11 | self.gamma = gamma
12 | self.alpha = alpha
13 | if isinstance(alpha,(float,int,long)): self.alpha = torch.Tensor([alpha,1-alpha])
14 | if isinstance(alpha,list): self.alpha = torch.Tensor(alpha)
15 | self.size_average = size_average
16 |
17 | def forward(self, input, target):
18 | if input.dim()>2:
19 | input = input.view(input.size(0),input.size(1),-1) # N,C,H,W => N,C,H*W
20 | input = input.transpose(1,2) # N,C,H*W => N,H*W,C
21 | input = input.contiguous().view(-1,input.size(2)) # N,H*W,C => N*H*W,C
22 | target = target.view(-1,1)
23 |
24 | logpt = F.log_softmax(input, dim=1)
25 | logpt = logpt.gather(1,target)
26 | logpt = logpt.view(-1)
27 | pt = Variable(logpt.data.exp())
28 |
29 | if self.alpha is not None:
30 | if self.alpha.type()!=input.data.type():
31 | self.alpha = self.alpha.type_as(input.data)
32 | at = self.alpha.gather(0,target.data.view(-1))
33 | logpt = logpt * Variable(at)
34 |
35 | loss = -1 * (1-pt)**self.gamma * logpt
36 | if self.size_average: return loss.mean()
37 | else: return loss.sum()
38 |
--------------------------------------------------------------------------------
/model/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from focal_loss import FocalLoss
5 | from utils import down2n
6 | from box_utils import jaccard
7 |
8 |
9 | class Track_Loss(nn.Module):
10 | def __init__(self, cfg):
11 | super(type(self), self).__init__()
12 | # params and flags
13 | self.loss_lambda = cfg.loss_lambda
14 | self.im_size = cfg.im_size
15 | self.map_size = (down2n(cfg.im_size[0],cfg.conv_npool[-1]),down2n(cfg.im_size[1],cfg.conv_npool[-1]))
16 | self.bbox_thres = cfg.bbox_thres
17 | self.head_oproi = cfg.head_oproi
18 | # loss objects
19 | self.cl_loss = FocalLoss(gamma=cfg.loss_gamma, alpha=cfg.loss_alpha, size_average=True)
20 | self.cf_loss = nn.CrossEntropyLoss()
21 | self.op_loss = nn.BCEWithLogitsLoss()
22 |
23 |
24 | def get_cl_loss(self, cl, gt):
25 | # return classification loss for rpn module, use focal loss
26 | # get positive instance indices from gt [bnum, map_h, map_w, 2]
27 | pos_idxs = gt.flatten().nonzero()[:,0]
28 | pos_nums = pos_idxs.shape[0]
29 | cl_f = cl.reshape(-1,2)
30 | gt_f = (1-gt).flatten().long()
31 | loss = self.cl_loss(cl_f, gt_f)
32 | # averaging
33 | #loss /= float(pos_nums) if pos_nums>0 else gt.numel()
34 | return loss
35 |
36 |
37 | def get_re_loss(self, re, gr, gt, eps=1e-7):
38 | # return box regression loss for positive instances
39 | # get positive instance indices from gt [bnum, map_h, map_w, 2]
40 | pos_idxs = gt.flatten().nonzero()[:,0]
41 | pos_nums = pos_idxs.shape[0]
42 | if pos_nums < 1:
43 | return 0.
44 | # select corresponding instances in regression outputs
45 | gr_sel = gr.reshape(-1,4)[pos_idxs] # [pos_idxs, ltrb]
46 | re_sel = re.reshape(-1,4)[pos_idxs]
47 |
48 | # iou calculation - intersection
49 | iou_inter = torch.min(re_sel, gr_sel)
50 | iou_inter = (iou_inter[:,0]+iou_inter[:,2])*(iou_inter[:,1]+iou_inter[:,3]) # area = (l+r)*(t+b)
51 | # iou calculation - union
52 | gr_area = (gr_sel[:,0]+gr_sel[:,2])*(gr_sel[:,1]+gr_sel[:,3]) # area = (l+r)*(t+b)
53 | re_area = (re_sel[:,0]+re_sel[:,2])*(re_sel[:,1]+re_sel[:,3]) # area = (l+r)*(t+b)
54 | iou_union = gr_area + re_area - iou_inter + eps
55 | # iou calculation - inter / union
56 | iou_sel = (iou_inter+1.) / (iou_union+1.)
57 | # total iou loss
58 | loss = torch.mean(1.-iou_sel)
59 | return loss
60 |
61 |
62 | def get_rcnn_loss(self, cf, op, bb, br, gb):
63 | # cf = [numb, numbb, 1+nnum, 2(pn)] (output binary class)
64 | # op = [numb, numbb, 1] (output iou overlap score)
65 | # bb = [numb, numbb, 4(xyxy)] (output refined bbox)
66 | # br = [numb, numbb, 4(xyxy)] (output unrefined bbox)
67 | # gb = [numb, 4] (ground truth bbox)
68 | # sizes
69 | num_batch = cf.shape[0]
70 | num_boxes = cf.shape[1]
71 | num_negbb = cf.shape[2]-1
72 | # per batch iteration
73 | loss, total_pos = 0,0
74 | for i in range(num_batch):
75 | # find positive instances in a batch (bb overlap > threshold)
76 | cf_i = cf[i] # [numbox, 1+nnum, 2]
77 | op_i = op[i] if self.head_oproi else None # [numbox, 1]
78 | bb_i = bb[i] # [numbox, 4] = [numbox, x0y0x1y1]
79 | br_i = br[i] # [numbox, 4] = [numbox, x0y0x1y1]
80 | gb_i = gb[i].unsqueeze(0) # [1,4] = [1, x0y0x1y1]
81 | # iou for rois
82 | iou_br = jaccard(gb_i, br_i)[0]
83 | pos_idxs = (iou_br >=self.bbox_thres[0]).nonzero()[:,0]
84 | neg_idxs = (iou_br < self.bbox_thres[1]).nonzero()[:,0]
85 | pos_nums, neg_nums = pos_idxs.shape[0], neg_idxs.shape[0]
86 | total_pos += pos_nums
87 |
88 | # enforce iou overlap regression loss
89 | loss_op_i = self.op_loss(op_i[...,0][pos_idxs], iou_br[pos_idxs]) if (pos_nums>0) and (self.head_oproi) else 0.
90 |
91 | # enforce labels, binary cross entropy loss
92 | # pos input sample ~ pos/neg boxes
93 | cf_lbl_pos = torch.zeros(pos_nums, device=cf_i.device).long()
94 | cf_lbl_neg = torch.ones(neg_nums, device=cf_i.device).long()
95 | loss_cf_i_pos_pos = self.cf_loss(cf_i[pos_idxs,0,:], cf_lbl_pos) if pos_nums>0 else 0.
96 | loss_cf_i_pos_neg = self.cf_loss(cf_i[neg_idxs,0,:], cf_lbl_neg) if neg_nums>0 else 0.
97 | loss_cf_i_pos = loss_cf_i_pos_pos + loss_cf_i_pos_neg
98 |
99 | # neg input sample ~ pos boxes
100 | if (num_negbb>0) and (pos_nums>0):
101 | cf_i_neg = cf_i[pos_idxs,1:,:].flatten(0,1) # [pos_nums*num_negbb, 2]
102 | cf_lbl_neg = torch.ones(pos_nums*num_negbb, device=cf_i.device).long()
103 | loss_cf_i_neg = self.cf_loss(cf_i_neg, cf_lbl_neg)
104 | else:
105 | loss_cf_i_neg = 0.
106 | loss_cf_i = loss_cf_i_pos + loss_cf_i_neg
107 |
108 | # iou for refined bb
109 | iou_bb = jaccard(gb_i, bb_i, eps=1.0)[0]
110 | # enforce box regression (only for positive instances), linear iou loss
111 | loss_bb_i = torch.mean(1. - iou_bb[pos_idxs]) if pos_nums>0 else 0
112 | # loss for single batch, add to total loss
113 | if pos_nums==0:
114 | loss_i = 0.
115 | else:
116 | loss_i = loss_cf_i + loss_bb_i + loss_op_i
117 | loss += loss_i
118 |
119 | # divide loss by batch size
120 | loss /= num_batch
121 | return loss, total_pos
122 |
123 |
124 | def forward(self, outs, gts, add_rcnn_loss=True):
125 | # parse network outputs
126 | out_rpn, out_rcnn = outs
127 | cl, re = out_rpn[0], out_rpn[1]
128 | cf, op, bb, br = out_rcnn[0], out_rcnn[1], out_rcnn[2], out_rcnn[3]
129 | # parse gts (gt_box, gt_cl, gt_re)
130 | gb, gt, gr = gts
131 |
132 | # loss for rpn outputs
133 | rpn_loss0 = self.get_cl_loss(cl, gt)
134 | rpn_loss1 = self.get_re_loss(re, gr, gt)
135 | rpn_loss = rpn_loss0 + rpn_loss1
136 |
137 | # loss for rcnn outputs
138 | rcnn_loss, total_pos = self.get_rcnn_loss(cf, op, bb, br, gb)
139 |
140 | # total loss
141 | if add_rcnn_loss:
142 | total_loss = rpn_loss + self.loss_lambda*rcnn_loss
143 | else:
144 | total_loss = rpn_loss
145 |
146 | return total_loss, [rpn_loss0, rpn_loss1, rcnn_loss, int(total_pos)]
147 |
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/model/non_local.py:
--------------------------------------------------------------------------------
1 | # code from : https://github.com/AlexHex7/Non-local_pytorch
2 | import torch
3 | from torch import nn
4 | from torch.nn import functional as F
5 |
6 |
7 | class _NonLocalBlockND(nn.Module):
8 | def __init__(self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True):
9 | """
10 | :param in_channels:
11 | :param inter_channels:
12 | :param dimension:
13 | :param sub_sample:
14 | :param bn_layer:
15 | """
16 |
17 | super(_NonLocalBlockND, self).__init__()
18 |
19 | assert dimension in [1, 2, 3]
20 |
21 | self.dimension = dimension
22 | self.sub_sample = sub_sample
23 |
24 | self.in_channels = in_channels
25 | self.inter_channels = inter_channels
26 |
27 | if self.inter_channels is None:
28 | self.inter_channels = in_channels // 2
29 | if self.inter_channels == 0:
30 | self.inter_channels = 1
31 |
32 | if dimension == 3:
33 | conv_nd = nn.Conv3d
34 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
35 | bn = nn.GroupNorm #nn.BatchNorm3d
36 | elif dimension == 2:
37 | conv_nd = nn.Conv2d
38 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
39 | bn = nn.GroupNorm #nn.BatchNorm2d
40 | else:
41 | conv_nd = nn.Conv1d
42 | max_pool_layer = nn.MaxPool1d(kernel_size=(2))
43 | bn = nn.GroupNorm #nn.BatchNorm1d
44 |
45 | self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
46 | kernel_size=1, stride=1, padding=0)
47 |
48 | if bn_layer:
49 | self.W = nn.Sequential(
50 | conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
51 | kernel_size=1, stride=1, padding=0),
52 | bn(16, self.in_channels) #bn(self.in_channels)
53 | )
54 | nn.init.constant_(self.W[1].weight, 0)
55 | nn.init.constant_(self.W[1].bias, 0)
56 | else:
57 | self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
58 | kernel_size=1, stride=1, padding=0)
59 | nn.init.constant_(self.W.weight, 0)
60 | nn.init.constant_(self.W.bias, 0)
61 |
62 | self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
63 | kernel_size=1, stride=1, padding=0)
64 | self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
65 | kernel_size=1, stride=1, padding=0)
66 |
67 | if sub_sample:
68 | self.g = nn.Sequential(self.g, max_pool_layer)
69 | self.phi = nn.Sequential(self.phi, max_pool_layer)
70 |
71 | def forward(self, x, return_nl_map=False):
72 | """
73 | :param x: (b, c, t, h, w)
74 | :param return_nl_map: if True return z, nl_map, else only return z.
75 | :return:
76 | """
77 |
78 | batch_size = x.size(0)
79 |
80 | g_x = self.g(x).view(batch_size, self.inter_channels, -1)
81 | g_x = g_x.permute(0, 2, 1)
82 |
83 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
84 | theta_x = theta_x.permute(0, 2, 1)
85 | phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
86 | f = torch.matmul(theta_x, phi_x)
87 | f_div_C = F.softmax(f, dim=-1)
88 |
89 | y = torch.matmul(f_div_C, g_x)
90 | y = y.permute(0, 2, 1).contiguous()
91 | y = y.view(batch_size, self.inter_channels, *x.size()[2:])
92 | W_y = self.W(y)
93 | z = W_y + x
94 |
95 | if return_nl_map:
96 | return z, f_div_C
97 | return z
98 |
99 |
100 | class NONLocalBlock1D(_NonLocalBlockND):
101 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
102 | super(NONLocalBlock1D, self).__init__(in_channels,
103 | inter_channels=inter_channels,
104 | dimension=1, sub_sample=sub_sample,
105 | bn_layer=bn_layer)
106 |
107 |
108 | class NONLocalBlock2D(_NonLocalBlockND):
109 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
110 | super(NONLocalBlock2D, self).__init__(in_channels,
111 | inter_channels=inter_channels,
112 | dimension=2, sub_sample=sub_sample,
113 | bn_layer=bn_layer,)
114 |
115 |
116 | class NONLocalBlock3D(_NonLocalBlockND):
117 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
118 | super(NONLocalBlock3D, self).__init__(in_channels,
119 | inter_channels=inter_channels,
120 | dimension=3, sub_sample=sub_sample,
121 | bn_layer=bn_layer,)
122 |
123 |
124 | if __name__ == '__main__':
125 | import torch
126 |
127 | for (sub_sample_, bn_layer_) in [(True, True), (False, False), (True, False), (False, True)]:
128 | img = torch.zeros(2, 3, 20)
129 | net = NONLocalBlock1D(3, sub_sample=sub_sample_, bn_layer=bn_layer_)
130 | out = net(img)
131 | print(out.size())
132 |
133 | img = torch.zeros(2, 3, 20, 20)
134 | net = NONLocalBlock2D(3, sub_sample=sub_sample_, bn_layer=bn_layer_)
135 | out = net(img)
136 | print(out.size())
137 |
138 | img = torch.randn(2, 3, 8, 20, 20)
139 | net = NONLocalBlock3D(3, sub_sample=sub_sample_, bn_layer=bn_layer_)
140 | out = net(img)
141 | print(out.size())
142 |
143 |
--------------------------------------------------------------------------------
/model/rcnn_module.py:
--------------------------------------------------------------------------------
1 | import torch, time
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torchvision.ops import roi_align
5 |
6 | from utils import down2n
7 | from fcos import DETHead
8 |
9 | # receives pooled features and predicts classes + refined boxes
10 | class RCNN_Module(nn.Module):
11 | def __init__(self,cfg):
12 | super(type(self), self).__init__()
13 | # params
14 | self.head_oproi = cfg.head_oproi
15 | self.im_size = cfg.im_size
16 | self.map_size = (down2n(cfg.im_size[0],cfg.conv_npool[-1]),down2n(cfg.im_size[1],cfg.conv_npool[-1]))
17 | self.scale_f = float(self.map_size[0]) / float(self.im_size[0])
18 | self.pool_size = cfg.roip_size
19 | # feat modulation layer
20 | self.conv_x = nn.Conv2d(cfg.head_nfeat, cfg.head_nfeat, 1)
21 | self.conv_y = nn.Conv2d(cfg.head_nfeat, cfg.head_nfeat, 1)
22 | # detection head
23 | self.rcnn_head = DETHead(cfg)
24 |
25 | # init
26 | rcnn_convs = nn.ModuleList([self.conv_x, self.conv_y])
27 | for m in rcnn_convs.modules():
28 | if isinstance(m, nn.Conv2d):
29 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
30 |
31 |
32 | def forward(self, feats, boxes):
33 | # given query feature, candidate features, candidate boxes
34 | # return classification results and bbox refinements
35 |
36 | # feats = (xfa_p, xfa_n, yfa) = (pos feats, neg feats, candidate feats)
37 | # xfa_p = Tensor[bnum, ndim, pool_size, pool_size]; xfa_n = Tensor[bnum, nnum, ndim, pool_size, pool_size] or None
38 | # yfa = Tensor[bnum, nms_num_bb, ndim, pool_size, pool_size]
39 | # boxes = list of len=batch_size where boxes[i]=[nms_num_bb, 4]
40 |
41 | # pos_feats [bnum, cnum, pool_sz, pool_sz]
42 | # neg_feats [bnum, nnum, cnum, pool_sz, pool_sz]
43 | xfa_p, xfa_n, yfa = feats
44 | pf = xfa_p # use spatially pooled feats
45 | nf = xfa_n
46 | # candidate feats [bnum, bbnum, cnum, psz, psz]
47 | cf = yfa
48 | # store shapes
49 | batch_size = cf.shape[0]
50 | bbnum_size = cf.shape[1]
51 | nfeat_size = cf.shape[2]
52 | negff_size = nf.shape[1] if nf is not None else 0
53 |
54 | # feature modulation
55 | pf = self.conv_x(pf)
56 | nf = self.conv_x(nf.flatten(0,1)).view(batch_size, negff_size, nfeat_size, self.pool_size, self.pool_size) if nf is not None else None
57 | cf = self.conv_y(cf.flatten(0,1)).view(batch_size, bbnum_size, nfeat_size, self.pool_size, self.pool_size)
58 |
59 | # == for positive feats
60 | # repeat pf feats
61 | pf_r = pf.unsqueeze(1).repeat_interleave(bbnum_size, dim=1) # [bnum, bbnum, cnum, psz, psz]
62 | # multiply between feats (correlation) or concat channel dim
63 | cc = pf_r * cf #torch.cat((pf_r, cf), dim=2)#
64 | # detection head
65 | cl_p, op, re = self.rcnn_head(cc.flatten(0,1))
66 | cl_p = cl_p.view(batch_size, bbnum_size, 1, 2)
67 | op = op.view(batch_size, bbnum_size, 1) if self.head_oproi else None
68 | re = re.view(batch_size, bbnum_size, 4)
69 | #re = torch.zeros_like(re)
70 |
71 | # == for negative feats
72 | if nf is not None:
73 | nf_r = nf.unsqueeze(1).repeat_interleave(bbnum_size, dim=1) # [bnum, bbnum, nnum, cnum, psz, psz]
74 | cf_r = cf.unsqueeze(2).repeat_interleave(negff_size, dim=2) # [bnum, bbnum, nnum, cnum, psz, psz]
75 | cn = nf_r * cf_r # correlation
76 | # cn = torch.cat((nf_r,cf_r), dim=3) # concatenation
77 | # detection head
78 | cl_n, _, _ = self.rcnn_head(cn.flatten(0,2), out_re=False)
79 | cl_n = cl_n.view(batch_size, bbnum_size, negff_size, 2)
80 |
81 | # integrated classification scores [bnum, bbnum, 1+nnum, 2]
82 | cl = torch.cat((cl_p, cl_n), dim=2) if nf is not None else cl_p
83 |
84 | # == modify input boxes accto re output
85 | # boxes = [bnum, bbnum, x0y0x1y1]
86 | boxes = torch.stack(boxes)
87 | #bb = boxes + re
88 | # change to [bnum, bbnum, x_cen/y_cen/width/height]
89 | boxes_w = boxes[...,2] - boxes[...,0]
90 | boxes_h = boxes[...,3] - boxes[...,1]
91 | boxes_xc = boxes[...,0] + boxes_w*0.5
92 | boxes_yc = boxes[...,1] + boxes_h*0.5
93 | # modify accto regression outputs
94 | boxes_xc_m = boxes_xc + boxes_w * re[...,0]
95 | boxes_yc_m = boxes_yc + boxes_h * re[...,1]
96 | boxes_w_m = boxes_w * torch.exp(re[...,2])
97 | boxes_h_m = boxes_h * torch.exp(re[...,3])
98 | # revert to corner coordinates
99 | boxes_x0 = (boxes_xc_m - boxes_w_m*0.5).unsqueeze(-1)
100 | boxes_x1 = (boxes_xc_m + boxes_w_m*0.5).unsqueeze(-1)
101 | boxes_y0 = (boxes_yc_m - boxes_h_m*0.5).unsqueeze(-1)
102 | boxes_y1 = (boxes_yc_m + boxes_h_m*0.5).unsqueeze(-1)
103 | # concat
104 | bb = torch.cat([boxes_x0, boxes_y0, boxes_x1, boxes_y1], dim=-1)
105 |
106 | return cl, op, bb, boxes
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
--------------------------------------------------------------------------------
/model/resnet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | # from .utils import load_state_dict_from_url
4 |
5 |
6 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
7 | 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
8 | 'wide_resnet50_2', 'wide_resnet101_2']
9 |
10 |
11 | model_urls = {
12 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
13 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
14 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
15 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
16 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
17 | 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
18 | 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
19 | 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
20 | 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
21 | }
22 |
23 |
24 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
25 | """3x3 convolution with padding"""
26 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
27 | padding=dilation, groups=groups, bias=False, dilation=dilation)
28 |
29 |
30 | def conv1x1(in_planes, out_planes, stride=1):
31 | """1x1 convolution"""
32 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
33 |
34 |
35 | class BasicBlock(nn.Module):
36 | expansion = 1
37 |
38 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
39 | base_width=64, dilation=1, norm_layer=None):
40 | super(BasicBlock, self).__init__()
41 | if norm_layer is None:
42 | norm_layer = nn.BatchNorm2d
43 | if groups != 1 or base_width != 64:
44 | raise ValueError('BasicBlock only supports groups=1 and base_width=64')
45 | if dilation > 1:
46 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
47 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1
48 | self.conv1 = conv3x3(inplanes, planes, stride)
49 | self.bn1 = norm_layer(planes)
50 | self.relu = nn.ReLU(inplace=True)
51 | self.conv2 = conv3x3(planes, planes)
52 | self.bn2 = norm_layer(planes)
53 | self.downsample = downsample
54 | self.stride = stride
55 |
56 | def forward(self, x):
57 | identity = x
58 |
59 | out = self.conv1(x)
60 | out = self.bn1(out)
61 | out = self.relu(out)
62 |
63 | out = self.conv2(out)
64 | out = self.bn2(out)
65 |
66 | if self.downsample is not None:
67 | identity = self.downsample(x)
68 |
69 | out += identity
70 | out = self.relu(out)
71 |
72 | return out
73 |
74 |
75 | class Bottleneck(nn.Module):
76 | # Bottleneck in torchvision places the stride for downsampling at the 3x3 convolution (self.conv2),
77 | # while the original implementation places the stride at the first 1x1 convolution (self.conv1),
78 | # according to "Deep residual learning for image recognition" (https://arxiv.org/abs/1512.03385).
79 | # This variant is also known as ResNet V1.5 and improves accuracy according to
80 | # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
81 |
82 | expansion = 4
83 |
84 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
85 | base_width=64, dilation=1, norm_layer=None):
86 | super(Bottleneck, self).__init__()
87 | if norm_layer is None:
88 | norm_layer = nn.BatchNorm2d
89 | width = int(planes * (base_width / 64.)) * groups
90 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1
91 | self.conv1 = conv1x1(inplanes, width)
92 | self.bn1 = norm_layer(width)
93 | self.conv2 = conv3x3(width, width, stride, groups, dilation)
94 | self.bn2 = norm_layer(width)
95 | self.conv3 = conv1x1(width, planes * self.expansion)
96 | self.bn3 = norm_layer(planes * self.expansion)
97 | self.relu = nn.ReLU(inplace=True)
98 | self.downsample = downsample
99 | self.stride = stride
100 |
101 | def forward(self, x):
102 | identity = x
103 |
104 | out = self.conv1(x)
105 | out = self.bn1(out)
106 | out = self.relu(out)
107 |
108 | out = self.conv2(out)
109 | out = self.bn2(out)
110 | out = self.relu(out)
111 |
112 | out = self.conv3(out)
113 | out = self.bn3(out)
114 |
115 | if self.downsample is not None:
116 | identity = self.downsample(x)
117 |
118 | out += identity
119 | out = self.relu(out)
120 |
121 | return out
122 |
123 |
124 | class ResNet(nn.Module):
125 |
126 | def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
127 | groups=1, width_per_group=64, replace_stride_with_dilation=None,
128 | norm_layer=None, cfg=None):
129 | super(ResNet, self).__init__()
130 | if norm_layer is None:
131 | norm_layer = nn.BatchNorm2d
132 | self._norm_layer = norm_layer
133 |
134 | self.inplanes = 64
135 | self.dilation = 1
136 | if replace_stride_with_dilation is None:
137 | # each element in the tuple indicates if we should replace
138 | # the 2x2 stride with a dilated convolution instead
139 | replace_stride_with_dilation = [False, False, False]
140 | if len(replace_stride_with_dilation) != 3:
141 | raise ValueError("replace_stride_with_dilation should be None "
142 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
143 | self.groups = groups
144 | self.base_width = width_per_group
145 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
146 | bias=False)
147 | self.bn1 = norm_layer(self.inplanes)
148 | self.relu = nn.ReLU(inplace=True)
149 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
150 | self.layer1 = self._make_layer(block, 64, layers[0])
151 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
152 | dilate=replace_stride_with_dilation[0])
153 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
154 | dilate=replace_stride_with_dilation[1])
155 | self.layer4 = self._make_layer(block, 512, layers[3], stride=1, # stride=2 in the original torchvision ResNet
156 | dilate=replace_stride_with_dilation[2])
157 | # self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
158 | # self.fc = nn.Linear(512 * block.expansion, num_classes)
159 |
160 | for m in self.modules():
161 | if isinstance(m, nn.Conv2d):
162 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
163 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
164 | nn.init.constant_(m.weight, 1)
165 | nn.init.constant_(m.bias, 0)
166 |
167 | # Zero-initialize the last BN in each residual branch,
168 | # so that the residual branch starts with zeros, and each residual block behaves like an identity.
169 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
170 | if zero_init_residual:
171 | for m in self.modules():
172 | if isinstance(m, Bottleneck):
173 | nn.init.constant_(m.bn3.weight, 0)
174 | elif isinstance(m, BasicBlock):
175 | nn.init.constant_(m.bn2.weight, 0)
176 |
177 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
178 | norm_layer = self._norm_layer
179 | downsample = None
180 | previous_dilation = self.dilation
181 | if dilate:
182 | self.dilation *= stride
183 | stride = 1
184 | if stride != 1 or self.inplanes != planes * block.expansion:
185 | downsample = nn.Sequential(
186 | conv1x1(self.inplanes, planes * block.expansion, stride),
187 | norm_layer(planes * block.expansion),
188 | )
189 |
190 | layers = []
191 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
192 | self.base_width, previous_dilation, norm_layer))
193 | self.inplanes = planes * block.expansion
194 | for _ in range(1, blocks):
195 | layers.append(block(self.inplanes, planes, groups=self.groups,
196 | base_width=self.base_width, dilation=self.dilation,
197 | norm_layer=norm_layer))
198 |
199 | return nn.Sequential(*layers)
200 |
201 | def _forward_impl(self, x):
202 | # See note [TorchScript super()]
203 | x = self.conv1(x)
204 | x = self.bn1(x)
205 | x = self.relu(x)
206 | x = self.maxpool(x)
207 |
208 | x = self.layer1(x)
209 | x = self.layer2(x)
210 | x = self.layer3(x)
211 | x = self.layer4(x)
212 |
213 | # x = self.avgpool(x)
214 | # x = torch.flatten(x, 1)
215 | # x = self.fc(x)
216 |
217 | return x
218 |
219 | def forward(self, x):
220 | return self._forward_impl(x)
221 |
222 |
223 | def _resnet(arch, block, layers, pretrained, progress, **kwargs):
224 | model = ResNet(block, layers, **kwargs)
225 | if pretrained:
226 | state_dict = load_state_dict_from_url(model_urls[arch],
227 | progress=progress)
228 | model.load_state_dict(state_dict)
229 | return model
230 |
231 |
232 | def resnet18(pretrained=False, progress=True, **kwargs):
233 | r"""ResNet-18 model from
234 | `"Deep Residual Learning for Image Recognition" `_
235 |
236 | Args:
237 | pretrained (bool): If True, returns a model pre-trained on ImageNet
238 | progress (bool): If True, displays a progress bar of the download to stderr
239 | """
240 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
241 | **kwargs)
242 |
243 |
244 | def resnet34(pretrained=False, progress=True, **kwargs):
245 | r"""ResNet-34 model from
246 | `"Deep Residual Learning for Image Recognition" `_
247 |
248 | Args:
249 | pretrained (bool): If True, returns a model pre-trained on ImageNet
250 | progress (bool): If True, displays a progress bar of the download to stderr
251 | """
252 | return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress,
253 | **kwargs)
254 |
255 |
256 | def resnet50(pretrained=False, progress=True, **kwargs):
257 | r"""ResNet-50 model from
258 | `"Deep Residual Learning for Image Recognition" `_
259 |
260 | Args:
261 | pretrained (bool): If True, returns a model pre-trained on ImageNet
262 | progress (bool): If True, displays a progress bar of the download to stderr
263 | """
264 | return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
265 | **kwargs)
266 |
267 |
268 | def resnet101(pretrained=False, progress=True, **kwargs):
269 | r"""ResNet-101 model from
270 | `"Deep Residual Learning for Image Recognition" `_
271 |
272 | Args:
273 | pretrained (bool): If True, returns a model pre-trained on ImageNet
274 | progress (bool): If True, displays a progress bar of the download to stderr
275 | """
276 | return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress,
277 | **kwargs)
278 |
279 |
280 | def resnet152(pretrained=False, progress=True, **kwargs):
281 | r"""ResNet-152 model from
282 | `"Deep Residual Learning for Image Recognition" `_
283 |
284 | Args:
285 | pretrained (bool): If True, returns a model pre-trained on ImageNet
286 | progress (bool): If True, displays a progress bar of the download to stderr
287 | """
288 | return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress,
289 | **kwargs)
290 |
291 |
292 | def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
293 | r"""ResNeXt-50 32x4d model from
294 | `"Aggregated Residual Transformation for Deep Neural Networks" `_
295 |
296 | Args:
297 | pretrained (bool): If True, returns a model pre-trained on ImageNet
298 | progress (bool): If True, displays a progress bar of the download to stderr
299 | """
300 | kwargs['groups'] = 32
301 | kwargs['width_per_group'] = 4
302 | return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3],
303 | pretrained, progress, **kwargs)
304 |
305 |
306 | def resnext101_32x8d(pretrained=False, progress=True, **kwargs):
307 | r"""ResNeXt-101 32x8d model from
308 | `"Aggregated Residual Transformation for Deep Neural Networks" `_
309 |
310 | Args:
311 | pretrained (bool): If True, returns a model pre-trained on ImageNet
312 | progress (bool): If True, displays a progress bar of the download to stderr
313 | """
314 | kwargs['groups'] = 32
315 | kwargs['width_per_group'] = 8
316 | return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],
317 | pretrained, progress, **kwargs)
318 |
319 |
320 | def wide_resnet50_2(pretrained=False, progress=True, **kwargs):
321 | r"""Wide ResNet-50-2 model from
322 | `"Wide Residual Networks" `_
323 |
324 | The model is the same as ResNet except for the bottleneck number of channels
325 | which is twice larger in every block. The number of channels in outer 1x1
326 | convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
327 | channels, and in Wide ResNet-50-2 has 2048-1024-2048.
328 |
329 | Args:
330 | pretrained (bool): If True, returns a model pre-trained on ImageNet
331 | progress (bool): If True, displays a progress bar of the download to stderr
332 | """
333 | kwargs['width_per_group'] = 64 * 2
334 | return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3],
335 | pretrained, progress, **kwargs)
336 |
337 |
338 | def wide_resnet101_2(pretrained=False, progress=True, **kwargs):
339 | r"""Wide ResNet-101-2 model from
340 | `"Wide Residual Networks" `_
341 |
342 | The model is the same as ResNet except for the bottleneck number of channels
343 | which is twice larger in every block. The number of channels in outer 1x1
344 | convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
345 | channels, and in Wide ResNet-50-2 has 2048-1024-2048.
346 |
347 | Args:
348 | pretrained (bool): If True, returns a model pre-trained on ImageNet
349 | progress (bool): If True, displays a progress bar of the download to stderr
350 | """
351 | kwargs['width_per_group'] = 64 * 2
352 | return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3],
353 | pretrained, progress, **kwargs)
354 |
--------------------------------------------------------------------------------
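Note that `layer4` above is built with `stride=1` rather than the usual 2, so this backbone returns features at output stride 16 instead of 32. A quick shape check, assuming the repository root is on the Python path (pretrained weights are not needed for this):

``` python
import torch
from model.resnet import resnet50   # the modified ResNet defined above

net = resnet50(pretrained=False).eval()
with torch.no_grad():
    feat = net(torch.randn(1, 3, 224, 224))
print(feat.shape)   # torch.Size([1, 2048, 14, 14]); a stock ResNet-50 would give 7x7 here
```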
/model/rpn_module.py:
--------------------------------------------------------------------------------
1 | import torch, time
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torchvision.ops import roi_align
5 |
6 | from utils import down2n
7 | from fcos import FCOSHead
8 | from boxes import BoxModule
9 | from cbam import CBAModule
10 | from context import ContextModule
11 |
12 | # receives a pair of images as input, returns ROI bounding box proposals and their pooled features
13 | class RPN_Module(nn.Module):
14 | def __init__(self,cfg,bb_ch):
15 | super(type(self), self).__init__()
16 | # params
17 | self.im_size = cfg.im_size
18 | self.map_size = (down2n(cfg.im_size[0],cfg.conv_npool[-1]),down2n(cfg.im_size[1],cfg.conv_npool[-1]))
19 | self.scale_f = float(self.map_size[0]) / float(self.im_size[0])
20 | self.pool_size = cfg.roip_size
21 | self.head_dconv = cfg.head_dconv
22 | self.head_ctxff = cfg.head_ctxff
23 | # numof channels for backbone output, refined output
24 | self.bb_ch = bb_ch
25 | self.head_nfeat = cfg.head_nfeat
26 | # attention module and channel conversion for backbone outputs
27 | fmul = 3 if cfg.head_dconv else 1
28 | self.cbamod = CBAModule(self.head_nfeat*fmul)
29 | self.conv_x = nn.Conv2d(self.bb_ch, cfg.head_nfeat, 1)
30 | self.conv_y = nn.Conv2d(self.bb_ch, cfg.head_nfeat, 1)
31 | # detection head
32 | self.roi_head = FCOSHead(cfg)
33 | # nms box predictions
34 | self.boxes = BoxModule(cfg)
35 | # context module
36 | self.context_x = ContextModule(cfg) if self.head_ctxff[0] else None
37 | self.context_y = ContextModule(cfg) if self.head_ctxff[0] else None
38 |
39 | # init
40 | rpn_convs = nn.ModuleList([self.conv_x, self.conv_y])
41 | for m in rpn_convs.modules():
42 | if isinstance(m, nn.Conv2d):
43 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
44 |
45 |
46 | def fmap_corr(self,xf,yf,pad=True,dilate=1):
47 | # get shapes
48 | xf_s = xf.shape # [bn, ch, xh, xw] (larger -> as image) [1, bn*ch, xh, xw]
49 | yf_s = yf.shape # [bn, ch, yh, yw] (smaller -> as filter) [bn*cn, 1, yh, yw]
50 | bn = xf_s[0]; cn = xf_s[1];
51 | # reshape dims
52 | xf_r = xf.view(1, bn*cn, xf_s[2], xf_s[3]) # [1, bn*cn, xh, xw]
53 | yf_r = yf.reshape(1, bn*cn, yf_s[2], yf_s[3]).transpose(0,1) # [bn*cn, 1, yh, yw] view
54 | # group conv [1, bn*cn, zh, zw] -> [bn, cn, zh, zw]
55 | if pad: pnum = (yf_s[-1] + (dilate-1)*(yf_s[-1]-1))//2
56 | else: pnum = 0
57 | of = F.conv2d(input=xf_r, weight=yf_r, groups=bn*cn, bias=None, padding=pnum, dilation=dilate)
58 | of = of.view(bn, cn, of.shape[2], of.shape[3])
59 | return of
60 |
61 |
62 | def dconv_fmap_corr(self, yf, xf):
63 | if self.head_dconv:
64 | zf = []
65 | for i in range(len(xf)):
66 | zf.append(self.fmap_corr(yf, xf[i]))
67 | zf = torch.cat(zf, dim=1)
68 | else:
69 | zf = self.fmap_corr(yf, xf[1], pad=False)
70 | return zf
71 |
72 |
73 | def corr_head(self, xfa, yf):
74 | # cross corr for xfa
75 | zf = self.dconv_fmap_corr(yf,xfa)
76 | # attention module
77 | zf,at = self.cbamod(zf)
78 | # detection head
79 | cl,re,zf = self.roi_head(zf)
80 | # permute dims to [bnum, map_h, map_w, pred], where pred_cls=[neg/pos], pred_re=[ltrb distances]
81 | cl = cl.permute(0,2,3,1)
82 | re = re.permute(0,2,3,1)
83 | return zf,cl,re,at
84 |
85 |
86 | def pool_feat(self, xf, xb_p):
87 | # xb: list of boxes wrt each batch : list, where len(list)=bnum, list[i] = Tensor[N,4]
88 | # feats -> change channel nums [bnum, ndim, pool_sz, pool_sz]
89 |
90 | # original roi align
91 | xfa = [roi_align(xf, xb_p, (self.pool_size,self.pool_size), self.scale_f)]
92 | # additional feats
93 | if self.head_dconv:
94 | # d2
95 | psz = self.pool_size*2 -1
96 | xfa.append(roi_align(xf, xb_p, (psz,psz), self.scale_f))
97 | # p2
98 | psz = self.pool_size//2
99 | psz += 1 if psz%2==0 else 0
100 | xfa.append(roi_align(xf, xb_p, (psz,psz), self.scale_f))
101 | else:
102 | xfa.append(roi_align(xf, xb_p, (1,1), self.scale_f))
103 |
104 | return xfa
105 |
106 |
107 | def forward(self,xf_in,yf_in, xb, xfa_in=None, add_box=None, pool_xf=False):
108 | # xf,yf : Tensor, [bnum, ndim, map_size_h, map_size_w]
109 | # xb : list, [list_posbb] where len(list_xxxbb)=bnum, list_xxxbb[i] = Tensor[N,4]
110 | # xfa_in : trident feat pooled from initial xf for reuse
111 | # add_box : list of boxes to add to the ROI list, len(add_box)=bnum, add_box[i] = Tensor[M,4]
112 | # pool_xf : pool-align feat from xf rather than yf
113 |
114 | # change channel num of input feature
115 | xf = self.conv_x(xf_in) if xfa_in is None else None
116 | yf = self.conv_y(yf_in)
117 | # roi_align pooling from xf according to xb coordinates
118 | # use given feature if pooled feat xfa is already given
119 | xfa_tri = self.pool_feat(xf, xb) if xfa_in is None else xfa_in
120 |
121 | # fmap cross correlation + detection head = class, regression maps
122 | zf,cl,re,at = self.corr_head(xfa_tri, yf)
123 | pred_maps = (cl,re,at)
124 |
125 | # ==== obtain ROI bounding boxes and pooled features
126 | # nms stage for box predictions : bboxes+scores
127 | pred_roibb, pred_roisc = self.boxes(cl,re)
128 | # add previous box (if exists)
129 | if add_box is not None:
130 | for bi in range(len(pred_roibb)):
131 | pred_roibb[bi] = torch.cat((pred_roibb[bi], add_box[bi]),dim=0)
132 |
133 | # pool feats for given boxes yf with shapes: yfa = [bnum, bbnum, cnum, pool_size, pool_size]
134 | num_boxes = self.boxes.bb_nums if add_box is None else self.boxes.bb_nums+add_box[0].shape[0]
135 | yf = xf if pool_xf else yf # for initial frame feature fetching purposes
136 | yfa = roi_align(yf, pred_roibb, (self.pool_size,self.pool_size), self.scale_f)
137 | yfa = yfa.view(yf.shape[0], num_boxes, yf.shape[1], self.pool_size, self.pool_size)
138 |
139 | # (if specified) embed context feature into ROI features (yfa) based on box predictions (cl,re)
140 | if self.head_ctxff[0]:
141 | yfa = self.context_y(cl,re, zf,yfa,pred_roibb) if not pool_xf else self.context_x(cl,re, zf,yfa,pred_roibb)
142 |
143 | # feats = (xfa_tri, xfa_pos, yfa)
144 | pred_feats = (xfa_tri, xfa_tri[0], yfa)
145 |
146 | return pred_roibb, pred_roisc, pred_feats, pred_maps
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
--------------------------------------------------------------------------------
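`fmap_corr()` above performs depthwise cross-correlation by folding the batch dimension into the channels and running a single grouped convolution. A self-contained sketch of the same trick with made-up tensor sizes (the real module also handles the dilated/trident variants):

``` python
import torch
import torch.nn.functional as F

def depthwise_xcorr(xf, yf):
    """Correlate the kernel features yf over the search features xf, channel by channel."""
    bn, cn = xf.shape[0], xf.shape[1]
    xf_r = xf.view(1, bn * cn, xf.shape[2], xf.shape[3])      # fold batch into channels
    yf_r = yf.reshape(bn * cn, 1, yf.shape[2], yf.shape[3])   # one single-channel kernel per (batch, channel)
    pad = yf.shape[-1] // 2                                   # same-size output, as in fmap_corr(pad=True)
    out = F.conv2d(xf_r, yf_r, groups=bn * cn, padding=pad)
    return out.view(bn, cn, out.shape[2], out.shape[3])

xf = torch.randn(2, 8, 32, 32)   # search-image features
yf = torch.randn(2, 8, 5, 5)     # pooled template features
print(depthwise_xcorr(xf, yf).shape)   # torch.Size([2, 8, 32, 32])
```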
/output/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/output/__init__.py
--------------------------------------------------------------------------------
/test_tracker.py:
--------------------------------------------------------------------------------
1 | import os,sys,argparse,time,cv2
2 |
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from cfgs import cfg_test as cfg
6 | import torch
7 |
8 | from tracker import Tracker
9 | # from tracker_batch import Tracker
10 |
11 | torch.backends.cudnn.deterministic = True
12 | torch.backends.cudnn.benchmark = False
13 |
14 | _db_name = 'lasot'
15 | _save_txt = True
16 | _calc_auc = True
17 | _out_vid = False
18 |
19 |
20 | def run_eval(idx=-1):
21 | tracker = Tracker(cfg=cfg, db_name=_db_name, idx=idx)
22 | tic = time.time()
23 | res, fps, auc = tracker.run_track_db(seq_list=None, save_res=_save_txt, calc_auc=_calc_auc, out_vid=_out_vid)
24 |
25 | if _calc_auc:
26 | res_str = 'db: '+ _db_name + ', auc: '+str(np.mean(auc))[:6]+ ', fps: '+str(np.mean(fps))[:5]+ ', ckpt: '+tracker.chkpt_file[5:-4] + '\n'
27 | with open('all_results.txt','a') as res_file:
28 | res_file.write(res_str)
29 |
30 | print 'elapsed time: ' + str((time.time()-tic)/60.)[:6] + ' mins'
31 | return np.mean(auc)
32 |
33 |
34 |
35 | run_eval()
36 |
37 |
--------------------------------------------------------------------------------
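The script above always evaluates the full test set. Since `run_track_db()` in `tracker.py` also accepts an explicit sequence list, a smaller run can be set up as below; this is a hypothetical variation, and the subset is simply the first few keys of the loaded dictionary:

``` python
from cfgs import cfg_test as cfg
from tracker import Tracker

tracker = Tracker(cfg=cfg, db_name='lasot')
subset = sorted(tracker.track_dict.keys())[:3]   # any sequence names from the LaSOT dict work here
res, fps, auc = tracker.run_track_db(seq_list=subset, save_res=False, calc_auc=True, out_vid=False)
```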
/th_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import numpy as np
5 | from utils import down2n
6 |
7 | # torch implementation of np.random.choice
8 | def th_choice(a, p=None):
9 | """ torch implementation of np.random.choice(), x1.1~1.5 slower than original function """
10 | # preliminaries
11 | a_l = len(a)
12 | if p is None:
13 | idx = torch.randperm(a_l)
14 | return a[idx[0]]
15 |
16 | elif torch.sum(p) < 1.:
17 | print torch.sum(p),' p.sum() not 1'
18 |
19 | # accumulative prob
20 | pa = torch.cumsum(p,0)
21 |
22 | # random (0,1)
23 | trnd = torch.rand(1)[0]
24 |
25 | # find
26 | idx = (torch.argmax((pa < trnd).type(torch.FloatTensor))+1) % a_l
27 | return a[idx]
28 |
29 |
30 | def th_choice_mul(a, n):
31 | # choose n random instances from a
32 | # assume p=uniform, with replacement
33 | a_l = len(a)
34 | idxs = torch.randint(low=0, high=a_l, size=(n,))
35 |
36 | if isinstance(a, list):
37 | return [a[i] for i in idxs]
38 | elif n==1:
39 | return [a[idxs]]
40 | else:
41 | return a[idxs]
42 |
43 |
44 | def th_choice_seq(a, n):
45 | # choose n sequential instances from a
46 | # assume p=uniform, with replacement
47 | a_l = len(a)
48 | if n <= a_l:
49 | idx = torch.randint(low=0, high=a_l-n+1, size=())
50 | idxs = torch.LongTensor(range(idx, idx+n))
51 | else:
52 | idxs = torch.LongTensor(range(a_l)+[a_l-1]*(n-a_l))
53 |
54 | if isinstance(a, list):
55 | return [a[i] for i in idxs]
56 | elif n==1:
57 | return [a[idxs]]
58 | else:
59 | return a[idxs]
60 |
61 |
62 | def th_rand(n=1):
63 | """ proxy to torch.rand(n)[0] """
64 | if n == 1:
65 | return float(torch.rand(n)[0])
66 | else:
67 | return torch.rand(n).numpy()
68 |
69 |
70 | def th_rand_rng(low, high, n=1):
71 | """ pull uniform random sample(s) from [a,b) """
72 | if n == 1:
73 | return (high-low)*float(torch.rand(n)[0])+low
74 | else:
75 | return (high-low)*torch.rand(n)+low
76 |
77 |
78 | def th_rand_sym(r, n=1):
79 | """ pull random sample(s) from [1/r,r), keeping probability mean to 1.0 """
80 | def unit_rnd(r):
81 | ud_rf = 1 if th_rand() < 1./(r+1.) else 0
82 | rnd = th_rand_rng(1.,r) if ud_rf else th_rand_rng(1./r,1)
83 | return rnd
84 |
85 | if n == 1:
86 | return unit_rnd(r)
87 | else:
88 | return torch.Tensor([unit_rnd(r) for i in range(n)])
89 |
90 |
91 | def th_randint(low, high=None, size=1):
92 | """ proxy to torch.randint(low,high,(size,)) """
93 | if high is None: ilow = 0; ihigh = low
94 | else: ilow = low; ihigh = high
95 |
96 | if size == 1:
97 | return torch.randint(low=ilow, high=ihigh, size=(size,)).numpy()[0]
98 | else:
99 | return torch.randint(low=ilow, high=ihigh, size=(size,)).numpy()
100 |
101 |
102 | # generate center-anchor coordinates for a given img_size and pooling size
103 | def generate_reg_coords(cfg):
104 | map_size = (down2n(cfg.im_size[0],cfg.conv_npool[-1]),down2n(cfg.im_size[1],cfg.conv_npool[-1]))
105 |
106 | batch_gtr = np.zeros([map_size[0], map_size[1], 4]) #[map_h, map_w, ltrb]
107 | grid_r = np.tile(np.arange(0.5, 0.5+map_size[0], 1.).reshape([-1,1]),(1,map_size[1]))
108 | grid_c = np.tile(np.arange(0.5, 0.5+map_size[1], 1.).reshape([1,-1]),(map_size[0],1))
109 | map_scale = float(cfg.im_size[0])/float(map_size[0])
110 |
111 | batch_gtr[:,:,0] = grid_c # left
112 | batch_gtr[:,:,1] = grid_r # top
113 | batch_gtr[:,:,2] = grid_c # right
114 | batch_gtr[:,:,3] = grid_r # bottom
115 | batch_gtr *= map_scale # rescale map by size
116 |
117 | return batch_gtr
118 |
119 |
120 |
--------------------------------------------------------------------------------
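`generate_reg_coords()` above builds, for every cell of the backbone feature map, the image-space coordinates of that cell's center, repeated as the l/t/r/b reference points for box regression. A tiny illustration with a dummy config object (all values below are made up):

``` python
import numpy as np
from th_utils import generate_reg_coords

class DummyCfg(object):
    im_size = (64, 64)    # input image height, width
    conv_npool = [3]      # only the last entry is used: the map is downsampled by 2**3 = 8

coords = generate_reg_coords(DummyCfg())
print(coords.shape)    # (8, 8, 4): per-cell [l, t, r, b] reference points in image pixels
print(coords[0, 0])    # [4. 4. 4. 4.] -> center of the top-left 8x8 cell
```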
/track_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cv2
3 | from utils import crop_img
4 | import matplotlib.pyplot as plt
5 |
6 |
7 | def result_curve(result, num_points):
8 | #num_seqs = 1 #len(result)
9 | all_seq_plot = np.zeros(num_points)
10 |
11 | #for i in range(num_seqs):
12 | seq_iou = result
13 | seq_len = np.size(result)
14 | seq_plot = list()
15 |
16 | bb_thvars = np.linspace(0,1,num_points)
17 | for bbth in bb_thvars:
18 | ratio_th = np.sum(seq_iou > bbth).astype(float) / seq_len
19 | seq_plot.append(ratio_th)
20 |
21 | return np.array(seq_plot)
22 |
23 |
24 | def result_curve_px(result, num_points):
25 | num_seqs = len(result)
26 | all_ious = np.array([])
27 |
28 | for i in range(num_seqs):
29 | all_ious = np.append(all_ious, result[i])
30 |
31 | num_frames = len(all_ious)
32 | bb_thvars = np.linspace(0,50,num_points)
33 | all_ratio_th = np.array([])
34 |
35 | for bbth in bb_thvars:
36 | ratio_th = np.sum(all_ious <= bbth).astype(float) / num_frames
37 | all_ratio_th = np.append(all_ratio_th, ratio_th)
38 |
39 | return all_ratio_th
40 |
41 |
42 | def box_overlap_area(A,B):
43 | if A.ndim == 1:
44 | A_xmin = A[0]; A_xmax = A_xmin+A[2]; A_ymin = A[1]; A_ymax = A_ymin+A[3]
45 | B_xmin = B[0]; B_xmax = B_xmin+B[2]; B_ymin = B[1]; B_ymax = B_ymin+B[3]
46 | # x,y dim overlap?
47 | x_over = max(0, min(A_xmax,B_xmax)-max(A_xmin,B_xmin))
48 | y_over = max(0, min(A_ymax,B_ymax)-max(A_ymin,B_ymin))
49 | # area of overlap
50 | area_overlap = x_over*y_over
51 | return area_overlap
52 | else:
53 | num_d = A.shape[0]
54 | A_xmin = A[:,0]; A_xmax = A_xmin+A[:,2]; A_ymin = A[:,1]; A_ymax = A_ymin+A[:,3]
55 | B_xmin = B[:,0]; B_xmax = B_xmin+B[:,2]; B_ymin = B[:,1]; B_ymax = B_ymin+B[:,3]
56 | # x,y dim overlap?
57 | x_over = np.max([np.zeros(num_d), np.min([A_xmax,B_xmax], axis=0)-np.max([A_xmin,B_xmin], axis=0)], axis=0)
58 | y_over = np.max([np.zeros(num_d), np.min([A_ymax,B_ymax], axis=0)-np.max([A_ymin,B_ymin], axis=0)], axis=0)
59 | # area of overlap
60 | area_overlap = x_over*y_over
61 | return area_overlap
62 |
63 |
64 | def box_overlap_score(A,B):
65 | if A.ndim == 1:
66 | A_width = A[2]; A_height = A[3]; B_width = B[2]; B_height = B[3];
67 | A_area = A[2]*A[3]; B_area = B[2]*B[3];
68 | area_overlap = box_overlap_area(A,B)
69 | area_union = A_area + B_area - area_overlap
70 | return area_overlap / area_union
71 | else:
72 | A_width = A[:,2]; A_height = A[:,3]; B_width = B[:,2]; B_height = B[:,3];
73 | A_area = A[:,2]*A[:,3]; B_area = B[:,2]*B[:,3];
74 | area_overlap = box_overlap_area(A,B)
75 | area_union = A_area + B_area - area_overlap
76 | return area_overlap / area_union
77 |
78 |
--------------------------------------------------------------------------------
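`box_overlap_score()` computes the IoU between `[x_min, y_min, width, height]` boxes, and `result_curve()` turns per-frame IoUs into the success curve whose mean is the reported AUC. A quick numeric check with toy values:

``` python
import numpy as np
from track_utils import box_overlap_score, result_curve

A = np.array([0., 0., 10., 10.])   # [x_min, y_min, width, height]
B = np.array([5., 5., 10., 10.])
print(box_overlap_score(A, B))     # overlap 25, union 175 -> ~0.1429

ious = np.array([0.9, 0.6, 0.3, 0.0])
curve = result_curve(ious, 21)     # fraction of frames with IoU > t, for 21 thresholds in [0, 1]
print(curve.mean())                # this mean is the per-sequence AUC used by run_track_db()
```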
/tracker.py:
--------------------------------------------------------------------------------
1 | import os,sys,time,cv2
2 |
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import moviepy.editor as mpe
6 |
7 | import torch, torchvision
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 |
11 | from model.build_model import Track_Model
12 | from utils import crop_img, imread_to_rgb
13 | from track_utils import box_overlap_score, result_curve
14 |
15 |
16 | # tracker object
17 | class Tracker(nn.Module):
18 | def __init__(self, cfg, db_name=None, idx=-1):
19 | super(type(self), self).__init__()
20 | # model object
21 | cfg.batch_size = 1
22 | self.net = Track_Model(cfg).cuda()
23 | self.net.eval()
24 | self.cfg = cfg
25 | # tracking db placeholders
26 | self.track_dbnm = None
27 | self.track_path = None
28 | self.track_dict = None
29 | # load model weights
30 | self.chkpt_file = [ckpt for ckpt in sorted(os.listdir(cfg.CHKPT_PATH)) if ckpt.find(cfg.CHKPT_CODE)>0][idx]
31 | ckpt = torch.load(cfg.CHKPT_PATH+self.chkpt_file)
32 | self.net.load_state_dict(ckpt['model_state_dict'], strict=False)
33 | print 'ckpt: ' + self.chkpt_file
34 | # load db
35 | if db_name is not None:
36 | self.load_track_db(db_name)
37 |
38 |
39 | def load_track_db(self, name):
40 | # load dataset
41 | self.track_dbnm = name
42 | self.track_path = self.cfg.db_info[name]['path']
43 | self.track_dict = np.load(self.cfg.db_info[name]['dict'], allow_pickle=True).item()
44 | print 'dataset: ' + name
45 |
46 |
47 | def read_img_resize(self, imf):
48 | img_orig = imread_to_rgb(imf)
49 | h_orig, w_orig, _ = img_orig.shape
50 | MAX_H, MAX_W = self.cfg.im_size
51 | s_f = float(MAX_H) / float(h_orig)
52 | if float(w_orig)*s_f > MAX_W:
53 | s_f = float(MAX_W) / float(w_orig)
54 | img_mod = cv2.resize(img_orig, (int(w_orig*s_f), int(h_orig*s_f)) )
55 | h_mod, w_mod, _ = img_mod.shape
56 | img_zero = np.zeros([MAX_H, MAX_W, 3])
57 | img_zero[:h_mod, :w_mod, :] = img_mod
58 | return img_zero, s_f
59 |
60 |
61 | def run_track_seq(self, seq_name, save_res=False):
62 | # preliminary
63 | seq_dict = self.track_dict[seq_name]
64 | seq_path = seq_name if not seq_dict.has_key('path') else seq_dict['path']
65 | if self.track_dbnm != 'got10k':
66 | seq_path = os.path.join(seq_path, 'img/')
67 | seq_path = os.path.join(self.track_path, seq_path)
68 | # results placeholder
69 | seq_len = len(seq_dict['img'])
70 | seq_res, seq_fps = np.zeros([seq_len,4]), np.zeros(seq_len)
71 |
72 | # tracking part
73 | for i, imf in enumerate(seq_dict['img']):
74 | sys.stdout.write("\r"+str(i)+'/'+str(seq_len-1))
75 |
76 | # init frame, extract feats
77 | if i == 0:
78 | # init state = [xmin, ymin , width, height]
79 | state = seq_dict['gt'][0,:].copy().astype(float)
80 | seq_res[i] = state.copy()
81 | # init frame
82 | im_frame, s_f = self.read_img_resize(os.path.join(seq_path, imf))
83 | tic = time.time()
84 | # convert state to [xmin, ymin, xmax, ymax]*scale_factor
85 | state_mod = np.array([state[0], state[1], state[0]+state[2], state[1]+state[3]])*s_f
86 | state_net = torch.Tensor(state_mod).unsqueeze(0).cuda()
87 | # init feats
88 | net_im = torch.Tensor(im_frame).unsqueeze(0).permute(0,3,1,2).cuda()
89 | net_bb = [state_net]
90 | with torch.no_grad():
91 | xfa = self.net.get_feats_xfa(net_im, net_bb)
92 | seq_fps[i] = 1./(time.time()-tic)
93 | continue
94 |
95 | # subsequent frames
96 | # read img
97 | im_frame, _ = self.read_img_resize(os.path.join(seq_path, imf))
98 | tic = time.time()
99 | # find target
100 | net_im = torch.Tensor(im_frame).unsqueeze(0).permute(0,3,1,2).cuda()
101 | with torch.no_grad():
102 | net_out_bb, _, _ = self.net.forward_box(None,net_im, None, xfa=xfa, nbox=1) #add_box=[state_net]
103 | state_net = net_out_bb[0].detach()
104 | state_mod = state_net.squeeze().cpu().numpy() / s_f
105 | state = np.array([state_mod[0], state_mod[1], state_mod[2]-state_mod[0], state_mod[3]-state_mod[1]])
106 | # store results
107 | seq_res[i] = state.copy()
108 | seq_fps[i] = 1./(time.time()-tic)
109 |
110 | if save_res:
111 | np.savetxt('output/'+seq_name+'.txt', seq_res, fmt='%.4f', delimiter=',')
112 |
113 | return seq_res, seq_fps
114 |
115 |
116 | def run_track_db(self, seq_list=None, out_vid=False, calc_auc=True, save_res=False):
117 | # results placeholder
118 | db_res = dict()
119 | db_fps = []
120 | db_auc = []
121 | db_suc = []
122 | # per-sequence operation
123 | seq_list = self.track_dict.keys() if seq_list is None else seq_list
124 | seq_nums = len(seq_list)
125 | for s_i, seq in enumerate(seq_list):
126 | # seq name
127 | print '('+ str(s_i+1) +'/' + str(seq_nums) + '):' + seq
128 | # run tracking
129 | seq_res, seq_fps = self.run_track_seq(seq, save_res=save_res)
130 | db_res[seq] = seq_res
131 | db_fps.append(seq_fps.mean())
132 | # calc and display auc
133 | if calc_auc:
134 | seq_iou = box_overlap_score(seq_res, self.track_dict[seq]['gt'])
135 | seq_suc = seq_iou>0.5
136 | seq_auc = result_curve(seq_iou, 21)
137 | db_auc.append(seq_auc)
138 | db_suc.append(seq_suc)
139 | print ', fps: ' + str(seq_fps.mean())[:6],
140 | print ', suc: ' + str(float(np.sum(seq_suc))/seq_res.shape[0])[:6],
141 | print ', auc: ' + str(np.mean(seq_auc))[:6] + ', mean_auc: ' + str(np.mean(db_auc))[:6]
142 | if out_vid:
143 | self.draw_vid_seq(seq_res, seq)
144 |
145 | # display overall results
146 | if calc_auc:
147 | print '\nmean fps: ' + str(np.mean(db_fps))[:6]
148 | print 'mean suc: ' + str(np.mean(np.concatenate(db_suc)))[:6]
149 | print 'mean auc: ' + str(np.mean(db_auc))[:6]
150 |
151 | return db_res, db_fps, db_auc
152 |
153 |
154 | def draw_vid_seq(self, seq_res, seq_name):
155 | print '> make video seq...',
156 | # preliminaries
157 | seq_dict = self.track_dict[seq_name]
158 | seq_path = seq_name if not seq_dict.has_key('path') else seq_dict['path']
159 | if self.track_dbnm != 'got10k':
160 | seq_path = os.path.join(seq_path, 'img/')
161 | seq_path = os.path.join(self.track_path, seq_path)
162 | seq_len = len(seq_dict['img'])
163 | # draw for all frames
164 | im_slist = []
165 | for i, imf in enumerate(seq_dict['img']):
166 | # read img
167 | im_frame = imread_to_rgb(os.path.join(seq_path,imf))
168 | # draw bb = [xmin, ymin, width, height]
169 | bb = seq_res[i].astype(int)
170 | im_frame = cv2.rectangle(im_frame, (bb[0], bb[1]), (bb[0]+bb[2], bb[1]+bb[3]), (1,0,0), 3)
171 | # fnum text
172 | fnum_str = str('%04d'%i)
173 | im_frame = cv2.putText(im_frame, fnum_str, (0,im_frame.shape[0]), cv2.FONT_HERSHEY_DUPLEX, im_frame.shape[0]/350., (1,1,0))
174 | # save img
175 | im_sname = os.path.join('.temp/', seq_name +'_'+ fnum_str + '.jpg')
176 | im_slist.append(im_sname)
177 | plt.imsave(im_sname, im_frame)
178 |
179 | # encode video
180 | vid_clip = mpe.ImageSequenceClip(im_slist, fps=30)
181 | vid_clip.write_videofile('test.mp4', logger=None)
182 | print 'done'
183 | return
184 |
185 |
186 | def clean_temp_dir(self, temp_dir='.temp/'):
187 | flist = os.listdir(temp_dir)
188 | for f in flist:
189 | os.remove(os.path.join(temp_dir, f))
190 | print '> cleaned cache folder'
191 | return
192 |
193 |
--------------------------------------------------------------------------------
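`read_img_resize()` above rescales every frame so that it fits inside `cfg.im_size` while keeping the aspect ratio, then zero-pads the remainder. A numeric illustration of the scale-factor logic (the `im_size` below is hypothetical; the real value comes from `cfgs/cfg_test.py`):

``` python
MAX_H, MAX_W = 448, 672            # hypothetical cfg.im_size
h_orig, w_orig = 720, 1280         # an example HD frame
s_f = float(MAX_H) / h_orig        # 0.622... -> would make the width 796 > MAX_W
if float(w_orig) * s_f > MAX_W:
    s_f = float(MAX_W) / w_orig    # refit by width instead: 0.525
print('resized to %dx%d, zero-padded to %dx%d' % (int(w_orig * s_f), int(h_orig * s_f), MAX_W, MAX_H))
# -> resized to 672x378, zero-padded to 672x448
```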
/tracker_batch.py:
--------------------------------------------------------------------------------
1 | import os,sys,time,cv2
2 |
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import moviepy.editor as mpe
6 |
7 | import torch, torchvision
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 | from torch.utils.data import Dataset, DataLoader
11 |
12 | from model.build_model import Track_Model
13 | from utils import crop_img, imread_to_rgb
14 | from track_utils import box_overlap_score, result_curve
15 |
16 | _batch_size = 8
17 | _num_thread = 2
18 |
19 | # tracking dataset
20 | class Seq_dataset(Dataset):
21 | def __init__(self, cfg, seq_path, seq_imlist):
22 | self.cfg = cfg
23 | self.seq_path = seq_path
24 | self.seq_imlist = seq_imlist
25 | self.len = len(seq_imlist)-1
26 |
27 | def __len__(self):
28 | return self.len
29 |
30 | def read_img_resize(self, imf):
31 | img_orig = imread_to_rgb(imf)
32 | h_orig, w_orig, _ = img_orig.shape
33 | MAX_H, MAX_W = self.cfg.im_size
34 | s_f = float(MAX_H) / float(h_orig)
35 | if float(w_orig)*s_f > MAX_W:
36 | s_f = float(MAX_W) / float(w_orig)
37 | img_mod = cv2.resize(img_orig, (int(w_orig*s_f), int(h_orig*s_f)) )
38 | h_mod, w_mod, _ = img_mod.shape
39 | img_zero = np.zeros([MAX_H, MAX_W, 3])
40 | img_zero[:h_mod, :w_mod, :] = img_mod
41 | return img_zero, s_f
42 |
43 | def __getitem__(self, idx):
44 | seq_imlist = self.seq_imlist[1:]
45 | im_path = os.path.join(self.seq_path, seq_imlist[idx])
46 | im_frame,_ = self.read_img_resize(im_path)
47 | out_im = torch.Tensor(im_frame).permute(2,0,1)
48 | return out_im
49 |
50 |
51 | # tracker object
52 | class Tracker(nn.Module):
53 | def __init__(self, cfg, db_name=None, idx=-1):
54 | super(type(self), self).__init__()
55 | # model object
56 | cfg.batch_size = _batch_size
57 | self.net = Track_Model(cfg).cuda()
58 | self.net.eval()
59 | self.cfg = cfg
60 | # tracking db placeholders
61 | self.track_dbnm = None
62 | self.track_path = None
63 | self.track_dict = None
64 | # load model weights
65 | self.chkpt_file = [ckpt for ckpt in sorted(os.listdir(cfg.CHKPT_PATH)) if ckpt.find(cfg.CHKPT_CODE)>0][idx]
66 | ckpt = torch.load(cfg.CHKPT_PATH+self.chkpt_file)
67 | self.net.load_state_dict(ckpt['model_state_dict'], strict=False)
68 | print 'ckpt: ' + self.chkpt_file
69 | # load db
70 | if db_name is not None:
71 | self.load_track_db(db_name)
72 |
73 |
74 | def load_track_db(self, name):
75 | # load dataset
76 | self.track_dbnm = name
77 | self.track_path = self.cfg.db_info[name]['path']
78 | self.track_dict = np.load(self.cfg.db_info[name]['dict'], allow_pickle=True).item()
79 | print 'dataset: ' + name
80 |
81 |
82 | def read_img_resize(self, imf):
83 | img_orig = imread_to_rgb(imf)
84 | h_orig, w_orig, _ = img_orig.shape
85 | MAX_H, MAX_W = self.cfg.im_size
86 | s_f = float(MAX_H) / float(h_orig)
87 | if float(w_orig)*s_f > MAX_W:
88 | s_f = float(MAX_W) / float(w_orig)
89 | img_mod = cv2.resize(img_orig, (int(w_orig*s_f), int(h_orig*s_f)) )
90 | h_mod, w_mod, _ = img_mod.shape
91 | img_zero = np.zeros([MAX_H, MAX_W, 3])
92 | img_zero[:h_mod, :w_mod, :] = img_mod
93 | return img_zero, s_f
94 |
95 |
96 | def run_track_seq(self, seq_name, seq_path, seq_imlist, seq_gt, save_res=False):
97 | # preliminary
98 | if self.track_dbnm not in ['got10k', 'trackingnet', 'uav123', 'uav20l', 'nuspro']:
99 | seq_path = os.path.join(seq_path, 'img/')
100 | seq_path = os.path.join(self.track_path, seq_path)
101 | # results placeholder
102 | seq_len = len(seq_imlist)
103 | seq_res, seq_fps = [],[]
104 | # seq db
105 | seq_tdb = Seq_dataset(self.cfg, seq_path, seq_imlist)
106 | seq_tdl = DataLoader(seq_tdb, batch_size=self.cfg.batch_size, num_workers=_num_thread)
107 | # initial frame
108 | i = 0
109 | # init state = [xmin, ymin , width, height]
110 | state = seq_gt[0,:].copy().astype(float)
111 | seq_res.append(np.expand_dims(state.copy(),0))
112 | # init frame
113 | im_frame, s_f = self.read_img_resize(os.path.join(seq_path, seq_imlist[0]))
114 | # convert state to [xmin, ymin, xmax, ymax]*scale_factor
115 | state_mod = np.array([state[0], state[1], state[0]+state[2], state[1]+state[3]])*s_f
116 | # init feats
117 | net_im = torch.Tensor(im_frame).unsqueeze(0).permute(0,3,1,2).repeat_interleave(self.cfg.batch_size,0).cuda()
118 | net_bb = [torch.Tensor(state_mod).unsqueeze(0).cuda()]*self.cfg.batch_size
119 | with torch.no_grad():
120 | xfa = self.net.get_feats_xfa(net_im, net_bb)
121 |
122 | # tracking part
123 | for i, im_frame in enumerate(seq_tdl):
124 | sys.stdout.write("\r"+str((i)*self.cfg.batch_size)+'/'+str(seq_len))
125 | # subsequent frames
126 | # read img
127 | tic = time.time()
128 | temp_sz = im_frame.shape[0]
129 | net_im = torch.zeros(self.cfg.batch_size, 3, self.cfg.im_size[0], self.cfg.im_size[1])
130 | net_im[:temp_sz] = im_frame
131 | net_im = net_im.cuda()
132 | # find target
133 | with torch.no_grad():
134 | net_out_bb, _, _ = self.net.forward_box(None,net_im, None, xfa=xfa, nbox=1)
135 | state_mod = net_out_bb.squeeze().detach().cpu().numpy() / s_f
136 | state = np.zeros_like(state_mod)
137 | state[:,0], state[:,1], state[:,2], state[:,3] = state_mod[:,0], state_mod[:,1], state_mod[:,2]-state_mod[:,0], state_mod[:,3]-state_mod[:,1]
138 | # store results
139 | seq_res.append(state.copy())
140 | seq_fps.append((time.time()-tic))
141 |
142 | # concat dims
143 | seq_res = np.concatenate(seq_res)[:seq_len]
144 | seq_fps = 1./(np.sum(seq_fps)/float(seq_len))
145 | # save res
146 | if save_res:
147 |
148 | if self.track_dbnm == 'got10k':
149 | os.mkdir('output/'+seq_name)
150 | np.savetxt('output/'+seq_name+'/'+seq_name+'_001.txt', seq_res, fmt='%.4f', delimiter=',')
151 | else:
152 | np.savetxt('output/'+seq_name+'.txt', seq_res, fmt='%.4f', delimiter=',')
153 |
154 | return seq_res, seq_fps
155 |
156 |
157 | def run_track_db(self, seq_list=None, out_vid=False, calc_auc=True, save_res=False):
158 | # results placeholder
159 | db_res = dict()
160 | db_fps = []
161 | db_auc = []
162 | db_suc = []
163 | # per-sequence operation
164 | seq_list = self.track_dict.keys() if seq_list is None else seq_list
165 | seq_nums = len(seq_list)
166 | for s_i, seq in enumerate(seq_list):
167 | # print seq name
168 | print '('+ str(s_i+1) +'/' + str(seq_nums) + '):' + seq
169 | # seq path+imlist+gt
170 | seq_dict = self.track_dict[seq]
171 | seq_path = seq if not seq_dict.has_key('path') else seq_dict['path']
172 | seq_imlist = seq_dict['img']
173 | seq_gt = seq_dict['gt']
174 | # run tracking
175 | seq_res, seq_fps = self.run_track_seq(seq, seq_path, seq_imlist, seq_gt, save_res=save_res)
176 | db_res[seq] = seq_res
177 | db_fps.append(seq_fps.mean())
178 | # calc and display auc
179 | if calc_auc:
180 | seq_iou = box_overlap_score(seq_res, self.track_dict[seq]['gt'])
181 | seq_suc = seq_iou>0.5
182 | seq_auc = result_curve(seq_iou, 21)
183 | db_auc.append(seq_auc)
184 | db_suc.append(seq_suc)
185 | print ', fps: ' + str(seq_fps.mean())[:6],
186 | print ', suc: ' + str(float(np.sum(seq_suc))/seq_res.shape[0])[:6],
187 | print ', auc: ' + str(np.mean(seq_auc))[:6] + ', mean_auc: ' + str(np.mean(db_auc))[:6]
188 | if out_vid:
189 | self.draw_vid_seq(seq_res, seq)
190 |
191 | # display overall results
192 | if calc_auc:
193 | print '\nmean fps: ' + str(np.mean(db_fps))[:6]
194 | print 'mean suc: ' + str(np.mean(np.concatenate(db_suc)))[:6]
195 | print 'mean auc: ' + str(np.mean(db_auc))[:6]
196 |
197 | return db_res, db_fps, db_auc
198 |
199 |
200 | def draw_vid_seq(self, seq_res, seq_name):
201 | print '> make video seq...',
202 | # preliminaries
203 | seq_dict = self.track_dict[seq_name]
204 | seq_path = seq_name if not seq_dict.has_key('path') else seq_dict['path']
205 | if self.track_dbnm != 'got10k':
206 | seq_path = os.path.join(seq_path, 'img/')
207 | seq_path = os.path.join(self.track_path, seq_path)
208 | seq_len = len(seq_dict['img'])
209 | # draw for all frames
210 | im_slist = []
211 | for i, imf in enumerate(seq_dict['img']):
212 | # read img
213 | im_frame = imread_to_rgb(os.path.join(seq_path,imf))
214 | # draw bb = [xmin, ymin, width, height]
215 | bb = seq_res[i].astype(int)
216 | im_frame = cv2.rectangle(im_frame, (bb[0], bb[1]), (bb[0]+bb[2], bb[1]+bb[3]), (1,0,0), 3)
217 | # fnum text
218 | fnum_str = str('%04d'%i)
219 | im_frame = cv2.putText(im_frame, fnum_str, (0,im_frame.shape[0]), cv2.FONT_HERSHEY_DUPLEX, im_frame.shape[0]/350., (1,1,0))
220 | # save img
221 | im_sname = os.path.join('.temp/', seq_name +'_'+ fnum_str + '.jpg')
222 | im_slist.append(im_sname)
223 | plt.imsave(im_sname, im_frame)
224 |
225 | # encode video
226 | vid_clip = mpe.ImageSequenceClip(im_slist, fps=30)
227 | vid_clip.write_videofile('test.mp4', logger=None)
228 | print 'done'
229 | return
230 |
231 |
232 | def clean_temp_dir(self, temp_dir='.temp/'):
233 | flist = os.listdir(temp_dir)
234 | for f in flist:
235 | os.remove(os.path.join(temp_dir, f))
236 | print '> cleaned cache folder'
237 | return
238 |
239 |
--------------------------------------------------------------------------------
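A hypothetical driver for running the batch tracker on a standalone sequence via `run_track_seq()` above. All paths and names are placeholders; it assumes the frames live under `<seq_path>/img/` (with no dataset name set, the method appends `img/` itself) and sets `track_path` to an empty string by hand because no dataset dictionary is loaded:

``` python
import os
import numpy as np
from cfgs import cfg_test as cfg
from tracker_batch import Tracker

tracker = Tracker(cfg=cfg)          # no db_name: track a standalone sequence
tracker.track_path = ''             # run_track_seq() joins this with seq_path
seq_name = 'my_sequence'
seq_path = '/data/my_sequence'      # placeholder; frames expected in /data/my_sequence/img/
seq_imlist = sorted(os.listdir(os.path.join(seq_path, 'img')))
seq_gt = np.array([[120., 80., 64., 48.]])   # [x_min, y_min, width, height] of the first frame
seq_res, seq_fps = tracker.run_track_seq(seq_name, seq_path, seq_imlist, seq_gt, save_res=False)
```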
/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import cv2,time; from scipy.ndimage import center_of_mass  # center_of_mass() (assumed from scipy) is used by dist_succ() below
4 |
5 |
6 | def get_dtstr(sec=True):
7 | tst = time.localtime()
8 | if sec:
9 | outstr = str(tst.tm_year)[-2:] + str('%02d' % tst.tm_mon) + str('%02d' % tst.tm_mday) + str('%02d' % tst.tm_hour)+ str('%02d' % tst.tm_min)+ str('%02d' % tst.tm_sec)
10 | else:
11 | outstr = str(tst.tm_year)[-2:] + str('%02d' % tst.tm_mon) + str('%02d' % tst.tm_mday) + str('%02d' % tst.tm_hour)+ str('%02d' % tst.tm_min)
12 | return outstr
13 |
14 | def imread_to_rgb(path):
15 | img_in = np.flip(cv2.imread(path, flags=cv2.IMREAD_COLOR), 2)/255.
16 | return img_in
17 |
18 | def crop_img(I, x, y, w, h, center=False, mfill=False):
19 | im_h = I.shape[0]
20 | im_w = I.shape[1]
21 |
22 | if center:
23 | w0 = w // 2; w1 = w - w0 # w = w0+w1
24 | h0 = h // 2; h1 = h - h0 # h = h0+h1
25 |
26 | x_min = x - w0; x_max = x+w1-1;
27 | y_min = y - h0; y_max = y+h1-1;
28 | else:
29 | x_min = x; x_max = x+w-1;
30 | y_min = y; y_max = y+h-1;
31 |
32 | pad_l = 0; pad_r = 0;
33 | pad_u = 0; pad_d = 0;
34 |
35 | # bounds
36 | if x_min < 0: pad_l = -x_min; x_min = 0;
37 | if x_max > im_w-1: pad_r = x_max-(im_w-1); x_max = im_w-1;
38 | if y_min < 0: pad_u = -y_min; y_min = 0;
39 | if y_max > im_h-1: pad_d = y_max-(im_h-1); y_max = im_h-1;
40 |
41 | # crop & append
42 | J = I[y_min:y_max+1, x_min:x_max+1, :]
43 |
44 | # 0 size errors
45 | if J.shape[0] == 0 or J.shape[1] == 0:
46 | plt.imsave('crop_error_'+time.strftime('%y%m%d_%H%M%S',time.localtime())+'.png', I)
47 | print 'i: ',I.shape, (x,y,w,h),J.shape
48 | print 'i: ',(y_min,y_max+1),(x_min,x_max+1)
49 | # return black image for zero-dim images
50 | return np.zeros([h,w,3])
51 |
52 | if mfill:
53 | rsel = np.linspace(0, J.shape[0], 8, endpoint=False, dtype=int)
54 | csel = np.linspace(0, J.shape[1], 8, endpoint=False, dtype=int)
55 | fill = np.mean(J[rsel][:,csel], axis=(0,1))
56 | else:
57 | fill = (0,0,0)
58 | J = cv2.copyMakeBorder(J, pad_u,pad_d,pad_l,pad_r, cv2.BORDER_CONSTANT, value=fill)
59 | return J
60 |
61 |
62 | def draw_bb_img(img0, x_min,y_min,width,height, color, stroke):
63 | img = img0.copy()
64 | img_h = img.shape[0]; img_w = img.shape[1];
65 |
66 | x_rng = np.array(range(width)) + x_min
67 | y_rng = np.array(range(height))+ y_min
68 |
69 | x_rng[x_rng> img_w-1-stroke] = img_w-1-stroke
70 | y_rng[y_rng> img_h-1-stroke] = img_h-1-stroke
71 |
72 | x_max = np.max(x_rng)
73 | y_max = np.max(y_rng)
74 |
75 | img[y_min:y_min+stroke][:, x_rng, :] = color # up
76 | img[y_max-stroke:y_max][:, x_rng, :] = color # down
77 | img[:, x_min:x_min+stroke, :][y_rng] = color # left
78 | img[:, x_max-stroke:x_max, :][y_rng] = color # right
79 |
80 | return img
81 |
82 |
83 | def dist_succ(v_pred, v_gt, batch_size):
84 | maxvals = v_pred.max(axis=1).max(axis=1)
85 | v_gt_mod = v_gt.copy() + 1.
86 |
87 | idxs = list(); gt_idxs = list();
88 | for b_i in range(batch_size):
89 | maxpos = np.where(v_pred == maxvals[b_i])[1:3]
90 | if np.shape(maxpos)[1] > 1:
91 | maxpos = (np.array([maxpos[0][0]]), np.array([maxpos[1][0]]))
92 | idxs.append(maxpos)
93 | gt_idxs.append(center_of_mass(v_gt_mod[b_i]))
94 |
95 | idxs = np.array(idxs).reshape([batch_size, 2]).astype(float)
96 | gt_idxs = np.array(gt_idxs).reshape([batch_size, 2])
97 |
98 | dist = np.sum( ( idxs - gt_idxs )**2, axis=1 )
99 | dist = np.sqrt( dist )
100 | succ = (dist <= np.sqrt(2.))
101 |
102 | return dist, succ
103 |
104 |
105 | def down2n(x, n):
106 | # returns the length of input x after n poolings/strides of 2
107 | if n == 1:
108 | return np.ceil(x/2.).astype(int)
109 | else:
110 | return down2n(np.ceil(x/2.), n-1).astype(int)
111 |
112 |
113 | def gray2jet(I):
114 | # convert input gray image I to jet colormap image J
115 | # trapezoid func map [0,1]->[0,1] (rise:t0~t1, down:t2~t3)
116 | def tpz(xin, t0,t1,t2,t3):
117 | x = xin.copy()
118 | x[xin<=t0] = 0.
119 | x[(xin>t0)*(xin<=t1)] = (xin[(xin>t0)*(xin<=t1)] - t0) / (t1-t0)
120 | x[(xin>t1)*(xin<=t2)] = 1.
121 | x[(xin>t2)*(xin<=t3)] = (xin[(xin>t2)*(xin<=t3)] - t3) / (t2-t3)
122 | x[xin>t3] = 0.
123 | return x
124 |
125 | # respective rgb channel mappings
126 | J_r = tpz(I, 0.375, 0.625, 0.875, 1.125)
127 | J_g = tpz(I, 0.125, 0.375, 0.625, 0.875)
128 | J_b = tpz(I, -0.125, 0.125, 0.375, 0.625)
129 |
130 | J = np.zeros([I.shape[0], I.shape[1], 3])
131 | J[:,:,0] = J_r
132 | J[:,:,1] = J_g
133 | J[:,:,2] = J_b
134 | return J
135 |
136 |
--------------------------------------------------------------------------------
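`crop_img()` above clamps the crop window to the image and pads whatever falls outside, either with zeros or with the patch mean. A small usage sketch with a random toy image:

``` python
import numpy as np
from utils import crop_img

I = np.random.rand(100, 200, 3)                       # toy RGB image in [0, 1]
patch = crop_img(I, x=-10, y=-10, w=50, h=50)         # window sticks out past the top-left corner
print(patch.shape)                                    # (50, 50, 3); the missing border is zero-filled

# centered crop around (x, y); mfill=True would pad with the patch mean if the window ran out of bounds
patch_c = crop_img(I, x=90, y=40, w=40, h=40, center=True, mfill=True)
print(patch_c.shape)                                  # (40, 40, 3)
```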