├── .gitignore ├── README.md ├── _figs ├── overview.png └── plots.png ├── cfgs ├── __init__.py ├── cfg_res18.py ├── cfg_res50.py └── cfg_test.py ├── ckpt └── __init__.py ├── dict └── lasot_dict_test.npy ├── model ├── __init__.py ├── box_utils.py ├── boxes.py ├── build_model.py ├── cbam.py ├── context.py ├── fcos.py ├── focal_loss.py ├── loss.py ├── non_local.py ├── rcnn_module.py ├── resnet.py └── rpn_module.py ├── output └── __init__.py ├── test_tracker.py ├── th_utils.py ├── track_utils.py ├── tracker.py ├── tracker_batch.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .temp/ 2 | .ipynb_checkpoints/ 3 | *.pyc 4 | *.pth 5 | *.txt 6 | *.tar 7 | *.mp4 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Visual Tracking by TridentAlign and Context Embedding (TACT) 2 | 3 | Test code for "Visual Tracking by TridentAlign and Context Embedding" 4 | 5 | #### Janghoon Choi, Junseok Kwon, and Kyoung Mu Lee 6 | 7 | [arXiv paper](https://arxiv.org/pdf/2007.06887.pdf) 8 | 9 | ## Overall Framework 10 | 11 | 12 | 13 | 14 | ## Results on LaSOT test set 15 | 16 | 17 | 18 | - Link to [LaSOT dataset](https://cis.temple.edu/lasot/) 19 | - Raw results available on [Google drive](https://drive.google.com/drive/folders/1ShAPX-ho-b_JjEenPCjzy1m4UN-ooSot?usp=sharing) 20 | 21 | 22 | ## Dependencies 23 | 24 | - Ubuntu 18.04 25 | - Python==2.7.17 26 | - numpy==1.16.5 27 | - pytorch==1.3.0 28 | - matplotlib==2.2.4 29 | - opencv==4.1.0.25 30 | - moviepy==1.0.0 31 | - tqdm==4.32.1 32 | 33 | 34 | ## Usage 35 | 36 | ### Prerequisites 37 | 38 | - Download network weights from [Google drive](https://drive.google.com/drive/folders/1ShAPX-ho-b_JjEenPCjzy1m4UN-ooSot?usp=sharing) 39 | - Copy network weight files `ckpt_res18.tar` and `ckpt_res50.tar` to `ckpt/` folder 40 | - Choose between `TACT-18` and `TACT-50` by modifying the `cfgs/cfg_test.py` file (default: `TACT-50`) 41 | 42 | ### To test tracker on LaSOT test set 43 | 44 | - Download LaSOT dataset from [link](https://cis.temple.edu/lasot/) 45 | - Modify `cfgs/cfg_test.py` file to local `LaSOTBenchmark` folder path 46 | - Run `python test_tracker.py` 47 | 48 | ### To test tracker on an arbitrary sequence 49 | 50 | - Using `run_track_seq()` function in `tracker_batch.py`, tracker can run on an arbitrary sequence 51 | - Provide the function with following variables 52 | - `seq_name` : name of the given sequence 53 | - `seq_path` : path to the given sequence 54 | - `seq_imlist` : list of image file names of the given sequence 55 | - `seq_gt` : ground truth box annotations of the given sequence (may only contain annotation for initial frame, `[x_min,y_min,width,height]` format) 56 | 57 | ### Raw results on other datasets 58 | 59 | - Link to raw results on [Google drive](https://drive.google.com/drive/folders/1ShAPX-ho-b_JjEenPCjzy1m4UN-ooSot?usp=sharing) 60 | - Results for test sets of [LaSOT](https://cis.temple.edu/lasot/), [OxUvA](https://oxuva.github.io/long-term-tracking-benchmark/), [GOT-10k](http://got-10k.aitestunion.com/), [TrackingNet](https://tracking-net.org/) 61 | 62 | 63 | ## Citation 64 | 65 | If you find our work useful for your research, please consider citing the following paper: 66 | 67 | ``` text 68 | @article{choi2020tact, 69 | title={Visual tracking by tridentalign and context embedding}, 70 | author={Choi, Janghoon and Kwon, Junseok and Lee, Kyoung Mu}, 71 | 
journal={arXiv preprint arXiv:2007.06887}, 72 | year={2020} 73 | } 74 | ``` 75 | 76 | 77 | -------------------------------------------------------------------------------- /_figs/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/_figs/overview.png -------------------------------------------------------------------------------- /_figs/plots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/_figs/plots.png -------------------------------------------------------------------------------- /cfgs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/cfgs/__init__.py -------------------------------------------------------------------------------- /cfgs/cfg_res18.py: -------------------------------------------------------------------------------- 1 | # define training flags 2 | EXP_COMMENT = 'res18_final_model' 3 | 4 | # define training parameters 5 | im_size = (400, 666) # image max sizes (height, width) 6 | batch_size = 4 # batch size for training 7 | batch_size_val = 8 # batch size for validation 8 | 9 | name_bbnet = 'resnet18' # choose backbone : [resnet18, resnet34, resnet50, wide_resnet50_2, resnext50_32x4d] 10 | conv_npool = (2,3,4,4) # numof pooling for each output for backbone network (default:[2,3,4,5]) 11 | roip_size = 5 # spatial sizeof roi-aligned features (default:7x7) 12 | head_nconv = 2 # numof conv layers for detection heads 13 | head_nfeat = 256 # channel dim. 
for feature maps in detection heads 14 | head_nlocl = True # use or not use nonlocal layer (embedded gaussian) 15 | head_dconv = True # use or not use dilated convs 16 | head_negff = False # use or not use negative feats for final scoring 17 | head_oproi = False # use or not use roi overlap prediction branch 18 | head_ctxff = (True, 3) # use or not use context feature fusion + fusion scheme number (0:cat,1:add,2:cbam,3:film) 19 | bbox_thres = (0.5, 0.4) # bbox thresholds for pos/neg samples for training 20 | nms_param = (0.90, 64) # nms params (overlap_threshold_pos, _neg, num_candidate_boxes) 21 | nft_param = (0.4, 6) # negative feat param (overlap_threshold, num_negative_boxes) 22 | 23 | num_epochs = int(1e+3) # numof training epochs 24 | training_iter = int(1e+5) # numof training iterations per epoch 25 | lr_start = 1e-4 # learning rate (initial) 26 | lr_decay = 0.50 # learning rate decay rate per loop 27 | lr_decay_step = 2000000 # learning rate decay steps 28 | w_decay = 1e-5 # weight decay rate for optimizer 29 | loss_lambda = 1.00 # balancing term for loss function (cls + lambda*reg) 30 | loss_gamma = 2.00 # focal loss gamma value (penalty on easy examples) 31 | loss_alpha = None # focal loss alpha value (pos/neg example balancing) 32 | 33 | 34 | # ===== PATH variables ===== 35 | # checkpoint/init path + experiment number 36 | CHKPT_PATH, INITP_PATH = 'ckpt/', 'init/init_res18_weights.tar' 37 | CHKPT_CODE = '' 38 | # validation set dump path 39 | VALID_PATH = '/home/jhchoi/datasets3/track_valid_set_fcos.npz' 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /cfgs/cfg_res50.py: -------------------------------------------------------------------------------- 1 | # define training flags 2 | EXP_COMMENT = 'res50_final_model' 3 | 4 | # define training parameters 5 | im_size = (400, 666) # image max sizes (height, width) 6 | batch_size = 4 # batch size for training 7 | batch_size_val = 8 # batch size for validation 8 | 9 | name_bbnet = 'resnet50' # choose backbone : [resnet18, resnet34, resnet50, wide_resnet50_2, resnext50_32x4d] 10 | conv_npool = (2,3,4,4) # numof pooling for each output for backbone network (default:[2,3,4,5]) 11 | roip_size = 5 # spatial sizeof roi-aligned features (default:7x7) 12 | head_nconv = 2 # numof conv layers for detection heads 13 | head_nfeat = 256 # channel dim. 
for feature maps in detection heads 14 | head_nlocl = True # use or not use nonlocal layer (embedded gaussian) 15 | head_dconv = True # use or not use dilated convs 16 | head_negff = False # use or not use negative feats for final scoring 17 | head_oproi = False # use or not use roi overlap prediction branch 18 | head_ctxff = (True, 3) # use or not use context feature fusion + fusion scheme number (0:cat,1:add,2:cbam,3:film) 19 | bbox_thres = (0.5, 0.4) # bbox thresholds for pos/neg samples for training 20 | nms_param = (0.90, 64) # nms params (overlap_threshold_pos, _neg, num_candidate_boxes) 21 | nft_param = (0.4, 6) # negative feat param (overlap_threshold, num_negative_boxes) 22 | 23 | num_epochs = int(1e+3) # numof training epochs 24 | training_iter = int(1e+5) # numof training iterations per epoch 25 | lr_start = 1e-4 # learning rate (initial) 26 | lr_decay = 0.50 # learning rate decay rate per loop 27 | lr_decay_step = 2000000 # learning rate decay steps 28 | w_decay = 1e-5 # weight decay rate for optimizer 29 | loss_lambda = 1.00 # balancing term for loss function (cls + lambda*reg) 30 | loss_gamma = 2.00 # focal loss gamma value (penalty on easy examples) 31 | loss_alpha = None # focal loss alpha value (pos/neg example balancing) 32 | 33 | 34 | # ===== PATH variables ===== 35 | # checkpoint/init path + experiment number 36 | CHKPT_PATH, INITP_PATH = 'ckpt/', 'init/init_res50_weights.tar' 37 | CHKPT_CODE = '' 38 | # validation set dump path 39 | VALID_PATH = '/home/jhchoi/datasets3/track_valid_set_fcos.npz' 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /cfgs/cfg_test.py: -------------------------------------------------------------------------------- 1 | from cfg_res50 import * 2 | 3 | 4 | # ===== PATH variables ===== 5 | # checkpoint path + experiment number 6 | CHKPT_PATH = 'ckpt/' 7 | CHKPT_CODE = 'res50' 8 | 9 | 10 | # construct dataset info dict 11 | db_info = dict() 12 | # test sets 13 | db_info['lasot'] = {'size': 280, 14 | 'path' : '/home/jhchoi/datasets5/LaSOTBenchmark/', 15 | 'dict' : 'dict/lasot_dict_test.npy'} 16 | 17 | -------------------------------------------------------------------------------- /ckpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/ckpt/__init__.py -------------------------------------------------------------------------------- /dict/lasot_dict_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/dict/lasot_dict_test.npy -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/model/__init__.py -------------------------------------------------------------------------------- /model/box_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | 4 | # code from https://github.com/amdegroot/ssd.pytorch 5 | 6 | def point_form(boxes): 7 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 8 | representation for comparison to point form ground truth data. 9 | Args: 10 | boxes: (tensor) center-size default boxes from priorbox layers. 
11 | Return: 12 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 13 | """ 14 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 15 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 16 | 17 | 18 | def center_size(boxes): 19 | """ Convert prior_boxes to (cx, cy, w, h) 20 | representation for comparison to center-size form ground truth data. 21 | Args: 22 | boxes: (tensor) point_form boxes 23 | Return: 24 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 25 | """ 26 | return torch.cat((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 27 | boxes[:, 2:] - boxes[:, :2], 1) # w, h 28 | 29 | 30 | def intersect(box_a, box_b): 31 | """ We resize both tensors to [A,B,2] without new malloc: 32 | [A,2] -> [A,1,2] -> [A,B,2] 33 | [B,2] -> [1,B,2] -> [A,B,2] 34 | Then we compute the area of intersect between box_a and box_b. 35 | Args: 36 | box_a: (tensor) bounding boxes, Shape: [A,4]. 37 | box_b: (tensor) bounding boxes, Shape: [B,4]. 38 | Return: 39 | (tensor) intersection area, Shape: [A,B]. 40 | """ 41 | A = box_a.size(0) 42 | B = box_b.size(0) 43 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 44 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 45 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 46 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 47 | inter = torch.clamp((max_xy - min_xy), min=0) 48 | return inter[:, :, 0] * inter[:, :, 1] 49 | 50 | 51 | def jaccard(box_a, box_b, eps=0.): 52 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 53 | is simply the intersection over union of two boxes. Here we operate on 54 | ground truth boxes and default boxes. 55 | E.g.: 56 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 57 | Args: 58 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] 59 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] 60 | Return: 61 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 62 | """ 63 | inter = intersect(box_a, box_b) 64 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 65 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 66 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 67 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 68 | union = area_a + area_b - inter 69 | return (inter + eps) / (union + eps) # [A,B] 70 | 71 | 72 | def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx): 73 | """Match each prior box with the ground truth box of the highest jaccard 74 | overlap, encode the bounding boxes, then return the matched indices 75 | corresponding to both confidence and location preds. 76 | Args: 77 | threshold: (float) The overlap threshold used when mathing boxes. 78 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 79 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 80 | variances: (tensor) Variances corresponding to each prior coord, 81 | Shape: [num_priors, 4]. 82 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 83 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 84 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 85 | idx: (int) current batch index 86 | Return: 87 | The matched indices corresponding to 1)location and 2)confidence preds. 
88 | """ 89 | # jaccard index 90 | overlaps = jaccard( 91 | truths, 92 | point_form(priors) 93 | ) 94 | # (Bipartite Matching) 95 | # [1,num_objects] best prior for each ground truth 96 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 97 | # [1,num_priors] best ground truth for each prior 98 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 99 | best_truth_idx.squeeze_(0) 100 | best_truth_overlap.squeeze_(0) 101 | best_prior_idx.squeeze_(1) 102 | best_prior_overlap.squeeze_(1) 103 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 104 | # TODO refactor: index best_prior_idx with long tensor 105 | # ensure every gt matches with its prior of max overlap 106 | for j in range(best_prior_idx.size(0)): 107 | best_truth_idx[best_prior_idx[j]] = j 108 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 109 | conf = labels[best_truth_idx] + 1 # Shape: [num_priors] 110 | conf[best_truth_overlap < threshold] = 0 # label as background 111 | loc = encode(matches, priors, variances) 112 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 113 | conf_t[idx] = conf # [num_priors] top class label for each prior 114 | 115 | 116 | def encode(matched, priors, variances): 117 | """Encode the variances from the priorbox layers into the ground truth boxes 118 | we have matched (based on jaccard overlap) with the prior boxes. 119 | Args: 120 | matched: (tensor) Coords of ground truth for each prior in point-form 121 | Shape: [num_priors, 4]. 122 | priors: (tensor) Prior boxes in center-offset form 123 | Shape: [num_priors,4]. 124 | variances: (list[float]) Variances of priorboxes 125 | Return: 126 | encoded boxes (tensor), Shape: [num_priors, 4] 127 | """ 128 | 129 | # dist b/t match center and prior's center 130 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 131 | # encode variance 132 | g_cxcy /= (variances[0] * priors[:, 2:]) 133 | # match wh / prior wh 134 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 135 | g_wh = torch.log(g_wh) / variances[1] 136 | # return target for smooth_l1_loss 137 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 138 | 139 | 140 | # Adapted from https://github.com/Hakuyume/chainer-ssd 141 | def decode(loc, priors, variances): 142 | """Decode locations from predictions using priors to undo 143 | the encoding we did for offset regression at train time. 144 | Args: 145 | loc (tensor): location predictions for loc layers, 146 | Shape: [num_priors,4] 147 | priors (tensor): Prior boxes in center-offset form. 148 | Shape: [num_priors,4]. 149 | variances: (list[float]) Variances of priorboxes 150 | Return: 151 | decoded bounding box predictions 152 | """ 153 | 154 | boxes = torch.cat(( 155 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 156 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 157 | boxes[:, :2] -= boxes[:, 2:] / 2 158 | boxes[:, 2:] += boxes[:, :2] 159 | return boxes 160 | 161 | 162 | def log_sum_exp(x): 163 | """Utility function for computing log_sum_exp while determining 164 | This will be used to determine unaveraged confidence loss across 165 | all examples in a batch. 
166 | Args: 167 | x (Variable(tensor)): conf_preds from conf layers 168 | """ 169 | x_max = x.data.max() 170 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 171 | 172 | 173 | # Original author: Francisco Massa: 174 | # https://github.com/fmassa/object-detection.torch 175 | # Ported to PyTorch by Max deGroot (02/01/2017) 176 | def nms(boxes, scores, overlap=0.5, top_k=200): 177 | """Apply non-maximum suppression at test time to avoid detecting too many 178 | overlapping bounding boxes for a given object. 179 | Args: 180 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 181 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 182 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 183 | top_k: (int) The Maximum number of box preds to consider. 184 | Return: 185 | The indices of the kept boxes with respect to num_priors. 186 | """ 187 | 188 | keep = scores.new(scores.size(0)).zero_().long() 189 | if boxes.numel() == 0: 190 | return keep 191 | x1 = boxes[:, 0] 192 | y1 = boxes[:, 1] 193 | x2 = boxes[:, 2] 194 | y2 = boxes[:, 3] 195 | area = torch.mul(x2 - x1, y2 - y1) 196 | v, idx = scores.sort(0) # sort in ascending order 197 | # I = I[v >= 0.01] 198 | idx = idx[-top_k:] # indices of the top-k largest vals 199 | xx1 = boxes.new() 200 | yy1 = boxes.new() 201 | xx2 = boxes.new() 202 | yy2 = boxes.new() 203 | w = boxes.new() 204 | h = boxes.new() 205 | 206 | # keep = torch.Tensor() 207 | count = 0 208 | while idx.numel() > 0: 209 | i = idx[-1] # index of current largest val 210 | # keep.append(i) 211 | keep[count] = i 212 | count += 1 213 | if idx.size(0) == 1: 214 | break 215 | idx = idx[:-1] # remove kept element from view 216 | # load bboxes of next highest vals 217 | torch.index_select(x1, 0, idx, out=xx1) 218 | torch.index_select(y1, 0, idx, out=yy1) 219 | torch.index_select(x2, 0, idx, out=xx2) 220 | torch.index_select(y2, 0, idx, out=yy2) 221 | # store element-wise max with next highest score 222 | xx1 = torch.clamp(xx1, min=x1[i]) 223 | yy1 = torch.clamp(yy1, min=y1[i]) 224 | xx2 = torch.clamp(xx2, max=x2[i]) 225 | yy2 = torch.clamp(yy2, max=y2[i]) 226 | w.resize_as_(xx2) 227 | h.resize_as_(yy2) 228 | w = xx2 - xx1 229 | h = yy2 - yy1 230 | # check sizes of xx1 and xx2.. 
after each iteration 231 | w = torch.clamp(w, min=0.0) 232 | h = torch.clamp(h, min=0.0) 233 | inter = w*h 234 | # IoU = i / (area(a) + area(b) - i) 235 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 236 | union = (rem_areas - inter) + area[i] 237 | IoU = inter/union # store result in iou 238 | # keep only elements with an IoU <= overlap 239 | idx = idx[IoU.le(overlap)] 240 | return keep, count 241 | 242 | -------------------------------------------------------------------------------- /model/boxes.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torchvision.ops.boxes import nms 5 | from time import time 6 | 7 | from th_utils import generate_reg_coords 8 | 9 | # box prediction module given response maps 10 | class BoxModule(nn.Module): 11 | def __init__(self, cfg): 12 | super(type(self), self).__init__() 13 | self.im_size = cfg.im_size 14 | # nms parameters 15 | self.bb_thres = cfg.nms_param[0] 16 | self.bb_nums = cfg.nms_param[1] 17 | # default anchor box center coordinates 18 | self.anc = torch.Tensor(generate_reg_coords(cfg)).unsqueeze(0).flatten(1,-2).cuda() 19 | 20 | 21 | def forward(self,cl,re, nms_param=None): 22 | # define nms parameters 23 | if nms_param is not None: 24 | bb_thr = nms_param[0] 25 | bb_num = nms_param[1] 26 | else: 27 | bb_thr = self.bb_thres 28 | bb_num = self.bb_nums 29 | 30 | # softmax class -> obtain scoremap 31 | ff = torch.exp(cl[...,0]) / (torch.exp(cl[...,0])+torch.exp(cl[...,1])) # [bnum, map_h, map_w] 32 | batch_size = ff.shape[0] 33 | # flatten scoremaps and regvals 34 | ff_f = ff.flatten(1) # [bnum,N] 35 | re_f = re.flatten(1,-2) #[bnum,N,ltrb] 36 | 37 | # translate regressed vals to bbox coordinates [bnum, N, x0y0x1y1] 38 | bb_f = self.anc.repeat_interleave(batch_size,dim=0).clone() # anchor coordinates to xyxy [bnum,N,xyxy] 39 | bb_f[...,0] -= re_f[...,0] # x_min = x_anc - left 40 | bb_f[...,1] -= re_f[...,1] # y_min = y_anc - top 41 | bb_f[...,2] += re_f[...,2] # x_max = x_anc + right 42 | bb_f[...,3] += re_f[...,3] # y_max = y_anc + down 43 | 44 | # cutoff boundary values 45 | xmin,ymin,xmax,ymax = bb_f[...,0],bb_f[...,1],bb_f[...,2],bb_f[...,3] 46 | xmin[xmin<0] = 0 47 | ymin[ymin<0] = 0 48 | xmax[xmax>self.im_size[1]-1] = self.im_size[1]-1 49 | ymax[ymax>self.im_size[0]-1] = self.im_size[0]-1 50 | 51 | # per-batch nms 52 | out_bb, out_ff = [], [] 53 | for i in range(batch_size): 54 | ffi = ff_f[i] 55 | bbi = bb_f[i] 56 | b_idx = nms(bbi, ffi, bb_thr) 57 | # if numof boxes to choose is larger than obtained numof boxes 58 | b_sel = torch.LongTensor(range(bb_num)).cuda() 59 | b_sel[b_sel>len(b_idx)-1] = len(b_idx)-1 60 | # choose and store boxes 61 | b_box = bbi[b_idx[b_sel]] 62 | out_bb.append(b_box) 63 | out_ff.append(ffi[b_idx[b_sel]]) 64 | 65 | # output : list of boxes where len(list)=batch_size, list[i]=[num_box,xyxy] 66 | return out_bb, out_ff 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /model/build_model.py: -------------------------------------------------------------------------------- 1 | import torch, time 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torchvision import transforms 5 | 6 | from box_utils import jaccard 7 | import resnet as resnet 8 | from rpn_module import RPN_Module 9 | from rcnn_module import RCNN_Module 10 | 11 | class Track_Model(nn.Module): 12 | def __init__(self,cfg): 13 | super(type(self), 
self).__init__() 14 | # dims and flags 15 | self.head_nfeat = cfg.head_nfeat 16 | self.head_negff = cfg.head_negff 17 | self.head_oproi = cfg.head_oproi 18 | self.head_ctxff = cfg.head_ctxff 19 | self.roip_size = cfg.roip_size 20 | self.nft_param = cfg.nft_param 21 | # backbone convnet 22 | self.backbone = getattr(resnet, cfg.name_bbnet)(cfg=cfg) 23 | # channel dim for backbone output featmap 24 | bb_ch = self.backbone(torch.zeros(1,3,64,64)).shape[1] 25 | # rpn module for proposal generation 26 | self.rpn = RPN_Module(cfg, bb_ch) 27 | # rcnn module for matching and refinement 28 | self.rcnn = RCNN_Module(cfg) 29 | 30 | def normalize_tensor(self, x, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]): 31 | # x: batch image tensor [bnum, 3, h, w] 32 | x[:,0],x[:,1],x[:,2] = (x[:,0]-mean[0])/std[0], (x[:,1]-mean[1])/std[1], (x[:,2]-mean[2])/std[2] 33 | return x 34 | 35 | def forward(self, x,y, xb, xfa=None, add_box=None): 36 | # x : query image, Tensor, [bnum, 3, img_h, img_w] 37 | # y : search image, Tensor, [bnum, 3, img_h, img_w] 38 | # xb : bbox coordinates for pos samples: list, [list_posbb] where len(list_bb)=bnum, list[i] = Tensor[1,4] 39 | # xfa : xfeats for feature reuse : tuple (xfa_tri, xfa_pos, xfa_neg) 40 | 41 | # pool pos/neg feats from x (if negft:true) 42 | xfa_in = self.get_feats_xfa(x, xb) if xfa is None else xfa 43 | # get feats from backbone (if not available) 44 | xf = self.backbone(self.normalize_tensor(x)) if xfa_in is None else None 45 | yf = self.backbone(self.normalize_tensor(y)) 46 | # get roi proposals, pooled feats and rpn outputs 47 | rois, scores, rpn_feats, out_rpn = self.rpn(xf, yf, xb, xfa_in[0], add_box) 48 | # matching confidence scores and bbox refinement 49 | rcnn_feats = (xfa_in[1], xfa_in[2], rpn_feats[2]) #(pos_feat, neg_feat, can_feat) 50 | out_rcnn = self.rcnn(rcnn_feats, rois) #(cf,op,bb,roi) 51 | 52 | return out_rpn, out_rcnn 53 | 54 | 55 | def forward_box(self, x,y, xb, xfa=None, add_box=None, nbox=1): 56 | # params 57 | num_batch = y.shape[0] 58 | # get final outputs 59 | out_rpn, out_rcnn = self.forward(x,y, xb, xfa, add_box) 60 | out_cf, out_op, out_bb, out_br = out_rcnn 61 | # choose single box with max score for each batch - obtain scores, choose max score idxs for each batch 62 | # pos score + mean neg score 63 | out_ff_pos = torch.exp(out_cf[...,0,0]) / (torch.exp(out_cf[...,0,0])+torch.exp(out_cf[...,0,1])) 64 | out_ff_neg = torch.exp(out_cf[...,1:,1]) / (torch.exp(out_cf[...,1:,0])+torch.exp(out_cf[...,1:,1])) if self.head_negff else 1. 65 | # product of negative scores 66 | out_ff_neg = torch.prod(out_ff_neg, dim=-1) if self.head_negff else 1. #torch.mean/torch.sum 67 | # overlap score 68 | out_op = torch.sigmoid(out_op[...,0]) if self.head_oproi else 1. 
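# Score composition used below: out_ff_pos is the softmax probability that a candidate
# matches the positive template (cf[...,0,:]); out_ff_neg is the product over the negative
# templates of the probability that the candidate does NOT match them (only if head_negff);
# out_op is the sigmoid of the predicted RoI overlap (only if head_oproi). With the released
# cfg_res18/cfg_res50 settings (head_negff=False, head_oproi=False) both auxiliary terms
# stay at 1., so the final score reduces to the positive softmax probability.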
69 | 70 | # fianl score = pos_score*overlap_score*neg_score 71 | out_ff = out_ff_pos*out_ff_neg*out_op 72 | sort_idxs = out_ff.argsort(descending=True, dim=1) 73 | # returns bb coordinates for each batch 74 | out_bb_b = [] 75 | out_ff_b = [] 76 | for i in range(num_batch): 77 | out_bb_b.append(out_bb[i,sort_idxs[i,:nbox]]) # out_bb out_br 78 | out_ff_b.append(out_ff[i,sort_idxs[i,:nbox]]) 79 | out_bb_b = torch.stack(out_bb_b) 80 | out_ff_b = torch.stack(out_ff_b) 81 | 82 | return out_bb_b, out_ff_b, (out_rpn, out_rcnn) 83 | 84 | 85 | def get_feats_xfa(self, x, xb): 86 | # params 87 | num_batch = x.shape[0] 88 | thres,nfeat = self.nft_param 89 | nbox_num, nbox_thr = self.rpn.boxes.bb_nums, self.rpn.boxes.bb_thres 90 | # change numof candidate negative boxes 91 | self.rpn.boxes.bb_nums, self.rpn.boxes.bb_thres = 64,0.5 92 | # get pos and neg feats from query img 93 | xf = self.backbone(self.normalize_tensor(x)) 94 | # roi proposals and feats 95 | rois, scores, feats, _ = self.rpn(xf, xf, xb, add_box=xb, pool_xf=True) 96 | xfa_tri = feats[0] 97 | xfa_pos = feats[2][:,-1] 98 | yfa = feats[2][:,:-1] 99 | # negative feature mining inside xf 100 | if self.head_negff: 101 | xfa_neg = torch.zeros(num_batch, nfeat, self.head_nfeat, self.roip_size, self.roip_size).cuda() 102 | for i in range(num_batch): 103 | # get ious per batch, choose feature idxs with lower iou < thres 104 | xb_i, roi_i, score_i = xb[i], rois[i][:-1,:], scores[i] 105 | iou_i = jaccard(xb_i, roi_i)[0] 106 | idx_sel = torch.nonzero( iou_i < thres )[:,0] 107 | idx_sel = idx_sel[:nfeat] 108 | # if numof features insufficient: repeat last idx 109 | if len(idx_sel)==0: 110 | continue 111 | if len(idx_sel) output: rois+context embedded features -> to rcnn module) 12 | class ContextModule(nn.Module): 13 | def __init__(self,cfg): 14 | super(type(self), self).__init__() 15 | # params 16 | self.im_size = cfg.im_size 17 | self.map_size = (down2n(cfg.im_size[0],cfg.conv_npool[-1]),down2n(cfg.im_size[1],cfg.conv_npool[-1])) 18 | self.scale_f = float(self.map_size[0]) / float(self.im_size[0]) 19 | self.pool_size = cfg.roip_size 20 | self.head_nfeat = cfg.head_nfeat 21 | self.head_ctxff = cfg.head_ctxff 22 | self.num_ctxff = cfg.nft_param[1] 23 | self.ctx_param = (0.5,self.num_ctxff) #4 24 | # box module 25 | self.boxes = BoxModule(cfg) 26 | 27 | # variables w.r.t. 
different fusion schemes 28 | if self.head_ctxff[1]==0: 29 | fdim,reduce = (self.head_nfeat+2)*3-2, 1 30 | # simple concat 31 | self.simple = nn.Sequential(*[nn.Conv2d(fdim, self.head_nfeat, 3,1,1), nn.ReLU(), 32 | nn.Conv2d(self.head_nfeat, self.head_nfeat, 3,1,1), nn.ReLU(), 33 | nn.Conv2d(self.head_nfeat, self.head_nfeat, 1,1,0)]) 34 | 35 | elif self.head_ctxff[1]==1: 36 | fdim,reduce = (self.head_nfeat+2)*2, 1 37 | # simple addition 38 | self.simple = nn.Sequential(*[nn.Conv2d(fdim, self.head_nfeat, 3,1,1), nn.ReLU(), 39 | nn.Conv2d(self.head_nfeat, self.head_nfeat, 3,1,1), nn.ReLU(), 40 | nn.Conv2d(self.head_nfeat, self.head_nfeat, 1,1,0)]) 41 | 42 | elif self.head_ctxff[1]==2: 43 | # attention (cbam) based 44 | fdim,reduce = (self.head_nfeat+2)*2, 1 45 | # channel attention branch 46 | self.avg_pool, self.max_pool = nn.AdaptiveAvgPool2d(1), nn.AdaptiveMaxPool2d(1) 47 | self.conv1, self.conv2 = nn.Conv2d(fdim, self.head_nfeat//reduce, 1), nn.Conv2d(self.head_nfeat//reduce, self.head_nfeat, 1) 48 | self.relu1 = nn.ReLU() 49 | # spatial attention branch 50 | self.conv3 = nn.Conv2d(2, 1, 5, 1, 2) 51 | self.sigmoid = nn.Sigmoid() 52 | 53 | elif self.head_ctxff[1]==3: 54 | # film based 55 | fdim,reduce = (self.head_nfeat+2)*2, 1 56 | # common conv+relu 57 | self.conv1 = nn.Sequential(*[nn.Conv2d(fdim, self.head_nfeat//reduce, 3,1,1), nn.ReLU()]) 58 | # channel multiplier gamma 59 | self.mult_g = nn.Parameter(torch.ones(1, self.head_nfeat, self.pool_size, self.pool_size)) 60 | self.conv_g = nn.Sequential(*[nn.Conv2d(self.head_nfeat//reduce, self.head_nfeat//reduce, 3,1,1), nn.ReLU(), 61 | nn.Conv2d(self.head_nfeat//reduce, self.head_nfeat, 1,1,0)]) 62 | # channel bias beta 63 | self.mult_b = nn.Parameter(torch.zeros(1, self.head_nfeat, self.pool_size, self.pool_size)) 64 | self.conv_b = nn.Sequential(*[nn.Conv2d(self.head_nfeat//reduce, self.head_nfeat//reduce, 3,1,1), nn.ReLU(), 65 | nn.Conv2d(self.head_nfeat//reduce, self.head_nfeat, 1,1,0)]) 66 | 67 | else: 68 | print 'unknown fusion scheme...' 69 | 70 | # init 71 | for m in self.modules(): 72 | if isinstance(m, nn.Conv2d): 73 | nn.init.normal_(m.weight, 0., 1e-3) 74 | nn.init.constant_(m.bias, 0.) 
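# Shape walk-through for the fusion in forward() below, using the provided configs
# (head_nfeat=256, roip_size=5, nft_param=(0.4, 6), head_ctxff=(True, 3)):
#   ff      = cat(context feature map, cls logits)   -> [B, 258, map_h, map_w]
#   cfa_all = roi_align over the 6 context boxes     -> [B, 6, 258, 5, 5]
#   cfa     = cat(max, mean over the box dimension)  -> [B, 516, 5, 5]  (= fdim for schemes 1-3)
# Scheme 3 (the default, FiLM-style) then modulates every pooled candidate feature:
#   yfa <- (1 + mult_g*conv_g(conv1(cfa))) * yfa + mult_b*conv_b(conv1(cfa))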
75 | 76 | 77 | 78 | def forward(self, cl,re, cf,yfa,ybb): 79 | # cl,re : for obtaining context boxes 80 | # cf : full context feature map to sample features from 81 | # yfa : input features to be embedded with context; [num_batch, num_boxes, num_ch, pool_sz, pool_sz] 82 | # ybb : bounding box coordinates for input yfa feats; len(list)=num_batch, ybb[i]=[num_boxes,4] 83 | num_batch = yfa.shape[0] 84 | num_boxes = yfa.shape[1] 85 | # obtain candidate context box coordinates and pool feats cfa_all=[num_batch, num_ctx, num_ch, pool_sz, pool_sz] 86 | pred_ctxbb, pred_ctxsc = self.boxes(cl,re, self.ctx_param) 87 | ff = torch.cat((cf,cl.permute(0,3,1,2)),dim=1) # concat feats and cls logits 88 | cfa_all = roi_align(ff, pred_ctxbb, (self.pool_size,self.pool_size), self.scale_f) 89 | cfa_all = cfa_all.view(num_batch, self.ctx_param[1], self.head_nfeat+2, self.pool_size, self.pool_size) 90 | # max/mean pooling along channel dimension 91 | cfa_max,_ = cfa_all.max(dim=1) 92 | cfa_avg = cfa_all.mean(dim=1) 93 | cfa = torch.cat((cfa_max,cfa_avg), dim=1) # [num_batch, num_ch*2, pool_sz, pool_sz] 94 | 95 | # embed context into input feat yfa 96 | if self.head_ctxff[1]==0: 97 | # === simple concat 98 | cfa = cfa.unsqueeze(1).repeat_interleave(num_boxes,dim=1)# [num_batch, num_boxes, num_ch*2, pool_sz, pool_sz] 99 | cfa = torch.cat((yfa,cfa), dim=2) # channel-wise concat # [num_batch, num_boxes, num_ch*3, pool_sz, pool_sz] 100 | cfa = cfa.flatten(0,1) # batch-nbox dim flatten 101 | yfa = self.simple(cfa) 102 | yfa = yfa.view(num_batch, num_boxes, self.head_nfeat, self.pool_size, self.pool_size) 103 | 104 | elif self.head_ctxff[1]==1: 105 | # === simple addition 106 | cfa = self.simple(cfa) # [num_batch, self.head_nfeat, self.pool_size, self.pool_size] 107 | cfa = cfa.unsqueeze(1).repeat_interleave(num_boxes,dim=1) 108 | yfa += cfa 109 | 110 | elif self.head_ctxff[1]==2: 111 | # === channel and spatial attention (cbam) based 112 | # channel attention 113 | avg_out = self.conv2( self.relu1( self.conv1( self.avg_pool(cfa) ) ) ) 114 | max_out = self.conv2( self.relu1( self.conv1( self.max_pool(cfa) ) ) ) 115 | ca_out = self.sigmoid( avg_out + max_out ) 116 | ca_out = ca_out.unsqueeze(1).repeat_interleave(num_boxes,dim=1) 117 | yfa *= ca_out 118 | # spatial attention 119 | avg_out = torch.mean(cfa, dim=1, keepdim=True) 120 | max_out,_ = torch.max(cfa, dim=1, keepdim=True) 121 | sp_out = torch.cat((avg_out,max_out), dim=1) 122 | sp_out = self.sigmoid( self.conv3(sp_out) ) 123 | sp_out = sp_out.unsqueeze(1).repeat_interleave(num_boxes,dim=1) 124 | yfa *= sp_out 125 | 126 | elif self.head_ctxff[1]==3: 127 | # === film based affine transform 128 | # common branch 129 | fconv = self.conv1(cfa) 130 | # get channel multipler (mult_g*conv_g) 131 | fm_out = self.mult_g*self.conv_g(fconv) 132 | fm_out = fm_out.unsqueeze(1).repeat_interleave(num_boxes,dim=1) 133 | # get channel bias (mult_b*conv_b) 134 | fb_out = self.mult_b*self.conv_b(fconv) 135 | fb_out = fb_out.unsqueeze(1).repeat_interleave(num_boxes,dim=1) 136 | # apply channel wise linear transform ( (1-gamma)*feat+beta ) 137 | yfa = (1+fm_out)*yfa + fb_out 138 | 139 | else: 140 | print 'unknown fusion scheme...' 
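# All fusion schemes preserve the input shape: yfa is returned as
# [num_batch, num_boxes, head_nfeat, pool_size, pool_size] (scheme 0 reshapes back to it
# after the concat + convs; schemes 1-3 keep it by repeating the context terms over the box dim).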
141 | 142 | return yfa 143 | 144 | 145 | -------------------------------------------------------------------------------- /model/fcos.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from non_local import NONLocalBlock2D 6 | 7 | 8 | # fcos detection head module 9 | class FCOSHead(nn.Module): 10 | def __init__(self, cfg): 11 | super(type(self), self).__init__() 12 | 13 | # define individual modules 14 | fdim = cfg.head_nfeat 15 | fmul = 3 if cfg.head_dconv else 1 16 | 17 | self.conv0_rdim = nn.Conv2d(fdim*fmul,fdim, 1,1,0) 18 | 19 | if cfg.head_nlocl: 20 | self.nl_feature = NONLocalBlock2D(in_channels=fdim) 21 | 22 | conv1_unit = [nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()] 23 | conv2_unit = [nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()] 24 | for i in range(cfg.head_nconv-1): 25 | conv1_unit.extend([nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()]) #nn.Conv2d(fdim,fdim, 3,1,1) 26 | conv2_unit.extend([nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()]) 27 | 28 | self.conv1 = nn.Sequential(*conv1_unit) 29 | self.conv2 = nn.Sequential(*conv2_unit) 30 | 31 | self.conv_cls = nn.Sequential(nn.Conv2d(fdim,2, 3,1,1)) 32 | self.conv_reg = nn.Sequential(nn.Conv2d(fdim,4, 3,1,1)) 33 | 34 | # define sequential modules 35 | self.cls = nn.Sequential(self.conv1, self.conv_cls) 36 | self.reg = nn.Sequential(self.conv2, self.conv_reg) 37 | self.mul = nn.Parameter(torch.rand(1)) 38 | 39 | # init 40 | head_module_list = nn.ModuleList([self.conv0_rdim, self.cls, self.reg]) 41 | for m in head_module_list.modules(): 42 | if isinstance(m, nn.Conv2d): 43 | nn.init.normal_(m.weight, 0., 1e-3) 44 | nn.init.constant_(m.bias, 0.) 
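# FCOSHead.forward (below): conv0_rdim projects the fdim*fmul-channel input back to fdim
# (fmul=3 when head_dconv, i.e. when dilated-conv branches are presumably concatenated
# upstream in the RPN module), an optional non-local block mixes in spatial context, and the
# two branches predict per-location 2-channel cls logits and 4-channel (l,t,r,b) distances;
# exp(mul * reg) with the learnable scalar `mul` keeps the regressed distances positive.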
45 | 46 | 47 | def forward(self, x): 48 | # reduce dim 49 | x = self.conv0_rdim(x) 50 | # nonlocal 51 | if hasattr(self, 'nl_feature'): 52 | x = self.nl_feature(x) 53 | # for all branches 54 | cl = self.cls(x) 55 | re = torch.exp(self.mul*self.reg(x)) 56 | 57 | return cl, re, x 58 | 59 | 60 | 61 | # standard detection head module (cls, olp, reg) 62 | class DETHead(nn.Module): 63 | def __init__(self, cfg): 64 | super(type(self), self).__init__() 65 | 66 | # define individual modules 67 | self.head_oproi = cfg.head_oproi 68 | fdim = cfg.head_nfeat 69 | conv1_unit = [nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()] 70 | conv2_unit = [nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()] 71 | for i in range(cfg.head_nconv-1): 72 | conv1_unit.extend([nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()]) #nn.Conv2d(fdim,fdim, 3,1,1) 73 | conv2_unit.extend([nn.Conv2d(fdim,fdim, 1,1,0), nn.GroupNorm(16,fdim), nn.ReLU()]) 74 | 75 | self.conv1 = nn.Sequential(*conv1_unit) 76 | self.conv2 = nn.Sequential(*conv2_unit) 77 | 78 | self.conv_cls = nn.Sequential(nn.Conv2d(fdim,2, cfg.roip_size,1,0)) 79 | self.conv_reg = nn.Sequential(nn.Conv2d(fdim,4, cfg.roip_size,1,0)) 80 | if self.head_oproi: 81 | self.conv_olp = nn.Sequential(nn.Conv2d(fdim,1, cfg.roip_size,1,0)) 82 | 83 | # define sequential modules 84 | self.cls = nn.Sequential(self.conv1, self.conv_cls) 85 | self.reg = nn.Sequential(self.conv2, self.conv_reg) 86 | if self.head_oproi: 87 | self.olp = nn.Sequential(self.conv2, self.conv_olp) 88 | 89 | # init 90 | head_module_list = nn.ModuleList([self.cls, self.olp, self.reg]) if self.head_oproi else nn.ModuleList([self.cls, self.reg]) 91 | for m in head_module_list.modules(): 92 | if isinstance(m, nn.Conv2d): 93 | nn.init.normal_(m.weight, 0., 1e-3) 94 | nn.init.constant_(m.bias, 0.) 
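# DETHead.forward (below): conv_cls / conv_olp / conv_reg use kernel_size = roip_size with
# no padding, so each roip_size x roip_size (5x5) RoI-pooled feature collapses to a single
# 1x1 prediction per box: 2 class logits, an optional overlap logit, and 4 box offsets.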
95 | 96 | 97 | def forward(self, x, out_re=True): 98 | # for all 3 branches 99 | cl = self.cls(x) 100 | op = self.olp(x) if (out_re and self.head_oproi) else None 101 | re = self.reg(x) if out_re else None 102 | 103 | return cl, op, re 104 | 105 | 106 | -------------------------------------------------------------------------------- /model/focal_loss.py: -------------------------------------------------------------------------------- 1 | # https://github.com/clcarwin/focal_loss_pytorch/blob/master/focalloss.py 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | 8 | class FocalLoss(nn.Module): 9 | def __init__(self, gamma=0, alpha=None, size_average=True): 10 | super(FocalLoss, self).__init__() 11 | self.gamma = gamma 12 | self.alpha = alpha 13 | if isinstance(alpha,(float,int,long)): self.alpha = torch.Tensor([alpha,1-alpha]) 14 | if isinstance(alpha,list): self.alpha = torch.Tensor(alpha) 15 | self.size_average = size_average 16 | 17 | def forward(self, input, target): 18 | if input.dim()>2: 19 | input = input.view(input.size(0),input.size(1),-1) # N,C,H,W => N,C,H*W 20 | input = input.transpose(1,2) # N,C,H*W => N,H*W,C 21 | input = input.contiguous().view(-1,input.size(2)) # N,H*W,C => N*H*W,C 22 | target = target.view(-1,1) 23 | 24 | logpt = F.log_softmax(input, dim=1) 25 | logpt = logpt.gather(1,target) 26 | logpt = logpt.view(-1) 27 | pt = Variable(logpt.data.exp()) 28 | 29 | if self.alpha is not None: 30 | if self.alpha.type()!=input.data.type(): 31 | self.alpha = self.alpha.type_as(input.data) 32 | at = self.alpha.gather(0,target.data.view(-1)) 33 | logpt = logpt * Variable(at) 34 | 35 | loss = -1 * (1-pt)**self.gamma * logpt 36 | if self.size_average: return loss.mean() 37 | else: return loss.sum() 38 | -------------------------------------------------------------------------------- /model/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from focal_loss import FocalLoss 5 | from utils import down2n 6 | from box_utils import jaccard 7 | 8 | 9 | class Track_Loss(nn.Module): 10 | def __init__(self, cfg): 11 | super(type(self), self).__init__() 12 | # params and flags 13 | self.loss_lambda = cfg.loss_lambda 14 | self.im_size = cfg.im_size 15 | self.map_size = (down2n(cfg.im_size[0],cfg.conv_npool[-1]),down2n(cfg.im_size[1],cfg.conv_npool[-1])) 16 | self.bbox_thres = cfg.bbox_thres 17 | self.head_oproi = cfg.head_oproi 18 | # loss objects 19 | self.cl_loss = FocalLoss(gamma=cfg.loss_gamma, alpha=cfg.loss_alpha, size_average=True) 20 | self.cf_loss = nn.CrossEntropyLoss() 21 | self.op_loss = nn.BCEWithLogitsLoss() 22 | 23 | 24 | def get_cl_loss(self, cl, gt): 25 | # return classification loss for rpn module, use focal loss 26 | # get positive instance indices from gt [bnum, map_h, map_w, 2] 27 | pos_idxs = gt.flatten().nonzero()[:,0] 28 | pos_nums = pos_idxs.shape[0] 29 | cl_f = cl.reshape(-1,2) 30 | gt_f = (1-gt).flatten().long() 31 | loss = self.cl_loss(cl_f, gt_f) 32 | # averaging 33 | #loss /= float(pos_nums) if pos_nums>0 else gt.numel() 34 | return loss 35 | 36 | 37 | def get_re_loss(self, re, gr, gt, eps=1e-7): 38 | # return box regression loss for positive instances 39 | # get positive instance indices from gt [bnum, map_h, map_w, 2] 40 | pos_idxs = gt.flatten().nonzero()[:,0] 41 | pos_nums = pos_idxs.shape[0] 42 | if pos_nums < 1: 43 | return 0. 
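# Linear IoU loss for the RPN regression branch: predictions and targets are both (l,t,r,b)
# distances from the same anchor point, so the intersection is (min(l)+min(r)) * (min(t)+min(b)),
# the union is area_gt + area_pred - intersection, and the loss is mean(1 - IoU) over the
# positive locations (the +1. in numerator/denominator smooths the ratio for tiny boxes).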
44 | # select corresponding instances in regularization results 45 | gr_sel = gr.reshape(-1,4)[pos_idxs] # [pos_idxs, ltrb] 46 | re_sel = re.reshape(-1,4)[pos_idxs] 47 | 48 | # iou calculation - intersection 49 | iou_inter = torch.min(re_sel, gr_sel) 50 | iou_inter = (iou_inter[:,0]+iou_inter[:,2])*(iou_inter[:,1]+iou_inter[:,3]) # area = (l+r)*(t+b) 51 | # iou calculation - union 52 | gr_area = (gr_sel[:,0]+gr_sel[:,2])*(gr_sel[:,1]+gr_sel[:,3]) # area = (l+r)*(t+b) 53 | re_area = (re_sel[:,0]+re_sel[:,2])*(re_sel[:,1]+re_sel[:,3]) # area = (l+r)*(t+b) 54 | iou_union = gr_area + re_area - iou_inter + eps 55 | # iou calculation - inter / union 56 | iou_sel = (iou_inter+1.) / (iou_union+1.) 57 | # total iou loss 58 | loss = torch.mean(1.-iou_sel) 59 | return loss 60 | 61 | 62 | def get_rcnn_loss(self, cf, op, bb, br, gb): 63 | # cf = [numb, numbb, 1+nnum, 2(pn)] (output binary class) 64 | # op = [numb, numbb, 1] (output iou overlap score) 65 | # bb = [numb, numbb, 4(xyxy)] (output refined bbox) 66 | # br = [numb, numbb, 4(xyxy)] (output unrefined bbox) 67 | # gb = [numb, 4] (ground truth bbox) 68 | # sizes 69 | num_batch = cf.shape[0] 70 | num_boxes = cf.shape[1] 71 | num_negbb = cf.shape[2]-1 72 | # per batch iteration 73 | loss, total_pos = 0,0 74 | for i in range(num_batch): 75 | # find positive instances in a batch (bb overlap > threshold) 76 | cf_i = cf[i] # [numbox, 1+nnum, 2] 77 | op_i = op[i] if self.head_oproi else None # [numbox, 1] 78 | bb_i = bb[i] # [numbox, 4] = [numbox, x0y0x1y1] 79 | br_i = br[i] # [numbox, 4] = [numbox, x0y0x1y1] 80 | gb_i = gb[i].unsqueeze(0) # [1,4] = [1, x0y0x1y1] 81 | # iou for rois 82 | iou_br = jaccard(gb_i, br_i)[0] 83 | pos_idxs = (iou_br >=self.bbox_thres[0]).nonzero()[:,0] 84 | neg_idxs = (iou_br < self.bbox_thres[1]).nonzero()[:,0] 85 | pos_nums, neg_nums = pos_idxs.shape[0], neg_idxs.shape[0] 86 | total_pos += pos_nums 87 | 88 | # enforce iou overlap regression loss 89 | loss_op_i = self.op_loss(op_i[...,0][pos_idxs], iou_br[pos_idxs]) if (pos_nums>0) and (self.head_oproi) else 0. 90 | 91 | # enforce labels, binary cross entropy loss 92 | # pos input sample ~ pos/neg boxes 93 | cf_lbl_pos = torch.zeros(pos_nums, device=cf_i.device).long() 94 | cf_lbl_neg = torch.ones(neg_nums, device=cf_i.device).long() 95 | loss_cf_i_pos_pos = self.cf_loss(cf_i[pos_idxs,0,:], cf_lbl_pos) if pos_nums>0 else 0. 96 | loss_cf_i_pos_neg = self.cf_loss(cf_i[neg_idxs,0,:], cf_lbl_neg) if neg_nums>0 else 0. 97 | loss_cf_i_pos = loss_cf_i_pos_pos + loss_cf_i_pos_neg 98 | 99 | # neg input sample ~ pos boxes 100 | if (num_negbb>0) and (pos_nums>0): 101 | cf_i_neg = cf_i[pos_idxs,1:,:].flatten(0,1) # [pos_nums*num_negbb, 2] 102 | cf_lbl_neg = torch.ones(pos_nums*num_negbb, device=cf_i.device).long() 103 | loss_cf_i_neg = self.cf_loss(cf_i_neg, cf_lbl_neg) 104 | else: 105 | loss_cf_i_neg = 0. 106 | loss_cf_i = loss_cf_i_pos + loss_cf_i_neg 107 | 108 | # iou for refined bb 109 | iou_bb = jaccard(gb_i, bb_i, eps=1.0)[0] 110 | # enforce box regression (only for positive instances), linear iou loss 111 | loss_bb_i = torch.mean(1. - iou_bb[pos_idxs]) if pos_nums>0 else 0 112 | # loss for single batch, add to total loss 113 | if pos_nums==0: 114 | loss_i = 0. 
115 | else: 116 | loss_i = loss_cf_i + loss_bb_i + loss_op_i 117 | loss += loss_i 118 | 119 | # divide loss by batch size 120 | loss /= num_batch 121 | return loss, total_pos 122 | 123 | 124 | def forward(self, outs, gts, add_rcnn_loss=True): 125 | # parse network outputs 126 | out_rpn, out_rcnn = outs 127 | cl, re = out_rpn[0], out_rpn[1] 128 | cf, op, bb, br = out_rcnn[0], out_rcnn[1], out_rcnn[2], out_rcnn[3] 129 | # parse gts (gt_box, gt_cl, gt_re) 130 | gb, gt, gr = gts 131 | 132 | # loss for rpn outputs 133 | rpn_loss0 = self.get_cl_loss(cl, gt) 134 | rpn_loss1 = self.get_re_loss(re, gr, gt) 135 | rpn_loss = rpn_loss0 + rpn_loss1 136 | 137 | # loss for rcnn outputs 138 | rcnn_loss, total_pos = self.get_rcnn_loss(cf, op, bb, br, gb) 139 | 140 | # total loss 141 | if add_rcnn_loss: 142 | total_loss = rpn_loss + self.loss_lambda*rcnn_loss 143 | else: 144 | total_loss = rpn_loss 145 | 146 | return total_loss, [rpn_loss0, rpn_loss1, rcnn_loss, int(total_pos)] 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /model/non_local.py: -------------------------------------------------------------------------------- 1 | # code from : https://github.com/AlexHex7/Non-local_pytorch 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | 7 | class _NonLocalBlockND(nn.Module): 8 | def __init__(self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True): 9 | """ 10 | :param in_channels: 11 | :param inter_channels: 12 | :param dimension: 13 | :param sub_sample: 14 | :param bn_layer: 15 | """ 16 | 17 | super(_NonLocalBlockND, self).__init__() 18 | 19 | assert dimension in [1, 2, 3] 20 | 21 | self.dimension = dimension 22 | self.sub_sample = sub_sample 23 | 24 | self.in_channels = in_channels 25 | self.inter_channels = inter_channels 26 | 27 | if self.inter_channels is None: 28 | self.inter_channels = in_channels // 2 29 | if self.inter_channels == 0: 30 | self.inter_channels = 1 31 | 32 | if dimension == 3: 33 | conv_nd = nn.Conv3d 34 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) 35 | bn = nn.GroupNorm #nn.BatchNorm3d 36 | elif dimension == 2: 37 | conv_nd = nn.Conv2d 38 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) 39 | bn = nn.GroupNorm #nn.BatchNorm2d 40 | else: 41 | conv_nd = nn.Conv1d 42 | max_pool_layer = nn.MaxPool1d(kernel_size=(2)) 43 | bn = nn.GroupNorm #nn.BatchNorm1d 44 | 45 | self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, 46 | kernel_size=1, stride=1, padding=0) 47 | 48 | if bn_layer: 49 | self.W = nn.Sequential( 50 | conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, 51 | kernel_size=1, stride=1, padding=0), 52 | bn(16, self.in_channels) #bn(self.in_channels) 53 | ) 54 | nn.init.constant_(self.W[1].weight, 0) 55 | nn.init.constant_(self.W[1].bias, 0) 56 | else: 57 | self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, 58 | kernel_size=1, stride=1, padding=0) 59 | nn.init.constant_(self.W.weight, 0) 60 | nn.init.constant_(self.W.bias, 0) 61 | 62 | self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, 63 | kernel_size=1, stride=1, padding=0) 64 | self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, 65 | kernel_size=1, stride=1, padding=0) 66 | 67 | if sub_sample: 68 | self.g = nn.Sequential(self.g, max_pool_layer) 69 | self.phi = nn.Sequential(self.phi, max_pool_layer) 70 | 71 | def forward(self, x, 
return_nl_map=False): 72 | """ 73 | :param x: (b, c, t, h, w) 74 | :param return_nl_map: if True return z, nl_map, else only return z. 75 | :return: 76 | """ 77 | 78 | batch_size = x.size(0) 79 | 80 | g_x = self.g(x).view(batch_size, self.inter_channels, -1) 81 | g_x = g_x.permute(0, 2, 1) 82 | 83 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1) 84 | theta_x = theta_x.permute(0, 2, 1) 85 | phi_x = self.phi(x).view(batch_size, self.inter_channels, -1) 86 | f = torch.matmul(theta_x, phi_x) 87 | f_div_C = F.softmax(f, dim=-1) 88 | 89 | y = torch.matmul(f_div_C, g_x) 90 | y = y.permute(0, 2, 1).contiguous() 91 | y = y.view(batch_size, self.inter_channels, *x.size()[2:]) 92 | W_y = self.W(y) 93 | z = W_y + x 94 | 95 | if return_nl_map: 96 | return z, f_div_C 97 | return z 98 | 99 | 100 | class NONLocalBlock1D(_NonLocalBlockND): 101 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): 102 | super(NONLocalBlock1D, self).__init__(in_channels, 103 | inter_channels=inter_channels, 104 | dimension=1, sub_sample=sub_sample, 105 | bn_layer=bn_layer) 106 | 107 | 108 | class NONLocalBlock2D(_NonLocalBlockND): 109 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): 110 | super(NONLocalBlock2D, self).__init__(in_channels, 111 | inter_channels=inter_channels, 112 | dimension=2, sub_sample=sub_sample, 113 | bn_layer=bn_layer,) 114 | 115 | 116 | class NONLocalBlock3D(_NonLocalBlockND): 117 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): 118 | super(NONLocalBlock3D, self).__init__(in_channels, 119 | inter_channels=inter_channels, 120 | dimension=3, sub_sample=sub_sample, 121 | bn_layer=bn_layer,) 122 | 123 | 124 | if __name__ == '__main__': 125 | import torch 126 | 127 | for (sub_sample_, bn_layer_) in [(True, True), (False, False), (True, False), (False, True)]: 128 | img = torch.zeros(2, 3, 20) 129 | net = NONLocalBlock1D(3, sub_sample=sub_sample_, bn_layer=bn_layer_) 130 | out = net(img) 131 | print(out.size()) 132 | 133 | img = torch.zeros(2, 3, 20, 20) 134 | net = NONLocalBlock2D(3, sub_sample=sub_sample_, bn_layer=bn_layer_, store_last_batch_nl_map=True) 135 | out = net(img) 136 | print(out.size()) 137 | 138 | img = torch.randn(2, 3, 8, 20, 20) 139 | net = NONLocalBlock3D(3, sub_sample=sub_sample_, bn_layer=bn_layer_, store_last_batch_nl_map=True) 140 | out = net(img) 141 | print(out.size()) 142 | 143 | -------------------------------------------------------------------------------- /model/rcnn_module.py: -------------------------------------------------------------------------------- 1 | import torch, time 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torchvision.ops import roi_align 5 | 6 | from utils import down2n 7 | from fcos import DETHead 8 | 9 | # receives pooled features and predicts classes + refined boxes 10 | class RCNN_Module(nn.Module): 11 | def __init__(self,cfg): 12 | super(type(self), self).__init__() 13 | # params 14 | self.head_oproi = cfg.head_oproi 15 | self.im_size = cfg.im_size 16 | self.map_size = (down2n(cfg.im_size[0],cfg.conv_npool[-1]),down2n(cfg.im_size[1],cfg.conv_npool[-1])) 17 | self.scale_f = float(self.map_size[0]) / float(self.im_size[0]) 18 | self.pool_size = cfg.roip_size 19 | # feat modulation layer 20 | self.conv_x = nn.Conv2d(cfg.head_nfeat, cfg.head_nfeat, 1) 21 | self.conv_y = nn.Conv2d(cfg.head_nfeat, cfg.head_nfeat, 1) 22 | # detection head 23 | self.rcnn_head = DETHead(cfg) 24 | 25 | # init 26 | rcnn_convs = 
nn.ModuleList([self.conv_x, self.conv_y]) 27 | for m in rcnn_convs.modules(): 28 | if isinstance(m, nn.Conv2d): 29 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 30 | 31 | 32 | def forward(self, feats, boxes): 33 | # given query feature, candidate features, candidate boxes 34 | # return classification results and bbox refinements 35 | 36 | # feats = (xfa, zfa) 37 | # xfa = list of len=2 (pos/neg) where xfa[0]= Tensor[bnum, ndim, pool_size,pool_size] 38 | # zfa = list of len=batch_size where zfa[i]=[nms_num_bb, ndim, pool_size, pool_size] 39 | # boxes = list of len=batch_size where boxes[i]=[num_num_bb, 4] 40 | 41 | # pos_feats [bnum, cnum, pool_sz, pool_sz] 42 | # neg_feats [bnum, nnum, cnum, pool_sz, pool_sz] 43 | xfa_p, xfa_n, yfa = feats 44 | pf = xfa_p # use spatially pooled feats 45 | nf = xfa_n 46 | # candidate feats [bnum, bbnum, cnum, psz, psz] 47 | cf = yfa 48 | # store shapes 49 | batch_size = cf.shape[0] 50 | bbnum_size = cf.shape[1] 51 | nfeat_size = cf.shape[2] 52 | negff_size = nf.shape[1] if nf is not None else 0 53 | 54 | # feature modulation 55 | pf = self.conv_x(pf) 56 | nf = self.conv_x(nf.flatten(0,1)).view(batch_size, negff_size, nfeat_size, self.pool_size, self.pool_size) if nf is not None else None 57 | cf = self.conv_y(cf.flatten(0,1)).view(batch_size, bbnum_size, nfeat_size, self.pool_size, self.pool_size) 58 | 59 | # == for positive feats 60 | # repeat pf feats 61 | pf_r = pf.unsqueeze(1).repeat_interleave(bbnum_size, dim=1) # [bnum, bbnum, cnum, psz, psz] 62 | # multiply between feats (correlation) or concat channel dim 63 | cc = pf_r * cf #torch.cat((pf_r, cf), dim=2)# 64 | # detection head 65 | cl_p, op, re = self.rcnn_head(cc.flatten(0,1)) 66 | cl_p = cl_p.view(batch_size, bbnum_size, 1, 2) 67 | op = op.view(batch_size, bbnum_size, 1) if self.head_oproi else None 68 | re = re.view(batch_size, bbnum_size, 4) 69 | #re = torch.zeros_like(re) 70 | 71 | # == for negative feats 72 | if nf is not None: 73 | nf_r = nf.unsqueeze(1).repeat_interleave(bbnum_size, dim=1) # [bnum, bbnum, nnum, cnum, psz, psz] 74 | cf_r = cf.unsqueeze(2).repeat_interleave(negff_size, dim=2) # [bnum, bbnum, nnum, cnum, psz, psz] 75 | cn = nf_r * cf_r # correlation 76 | # cn = torch.cat((nf_r,cf_r), dim=3) # concatenation 77 | # detection head 78 | cl_n, _, _ = self.rcnn_head(cn.flatten(0,2), out_re=False) 79 | cl_n = cl_n.view(batch_size, bbnum_size, negff_size, 2) 80 | 81 | # integrated classification scores [bnum, bbnum, 1+nnum, 2] 82 | cl = torch.cat((cl_p, cl_n), dim=2) if nf is not None else cl_p 83 | 84 | # == modify input boxes accto re output 85 | # boxes = [bnum, bbnum, x0y0x1y1] 86 | boxes = torch.stack(boxes) 87 | #bb = boxes + re 88 | # change to [bnum, bbnum, x_cen/y_cen/width/height] 89 | boxes_w = boxes[...,2] - boxes[...,0] 90 | boxes_h = boxes[...,3] - boxes[...,1] 91 | boxes_xc = boxes[...,0] + boxes_w*0.5 92 | boxes_yc = boxes[...,1] + boxes_h*0.5 93 | # modify accto regression outputs 94 | boxes_xc_m = boxes_xc + boxes_w * re[...,0] 95 | boxes_yc_m = boxes_yc + boxes_h * re[...,1] 96 | boxes_w_m = boxes_w * torch.exp(re[...,2]) 97 | boxes_h_m = boxes_h * torch.exp(re[...,3]) 98 | # revert cooridates 99 | boxes_x0 = (boxes_xc_m - boxes_w_m*0.5).unsqueeze(-1) 100 | boxes_x1 = (boxes_xc_m + boxes_w_m*0.5).unsqueeze(-1) 101 | boxes_y0 = (boxes_yc_m - boxes_h_m*0.5).unsqueeze(-1) 102 | boxes_y1 = (boxes_yc_m + boxes_h_m*0.5).unsqueeze(-1) 103 | # concat 104 | bb = torch.cat([boxes_x0, boxes_y0, boxes_x1, boxes_y1], dim=-1) 105 | 106 | return cl, op, 
bb, boxes 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /model/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | # from .utils import load_state_dict_from_url 4 | 5 | 6 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 7 | 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 8 | 'wide_resnet50_2', 'wide_resnet101_2'] 9 | 10 | 11 | model_urls = { 12 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 13 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 14 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 15 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 16 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 17 | 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', 18 | 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', 19 | 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth', 20 | 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth', 21 | } 22 | 23 | 24 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 25 | """3x3 convolution with padding""" 26 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 27 | padding=dilation, groups=groups, bias=False, dilation=dilation) 28 | 29 | 30 | def conv1x1(in_planes, out_planes, stride=1): 31 | """1x1 convolution""" 32 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 33 | 34 | 35 | class BasicBlock(nn.Module): 36 | expansion = 1 37 | 38 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 39 | base_width=64, dilation=1, norm_layer=None): 40 | super(BasicBlock, self).__init__() 41 | if norm_layer is None: 42 | norm_layer = nn.BatchNorm2d 43 | if groups != 1 or base_width != 64: 44 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 45 | if dilation > 1: 46 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 47 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1 48 | self.conv1 = conv3x3(inplanes, planes, stride) 49 | self.bn1 = norm_layer(planes) 50 | self.relu = nn.ReLU(inplace=True) 51 | self.conv2 = conv3x3(planes, planes) 52 | self.bn2 = norm_layer(planes) 53 | self.downsample = downsample 54 | self.stride = stride 55 | 56 | def forward(self, x): 57 | identity = x 58 | 59 | out = self.conv1(x) 60 | out = self.bn1(out) 61 | out = self.relu(out) 62 | 63 | out = self.conv2(out) 64 | out = self.bn2(out) 65 | 66 | if self.downsample is not None: 67 | identity = self.downsample(x) 68 | 69 | out += identity 70 | out = self.relu(out) 71 | 72 | return out 73 | 74 | 75 | class Bottleneck(nn.Module): 76 | # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) 77 | # while original implementation places the stride at the first 1x1 convolution(self.conv1) 78 | # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. 79 | # This variant is also known as ResNet V1.5 and improves accuracy according to 80 | # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. 
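# Note on this resnet.py copy: it follows torchvision's implementation, but layer4 is built
# with stride=1 (so the backbone output stride is 16, matching conv_npool=(2,3,4,4) in the
# cfgs) and the avgpool/fc head is commented out, so forward() returns the layer4 feature
# map whose channel count is probed at runtime in build_model.py.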
81 | 82 | expansion = 4 83 | 84 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 85 | base_width=64, dilation=1, norm_layer=None): 86 | super(Bottleneck, self).__init__() 87 | if norm_layer is None: 88 | norm_layer = nn.BatchNorm2d 89 | width = int(planes * (base_width / 64.)) * groups 90 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1 91 | self.conv1 = conv1x1(inplanes, width) 92 | self.bn1 = norm_layer(width) 93 | self.conv2 = conv3x3(width, width, stride, groups, dilation) 94 | self.bn2 = norm_layer(width) 95 | self.conv3 = conv1x1(width, planes * self.expansion) 96 | self.bn3 = norm_layer(planes * self.expansion) 97 | self.relu = nn.ReLU(inplace=True) 98 | self.downsample = downsample 99 | self.stride = stride 100 | 101 | def forward(self, x): 102 | identity = x 103 | 104 | out = self.conv1(x) 105 | out = self.bn1(out) 106 | out = self.relu(out) 107 | 108 | out = self.conv2(out) 109 | out = self.bn2(out) 110 | out = self.relu(out) 111 | 112 | out = self.conv3(out) 113 | out = self.bn3(out) 114 | 115 | if self.downsample is not None: 116 | identity = self.downsample(x) 117 | 118 | out += identity 119 | out = self.relu(out) 120 | 121 | return out 122 | 123 | 124 | class ResNet(nn.Module): 125 | 126 | def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, 127 | groups=1, width_per_group=64, replace_stride_with_dilation=None, 128 | norm_layer=None, cfg=None): 129 | super(ResNet, self).__init__() 130 | if norm_layer is None: 131 | norm_layer = nn.BatchNorm2d 132 | self._norm_layer = norm_layer 133 | 134 | self.inplanes = 64 135 | self.dilation = 1 136 | if replace_stride_with_dilation is None: 137 | # each element in the tuple indicates if we should replace 138 | # the 2x2 stride with a dilated convolution instead 139 | replace_stride_with_dilation = [False, False, False] 140 | if len(replace_stride_with_dilation) != 3: 141 | raise ValueError("replace_stride_with_dilation should be None " 142 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) 143 | self.groups = groups 144 | self.base_width = width_per_group 145 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, 146 | bias=False) 147 | self.bn1 = norm_layer(self.inplanes) 148 | self.relu = nn.ReLU(inplace=True) 149 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 150 | self.layer1 = self._make_layer(block, 64, layers[0]) 151 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, 152 | dilate=replace_stride_with_dilation[0]) 153 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, 154 | dilate=replace_stride_with_dilation[1]) 155 | self.layer4 = self._make_layer(block, 512, layers[3], stride=1, #2 156 | dilate=replace_stride_with_dilation[2]) 157 | # self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 158 | # self.fc = nn.Linear(512 * block.expansion, num_classes) 159 | 160 | for m in self.modules(): 161 | if isinstance(m, nn.Conv2d): 162 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 163 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 164 | nn.init.constant_(m.weight, 1) 165 | nn.init.constant_(m.bias, 0) 166 | 167 | # Zero-initialize the last BN in each residual branch, 168 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
169 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 170 | if zero_init_residual: 171 | for m in self.modules(): 172 | if isinstance(m, Bottleneck): 173 | nn.init.constant_(m.bn3.weight, 0) 174 | elif isinstance(m, BasicBlock): 175 | nn.init.constant_(m.bn2.weight, 0) 176 | 177 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False): 178 | norm_layer = self._norm_layer 179 | downsample = None 180 | previous_dilation = self.dilation 181 | if dilate: 182 | self.dilation *= stride 183 | stride = 1 184 | if stride != 1 or self.inplanes != planes * block.expansion: 185 | downsample = nn.Sequential( 186 | conv1x1(self.inplanes, planes * block.expansion, stride), 187 | norm_layer(planes * block.expansion), 188 | ) 189 | 190 | layers = [] 191 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups, 192 | self.base_width, previous_dilation, norm_layer)) 193 | self.inplanes = planes * block.expansion 194 | for _ in range(1, blocks): 195 | layers.append(block(self.inplanes, planes, groups=self.groups, 196 | base_width=self.base_width, dilation=self.dilation, 197 | norm_layer=norm_layer)) 198 | 199 | return nn.Sequential(*layers) 200 | 201 | def _forward_impl(self, x): 202 | # See note [TorchScript super()] 203 | x = self.conv1(x) 204 | x = self.bn1(x) 205 | x = self.relu(x) 206 | x = self.maxpool(x) 207 | 208 | x = self.layer1(x) 209 | x = self.layer2(x) 210 | x = self.layer3(x) 211 | x = self.layer4(x) 212 | 213 | # x = self.avgpool(x) 214 | # x = torch.flatten(x, 1) 215 | # x = self.fc(x) 216 | 217 | return x 218 | 219 | def forward(self, x): 220 | return self._forward_impl(x) 221 | 222 | 223 | def _resnet(arch, block, layers, pretrained, progress, **kwargs): 224 | model = ResNet(block, layers, **kwargs) 225 | if pretrained: 226 | state_dict = load_state_dict_from_url(model_urls[arch], 227 | progress=progress) 228 | model.load_state_dict(state_dict) 229 | return model 230 | 231 | 232 | def resnet18(pretrained=False, progress=True, **kwargs): 233 | r"""ResNet-18 model from 234 | `"Deep Residual Learning for Image Recognition" `_ 235 | 236 | Args: 237 | pretrained (bool): If True, returns a model pre-trained on ImageNet 238 | progress (bool): If True, displays a progress bar of the download to stderr 239 | """ 240 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, 241 | **kwargs) 242 | 243 | 244 | def resnet34(pretrained=False, progress=True, **kwargs): 245 | r"""ResNet-34 model from 246 | `"Deep Residual Learning for Image Recognition" `_ 247 | 248 | Args: 249 | pretrained (bool): If True, returns a model pre-trained on ImageNet 250 | progress (bool): If True, displays a progress bar of the download to stderr 251 | """ 252 | return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, 253 | **kwargs) 254 | 255 | 256 | def resnet50(pretrained=False, progress=True, **kwargs): 257 | r"""ResNet-50 model from 258 | `"Deep Residual Learning for Image Recognition" `_ 259 | 260 | Args: 261 | pretrained (bool): If True, returns a model pre-trained on ImageNet 262 | progress (bool): If True, displays a progress bar of the download to stderr 263 | """ 264 | return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, 265 | **kwargs) 266 | 267 | 268 | def resnet101(pretrained=False, progress=True, **kwargs): 269 | r"""ResNet-101 model from 270 | `"Deep Residual Learning for Image Recognition" `_ 271 | 272 | Args: 273 | pretrained (bool): If True, returns a model pre-trained on ImageNet 
274 | progress (bool): If True, displays a progress bar of the download to stderr 275 | """ 276 | return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, 277 | **kwargs) 278 | 279 | 280 | def resnet152(pretrained=False, progress=True, **kwargs): 281 | r"""ResNet-152 model from 282 | `"Deep Residual Learning for Image Recognition" `_ 283 | 284 | Args: 285 | pretrained (bool): If True, returns a model pre-trained on ImageNet 286 | progress (bool): If True, displays a progress bar of the download to stderr 287 | """ 288 | return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, 289 | **kwargs) 290 | 291 | 292 | def resnext50_32x4d(pretrained=False, progress=True, **kwargs): 293 | r"""ResNeXt-50 32x4d model from 294 | `"Aggregated Residual Transformation for Deep Neural Networks" `_ 295 | 296 | Args: 297 | pretrained (bool): If True, returns a model pre-trained on ImageNet 298 | progress (bool): If True, displays a progress bar of the download to stderr 299 | """ 300 | kwargs['groups'] = 32 301 | kwargs['width_per_group'] = 4 302 | return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], 303 | pretrained, progress, **kwargs) 304 | 305 | 306 | def resnext101_32x8d(pretrained=False, progress=True, **kwargs): 307 | r"""ResNeXt-101 32x8d model from 308 | `"Aggregated Residual Transformation for Deep Neural Networks" `_ 309 | 310 | Args: 311 | pretrained (bool): If True, returns a model pre-trained on ImageNet 312 | progress (bool): If True, displays a progress bar of the download to stderr 313 | """ 314 | kwargs['groups'] = 32 315 | kwargs['width_per_group'] = 8 316 | return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], 317 | pretrained, progress, **kwargs) 318 | 319 | 320 | def wide_resnet50_2(pretrained=False, progress=True, **kwargs): 321 | r"""Wide ResNet-50-2 model from 322 | `"Wide Residual Networks" `_ 323 | 324 | The model is the same as ResNet except for the bottleneck number of channels 325 | which is twice larger in every block. The number of channels in outer 1x1 326 | convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 327 | channels, and in Wide ResNet-50-2 has 2048-1024-2048. 328 | 329 | Args: 330 | pretrained (bool): If True, returns a model pre-trained on ImageNet 331 | progress (bool): If True, displays a progress bar of the download to stderr 332 | """ 333 | kwargs['width_per_group'] = 64 * 2 334 | return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], 335 | pretrained, progress, **kwargs) 336 | 337 | 338 | def wide_resnet101_2(pretrained=False, progress=True, **kwargs): 339 | r"""Wide ResNet-101-2 model from 340 | `"Wide Residual Networks" `_ 341 | 342 | The model is the same as ResNet except for the bottleneck number of channels 343 | which is twice larger in every block. The number of channels in outer 1x1 344 | convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 345 | channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
346 | 347 | Args: 348 | pretrained (bool): If True, returns a model pre-trained on ImageNet 349 | progress (bool): If True, displays a progress bar of the download to stderr 350 | """ 351 | kwargs['width_per_group'] = 64 * 2 352 | return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], 353 | pretrained, progress, **kwargs) 354 | -------------------------------------------------------------------------------- /model/rpn_module.py: -------------------------------------------------------------------------------- 1 | import torch, time 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torchvision.ops import roi_align 5 | 6 | from utils import down2n 7 | from fcos import FCOSHead 8 | from boxes import BoxModule 9 | from cbam import CBAModule 10 | from context import ContextModule 11 | 12 | # receives pairwise images as input, returns roi bounding box proposals and its pooled features 13 | class RPN_Module(nn.Module): 14 | def __init__(self,cfg,bb_ch): 15 | super(type(self), self).__init__() 16 | # params 17 | self.im_size = cfg.im_size 18 | self.map_size = (down2n(cfg.im_size[0],cfg.conv_npool[-1]),down2n(cfg.im_size[1],cfg.conv_npool[-1])) 19 | self.scale_f = float(self.map_size[0]) / float(self.im_size[0]) 20 | self.pool_size = cfg.roip_size 21 | self.head_dconv = cfg.head_dconv 22 | self.head_ctxff = cfg.head_ctxff 23 | # numof channels for backbone output, refined output 24 | self.bb_ch = bb_ch 25 | self.head_nfeat = cfg.head_nfeat 26 | # attetntion module and channel conversion for backbone outputs 27 | fmul = 3 if cfg.head_dconv else 1 28 | self.cbamod = CBAModule(self.head_nfeat*fmul) 29 | self.conv_x = nn.Conv2d(self.bb_ch, cfg.head_nfeat, 1) 30 | self.conv_y = nn.Conv2d(self.bb_ch, cfg.head_nfeat, 1) 31 | # detection head 32 | self.roi_head = FCOSHead(cfg) 33 | # nms box predictions 34 | self.boxes = BoxModule(cfg) 35 | # context module 36 | self.context_x = ContextModule(cfg) if self.head_ctxff[0] else None 37 | self.context_y = ContextModule(cfg) if self.head_ctxff[0] else None 38 | 39 | # init 40 | rpn_convs = nn.ModuleList([self.conv_x, self.conv_y]) 41 | for m in rpn_convs.modules(): 42 | if isinstance(m, nn.Conv2d): 43 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 44 | 45 | 46 | def fmap_corr(self,xf,yf,pad=True,dilate=1): 47 | # get shapes 48 | xf_s = xf.shape # [bn, ch, xh, xw] (larger -> as image) [1, bn*ch, xh, xw] 49 | yf_s = yf.shape # [bn, ch, yh, yw] (smaller -> as filter) [bn*cn, 1, yh, yw] 50 | bn = xf_s[0]; cn = xf_s[1]; 51 | # reshape dims 52 | xf_r = xf.view(1, bn*cn, xf_s[2], xf_s[3]) # [1, bn*cn, xh, xw] 53 | yf_r = yf.reshape(1, bn*cn, yf_s[2], yf_s[3]).transpose(0,1) # [bn*cn, 1, yh, yw] view 54 | # group conv [1, bn*cn, zh, zw] -> [bn, cn, zh, zw] 55 | if pad: pnum = (yf_s[-1] + (dilate-1)*(yf_s[-1]-1))//2 56 | else: pnum = 0 57 | of = F.conv2d(input=xf_r, weight=yf_r, groups=bn*cn, bias=None, padding=pnum, dilation=dilate) 58 | of = of.view(bn, cn, of.shape[2], of.shape[3]) 59 | return of 60 | 61 | 62 | def dconv_fmap_corr(self, yf, xf): 63 | if self.head_dconv: 64 | zf = [] 65 | for i in range(len(xf)): 66 | zf.append(self.fmap_corr(yf, xf[i])) 67 | zf = torch.cat(zf, dim=1) 68 | else: 69 | zf = self.fmap_corr(yf, xf[1], pad=False) 70 | return zf 71 | 72 | 73 | def corr_head(self, xfa, yf): 74 | # cross corr for xfa 75 | zf = self.dconv_fmap_corr(yf,xfa) 76 | # attention module 77 | zf,at = self.cbamod(zf) 78 | # detection head 79 | cl,re,zf = self.roi_head(zf) 80 | # permute dims to [bnum, map_h, map_w, 
pred], where pred_cls=[neg/pos], pred_re=[ltrb distances] 81 | cl = cl.permute(0,2,3,1) 82 | re = re.permute(0,2,3,1) 83 | return zf,cl,re,at 84 | 85 | 86 | def pool_feat(self, xf, xb_p): 87 | # xb: list of boxes wrt each batch : list, where len(list)=bnum, list[i] = Tensor[N,4] 88 | # feats -> change channel nums [bnum, ndim, pool_sz, pool_sz] 89 | 90 | # original roi align 91 | xfa = [roi_align(xf, xb_p, (self.pool_size,self.pool_size), self.scale_f)] 92 | # additional feats 93 | if self.head_dconv: 94 | # d2 95 | psz = self.pool_size*2 -1 96 | xfa.append(roi_align(xf, xb_p, (psz,psz), self.scale_f)) 97 | # p2 98 | psz = self.pool_size//2 99 | psz += 1 if psz%2==0 else 0 100 | xfa.append(roi_align(xf, xb_p, (psz,psz), self.scale_f)) 101 | else: 102 | xfa.append(roi_align(xf, xb_p, (1,1), self.scale_f)) 103 | 104 | return xfa 105 | 106 | 107 | def forward(self,xf_in,yf_in, xb, xfa_in=None, add_box=None, pool_xf=False): 108 | # xf,yf : Tensor, [bnum, ndim, map_size_h, map_size_w] 109 | # xb : list, [list_posbb] where len(list_xxxbb)=bnum, list_xxxbb[i] = Tensor[N,4] 110 | # xfa_in : trident feat pooled from initial xf for reuse 111 | # add_box : list of boxes to add roi list(add_box)=bnum, add_box[i] = Tensor[M,4] 112 | # pool_xf : pool-align feat from xf rather than yf 113 | 114 | # change channel num of input feature 115 | xf = self.conv_x(xf_in) if xfa_in is None else None 116 | yf = self.conv_y(yf_in) 117 | # roi_align pooling from xf according to xb coordinates 118 | # use given feature if pooled feat xfa is already given 119 | xfa_tri = self.pool_feat(xf, xb) if xfa_in is None else xfa_in 120 | 121 | # fmap cross correlation + detection head = class, regression maps 122 | zf,cl,re,at = self.corr_head(xfa_tri, yf) 123 | pred_maps = (cl,re,at) 124 | 125 | # ==== obtain ROI bounding boxes and pooled features 126 | # nms stage for box predictions : bboxes+scores 127 | pred_roibb, pred_roisc = self.boxes(cl,re) 128 | # add previous box (if exists) 129 | if add_box is not None: 130 | for bi in range(len(pred_roibb)): 131 | pred_roibb[bi] = torch.cat((pred_roibb[bi], add_box[bi]),dim=0) 132 | 133 | # pool feats for given boxes yf with shapes: yfa = [bnum, bbnum, cnum, pool_size, pool_size] 134 | num_boxes = self.boxes.bb_nums if add_box is None else self.boxes.bb_nums+add_box[0].shape[0] 135 | yf = xf if pool_xf else yf # for initial frame feature fetching purposes 136 | yfa = roi_align(yf, pred_roibb, (self.pool_size,self.pool_size), self.scale_f) 137 | yfa = yfa.view(yf.shape[0], num_boxes, yf.shape[1], self.pool_size, self.pool_size) 138 | 139 | # (if specified) embed context feature into ROI features (yfa) based on box predictions (cl,re) 140 | if self.head_ctxff[0]: 141 | yfa = self.context_y(cl,re, zf,yfa,pred_roibb) if not pool_xf else self.context_x(cl,re, zf,yfa,pred_roibb) 142 | 143 | # feats = (xfa_tri, xfa_pos, yfa) 144 | pred_feats = (xfa_tri, xfa_tri[0], yfa) 145 | 146 | return pred_roibb, pred_roisc, pred_feats, pred_maps 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /output/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanghoonChoi/TACT/6870cf45a489f4ebd610f25d099ab5f7470b22e4/output/__init__.py -------------------------------------------------------------------------------- /test_tracker.py: -------------------------------------------------------------------------------- 1 | import os,sys,argparse,time,cv2 
2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from cfgs import cfg_test as cfg 6 | import torch 7 | 8 | from tracker import Tracker 9 | # from tracker_batch import Tracker 10 | 11 | torch.backends.cudnn.deterministic = True 12 | torch.backends.cudnn.benchmark = False 13 | 14 | _db_name = 'lasot' 15 | _save_txt = True 16 | _calc_auc = True 17 | _out_vid = False 18 | 19 | 20 | def run_eval(idx=-1): 21 | tracker = Tracker(cfg=cfg, db_name=_db_name, idx=idx) 22 | tic = time.time() 23 | res, fps, auc = tracker.run_track_db(seq_list=None, save_res=_save_txt, calc_auc=_calc_auc, out_vid=_out_vid) 24 | 25 | if _calc_auc: 26 | res_str = 'db: '+ _db_name + ', auc: '+str(np.mean(auc))[:6]+ ', fps: '+str(np.mean(fps))[:5]+ ', ckpt: '+tracker.chkpt_file[5:-4] + '\n' 27 | with open('all_results.txt','a') as res_file: 28 | res_file.write(res_str) 29 | 30 | print 'elaptime ' + str((time.time()-tic)/60.)[:6] + ' mins' 31 | return np.mean(auc) 32 | 33 | 34 | 35 | run_eval() 36 | 37 | -------------------------------------------------------------------------------- /th_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from utils import down2n 6 | 7 | # torch implementation of np.random.choice 8 | def th_choice(a, p=None): 9 | """ torch implementation of np.random.choice(), x1.1~1.5 slower than original function """ 10 | # preliminaries 11 | a_l = len(a) 12 | if p is None: 13 | idx = torch.randperm(a_l) 14 | return a[idx[0]] 15 | 16 | elif torch.sum(p) < 1.: 17 | print torch.sum(p),' p.sum() not 1' 18 | 19 | # accumulative prob 20 | pa = torch.cumsum(p,0) 21 | 22 | # random (0,1) 23 | trnd = torch.rand(1)[0] 24 | 25 | # find 26 | idx = (torch.argmax((pa < trnd).type(torch.FloatTensor))+1) % a_l 27 | return a[idx] 28 | 29 | 30 | def th_choice_mul(a, n): 31 | # choose n random instances from a 32 | # assume p=uniform, with replacement 33 | a_l = len(a) 34 | idxs = torch.randint(low=0, high=a_l, size=(n,)) 35 | 36 | if isinstance(a, list): 37 | return [a[i] for i in idxs] 38 | elif n==1: 39 | return [a[idxs]] 40 | else: 41 | return a[idxs] 42 | 43 | 44 | def th_choice_seq(a, n): 45 | # choose n sequential instances from a 46 | # assume p=uniform, with replacement 47 | a_l = len(a) 48 | if n <= a_l: 49 | idx = torch.randint(low=0, high=a_l-n+1, size=()) 50 | idxs = torch.LongTensor(range(idx, idx+n)) 51 | else: 52 | idxs = torch.LongTensor(range(a_l)+[a_l-1]*(n-a_l)) 53 | 54 | if isinstance(a, list): 55 | return [a[i] for i in idxs] 56 | elif n==1: 57 | return [a[idxs]] 58 | else: 59 | return a[idxs] 60 | 61 | 62 | def th_rand(n=1): 63 | """ proxy to torch.rand(n)[0] """ 64 | if n == 1: 65 | return float(torch.rand(n)[0]) 66 | else: 67 | return torch.rand(n).numpy() 68 | 69 | 70 | def th_rand_rng(low, high, n=1): 71 | """ pull uniform random sample(s) from [a,b) """ 72 | if n == 1: 73 | return (high-low)*float(torch.rand(n)[0])+low 74 | else: 75 | return (high-low)*torch.rand(n)+low 76 | 77 | 78 | def th_rand_sym(r, n=1): 79 | """ pull random sample(s) from [1/r,r), keeping probability mean to 1.0 """ 80 | def unit_rnd(r): 81 | ud_rf = 1 if th_rand() < 1./(r+1.) 
else 0 82 | rnd = th_rand_rng(1.,r) if ud_rf else th_rand_rng(1./r,1) 83 | return rnd 84 | 85 | if n == 1: 86 | return unit_rnd(r) 87 | else: 88 | return torch.Tensor([unit_rnd(r) for i in range(n)]) 89 | 90 | 91 | def th_randint(low, high=None, size=1): 92 | """ proxy to torch.randint(low,high,(size,)) """ 93 | if high is None: ilow = 0; ihigh = low 94 | else: ilow = low; ihigh = high 95 | 96 | if size == 1: 97 | return torch.randint(low=ilow, high=ihigh, size=(size,)).numpy()[0] 98 | else: 99 | return torch.randint(low=ilow, high=ihigh, size=(size,)).numpy() 100 | 101 | 102 | # generate center-anchor cooridnates for a given img_size and pooling size 103 | def generate_reg_coords(cfg): 104 | map_size = (down2n(cfg.im_size[0],cfg.conv_npool[-1]),down2n(cfg.im_size[1],cfg.conv_npool[-1])) 105 | 106 | batch_gtr = np.zeros([map_size[0], map_size[1], 4]) #[map_h, map_w, ltrb] 107 | grid_r = np.tile(np.arange(0.5, 0.5+map_size[0], 1.).reshape([-1,1]),(1,map_size[1])) 108 | grid_c = np.tile(np.arange(0.5, 0.5+map_size[1], 1.).reshape([1,-1]),(map_size[0],1)) 109 | map_scale = float(cfg.im_size[0])/float(map_size[0]) 110 | 111 | batch_gtr[:,:,0] = grid_c # left 112 | batch_gtr[:,:,1] = grid_r # top 113 | batch_gtr[:,:,2] = grid_c # right 114 | batch_gtr[:,:,3] = grid_r # bottom 115 | batch_gtr *= map_scale # rescale map by size 116 | 117 | return batch_gtr 118 | 119 | 120 | -------------------------------------------------------------------------------- /track_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from utils import crop_img 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def result_curve(result, num_points): 8 | #num_seqs = 1 #len(result) 9 | all_seq_plot = np.zeros(num_points) 10 | 11 | #for i in range(num_seqs): 12 | seq_iou = result 13 | seq_len = np.size(result) 14 | seq_plot = list() 15 | 16 | bb_thvars = np.linspace(0,1,num_points) 17 | for bbth in bb_thvars: 18 | ratio_th = np.sum(seq_iou > bbth).astype(float) / seq_len 19 | seq_plot.append(ratio_th) 20 | 21 | return np.array(seq_plot) 22 | 23 | 24 | def result_curve_px(result, num_points): 25 | num_seqs = len(result) 26 | all_ious = np.array([]) 27 | 28 | for i in range(num_seqs): 29 | all_ious = np.append(all_ious, result[i]) 30 | 31 | num_frames = len(all_ious) 32 | bb_thvars = np.linspace(0,50,num_points) 33 | all_ratio_th = np.array([]) 34 | 35 | for bbth in bb_thvars: 36 | ratio_th = np.sum(all_ious <= bbth).astype(float) / num_frames 37 | all_ratio_th = np.append(all_ratio_th, ratio_th) 38 | 39 | return all_ratio_th 40 | 41 | 42 | def box_overlap_area(A,B): 43 | if A.ndim == 1: 44 | A_xmin = A[0]; A_xmax = A_xmin+A[2]; A_ymin = A[1]; A_ymax = A_ymin+A[3] 45 | B_xmin = B[0]; B_xmax = B_xmin+B[2]; B_ymin = B[1]; B_ymax = B_ymin+B[3] 46 | # x,y dim overlap? 47 | x_over = max(0, min(A_xmax,B_xmax)-max(A_xmin,B_xmin)) 48 | y_over = max(0, min(A_ymax,B_ymax)-max(A_ymin,B_ymin)) 49 | # area of overlap 50 | area_overlap = x_over*y_over 51 | return area_overlap 52 | else: 53 | num_d = A.shape[0] 54 | A_xmin = A[:,0]; A_xmax = A_xmin+A[:,2]; A_ymin = A[:,1]; A_ymax = A_ymin+A[:,3] 55 | B_xmin = B[:,0]; B_xmax = B_xmin+B[:,2]; B_ymin = B[:,1]; B_ymax = B_ymin+B[:,3] 56 | # x,y dim overlap? 
57 | x_over = np.max([np.zeros(num_d), np.min([A_xmax,B_xmax], axis=0)-np.max([A_xmin,B_xmin], axis=0)], axis=0) 58 | y_over = np.max([np.zeros(num_d), np.min([A_ymax,B_ymax], axis=0)-np.max([A_ymin,B_ymin], axis=0)], axis=0) 59 | # area of overlap 60 | area_overlap = x_over*y_over 61 | return area_overlap 62 | 63 | 64 | def box_overlap_score(A,B): 65 | if A.ndim == 1: 66 | A_width = A[2]; A_height = A[3]; B_width = B[2]; B_height = B[3]; 67 | A_area = A[2]*A[3]; B_area = B[2]*B[3]; 68 | area_overlap = box_overlap_area(A,B) 69 | area_union = A_area + B_area - area_overlap 70 | return area_overlap / area_union 71 | else: 72 | A_width = A[:,2]; A_height = A[:,3]; B_width = B[:,2]; B_height = B[:,3]; 73 | A_area = A[:,2]*A[:,3]; B_area = B[:,2]*B[:,3]; 74 | area_overlap = box_overlap_area(A,B) 75 | area_union = A_area + B_area - area_overlap 76 | return area_overlap / area_union 77 | 78 | -------------------------------------------------------------------------------- /tracker.py: -------------------------------------------------------------------------------- 1 | import os,sys,time,cv2 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import moviepy.editor as mpe 6 | 7 | import torch, torchvision 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | from model.build_model import Track_Model 12 | from utils import crop_img, imread_to_rgb 13 | from track_utils import box_overlap_score, result_curve 14 | 15 | 16 | # tracker object 17 | class Tracker(nn.Module): 18 | def __init__(self, cfg, db_name=None, idx=-1): 19 | super(type(self), self).__init__() 20 | # model object 21 | cfg.batch_size = 1 22 | self.net = Track_Model(cfg).cuda() 23 | self.net.eval() 24 | self.cfg = cfg 25 | # tracking db placeholders 26 | self.track_dbnm = None 27 | self.track_path = None 28 | self.track_dict = None 29 | # load model weights 30 | self.chkpt_file = [ckpt for ckpt in sorted(os.listdir(cfg.CHKPT_PATH)) if ckpt.find(cfg.CHKPT_CODE)>0][idx] 31 | ckpt = torch.load(cfg.CHKPT_PATH+self.chkpt_file) 32 | self.net.load_state_dict(ckpt['model_state_dict'], strict=False) 33 | print 'ckpt: ' + self.chkpt_file 34 | # load db 35 | if db_name is not None: 36 | self.load_track_db(db_name) 37 | 38 | 39 | def load_track_db(self, name): 40 | # load dataset 41 | self.track_dbnm = name 42 | self.track_path = self.cfg.db_info[name]['path'] 43 | self.track_dict = np.load(self.cfg.db_info[name]['dict'], allow_pickle=True).item() 44 | print 'dataset: ' + name 45 | 46 | 47 | def read_img_resize(self, imf): 48 | img_orig = imread_to_rgb(imf) 49 | h_orig, w_orig, _ = img_orig.shape 50 | MAX_H, MAX_W = self.cfg.im_size 51 | s_f = float(MAX_H) / float(h_orig) 52 | if float(w_orig)*s_f > MAX_W: 53 | s_f = float(MAX_W) / float(w_orig) 54 | img_mod = cv2.resize(img_orig, (int(w_orig*s_f), int(h_orig*s_f)) ) 55 | h_mod, w_mod, _ = img_mod.shape 56 | img_zero = np.zeros([MAX_H, MAX_W, 3]) 57 | img_zero[:h_mod, :w_mod, :] = img_mod 58 | return img_zero, s_f 59 | 60 | 61 | def run_track_seq(self, seq_name, save_res=False): 62 | # preliminary 63 | seq_dict = self.track_dict[seq_name] 64 | seq_path = seq_name if not seq_dict.has_key('path') else seq_dict['path'] 65 | if self.track_dbnm is not 'got10k': 66 | seq_path = os.path.join(seq_path, 'img/') 67 | seq_path = os.path.join(self.track_path, seq_path) 68 | # results placeholder 69 | seq_len = len(seq_dict['img']) 70 | seq_res, seq_fps = np.zeros([seq_len,4]), np.zeros(seq_len) 71 | 72 | # tracking part 73 | for i, imf in enumerate(seq_dict['img']): 74 | 
sys.stdout.write("\r"+str(i)+'/'+str(seq_len-1)) 75 | 76 | # init frame, extract feats 77 | if i == 0: 78 | # init state = [xmin, ymin , width, height] 79 | state = seq_dict['gt'][0,:].copy().astype(float) 80 | seq_res[i] = state.copy() 81 | # init frame 82 | im_frame, s_f = self.read_img_resize(os.path.join(seq_path, imf)) 83 | tic = time.time() 84 | # convert state to [xmin, ymin, xmax, ymax]*scale_factor 85 | state_mod = np.array([state[0], state[1], state[0]+state[2], state[1]+state[3]])*s_f 86 | state_net = torch.Tensor(state_mod).unsqueeze(0).cuda() 87 | # init feats 88 | net_im = torch.Tensor(im_frame).unsqueeze(0).permute(0,3,1,2).cuda() 89 | net_bb = [state_net] 90 | with torch.no_grad(): 91 | xfa = self.net.get_feats_xfa(net_im, net_bb) 92 | seq_fps[i] = 1./(time.time()-tic) 93 | continue 94 | 95 | # subsequent frames 96 | # read img 97 | im_frame, _ = self.read_img_resize(os.path.join(seq_path, imf)) 98 | tic = time.time() 99 | # find target 100 | net_im = torch.Tensor(im_frame).unsqueeze(0).permute(0,3,1,2).cuda() 101 | with torch.no_grad(): 102 | net_out_bb, _, _ = self.net.forward_box(None,net_im, None, xfa=xfa, nbox=1) #add_box=[state_net] 103 | state_net = net_out_bb[0].detach() 104 | state_mod = state_net.squeeze().cpu().numpy() / s_f 105 | state = np.array([state_mod[0], state_mod[1], state_mod[2]-state_mod[0], state_mod[3]-state_mod[1]]) 106 | # store results 107 | seq_res[i] = state.copy() 108 | seq_fps[i] = 1./(time.time()-tic) 109 | 110 | if save_res: 111 | np.savetxt('output/'+seq_name+'.txt', seq_res, fmt='%.4f', delimiter=',') 112 | 113 | return seq_res, seq_fps 114 | 115 | 116 | def run_track_db(self, seq_list=None, out_vid=False, calc_auc=True, save_res=False): 117 | # results placeholder 118 | db_res = dict() 119 | db_fps = [] 120 | db_auc = [] 121 | db_suc = [] 122 | # per-sequence operation 123 | seq_list = self.track_dict.keys() if seq_list is None else seq_list 124 | seq_nums = len(seq_list) 125 | for s_i, seq in enumerate(seq_list): 126 | # seq name 127 | print '('+ str(s_i+1) +'/' + str(seq_nums) + '):' + seq 128 | # run tracking 129 | seq_res, seq_fps = self.run_track_seq(seq, save_res=save_res) 130 | db_res[seq] = seq_res 131 | db_fps.append(seq_fps.mean()) 132 | # calc and display auc 133 | if calc_auc: 134 | seq_iou = box_overlap_score(seq_res, self.track_dict[seq]['gt']) 135 | seq_suc = seq_iou>0.5 136 | seq_auc = result_curve(seq_iou, 21) 137 | db_auc.append(seq_auc) 138 | db_suc.append(seq_suc) 139 | print ', fps: ' + str(seq_fps.mean())[:6], 140 | print ', suc: ' + str(float(np.sum(seq_suc))/seq_res.shape[0])[:6], 141 | print ', auc: ' + str(np.mean(seq_auc))[:6] + ', mean_auc: ' + str(np.mean(db_auc))[:6] 142 | if out_vid: 143 | self.draw_vid_seq(seq_res, seq) 144 | 145 | # display overall results 146 | if calc_auc: 147 | print '\nmean fps: ' + str(np.mean(db_fps))[:6] 148 | print 'mean suc: ' + str(np.mean(np.concatenate(db_suc)))[:6] 149 | print 'mean auc: ' + str(np.mean(db_auc))[:6] 150 | 151 | return db_res, db_fps, db_auc 152 | 153 | 154 | def draw_vid_seq(self, seq_res, seq_name): 155 | print '> make video seq...', 156 | # preliminaries 157 | seq_dict = self.track_dict[seq_name] 158 | seq_path = seq_name if not seq_dict.has_key('path') else seq_dict['path'] 159 | if self.track_dbnm is not 'got10k': 160 | seq_path = os.path.join(seq_path, 'img/') 161 | seq_path = os.path.join(self.track_path, seq_path) 162 | seq_len = len(seq_dict['img']) 163 | # draw for all frames 164 | im_slist = [] 165 | for i, imf in enumerate(seq_dict['img']): 166 | # 
read img 167 | im_frame = imread_to_rgb(os.path.join(seq_path,imf)) 168 | # draw bb = [xmin, ymin, width, height] 169 | bb = seq_res[i].astype(int) 170 | im_frame = cv2.rectangle(im_frame, (bb[0], bb[1]), (bb[0]+bb[2], bb[1]+bb[3]), (1,0,0), 3) 171 | # fnum text 172 | fnum_str = str('%04d'%i) 173 | im_frame = cv2.putText(im_frame, fnum_str, (0,im_frame.shape[0]), cv2.FONT_HERSHEY_DUPLEX, im_frame.shape[0]/350., (1,1,0)) 174 | # save img 175 | im_sname = os.path.join('.temp/', seq_name +'_'+ fnum_str + '.jpg') 176 | im_slist.append(im_sname) 177 | plt.imsave(im_sname, im_frame) 178 | 179 | # encode video 180 | vid_clip = mpe.ImageSequenceClip(im_slist, fps=30) 181 | vid_clip.write_videofile('test.mp4', logger=None) 182 | print 'done' 183 | return 184 | 185 | 186 | def clean_temp_dir(self, temp_dir='.temp/'): 187 | flist = os.listdir(temp_dir) 188 | for f in flist: 189 | os.remove(os.path.join(temp_dir, f)) 190 | print '> cleaned cache folder' 191 | return 192 | 193 | -------------------------------------------------------------------------------- /tracker_batch.py: -------------------------------------------------------------------------------- 1 | import os,sys,time,cv2 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import moviepy.editor as mpe 6 | 7 | import torch, torchvision 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.utils.data import Dataset, DataLoader 11 | 12 | from model.build_model import Track_Model 13 | from utils import crop_img, imread_to_rgb 14 | from track_utils import box_overlap_score, result_curve 15 | 16 | _batch_size = 8 17 | _num_thread = 2 18 | 19 | # tracking dataset 20 | class Seq_dataset(Dataset): 21 | def __init__(self, cfg, seq_path, seq_imlist): 22 | self.cfg = cfg 23 | self.seq_path = seq_path 24 | self.seq_imlist = seq_imlist 25 | self.len = len(seq_imlist)-1 26 | 27 | def __len__(self): 28 | return self.len 29 | 30 | def read_img_resize(self, imf): 31 | img_orig = imread_to_rgb(imf) 32 | h_orig, w_orig, _ = img_orig.shape 33 | MAX_H, MAX_W = self.cfg.im_size 34 | s_f = float(MAX_H) / float(h_orig) 35 | if float(w_orig)*s_f > MAX_W: 36 | s_f = float(MAX_W) / float(w_orig) 37 | img_mod = cv2.resize(img_orig, (int(w_orig*s_f), int(h_orig*s_f)) ) 38 | h_mod, w_mod, _ = img_mod.shape 39 | img_zero = np.zeros([MAX_H, MAX_W, 3]) 40 | img_zero[:h_mod, :w_mod, :] = img_mod 41 | return img_zero, s_f 42 | 43 | def __getitem__(self, idx): 44 | seq_imlist = self.seq_imlist[1:] 45 | im_path = os.path.join(self.seq_path, seq_imlist[idx]) 46 | im_frame,_ = self.read_img_resize(im_path) 47 | out_im = torch.Tensor(im_frame).permute(2,0,1) 48 | return out_im 49 | 50 | 51 | # tracker object 52 | class Tracker(nn.Module): 53 | def __init__(self, cfg, db_name=None, idx=-1): 54 | super(type(self), self).__init__() 55 | # model object 56 | cfg.batch_size = _batch_size 57 | self.net = Track_Model(cfg).cuda() 58 | self.net.eval() 59 | self.cfg = cfg 60 | # tracking db placeholders 61 | self.track_dbnm = None 62 | self.track_path = None 63 | self.track_dict = None 64 | # load model weights 65 | self.chkpt_file = [ckpt for ckpt in sorted(os.listdir(cfg.CHKPT_PATH)) if ckpt.find(cfg.CHKPT_CODE)>0][idx] 66 | ckpt = torch.load(cfg.CHKPT_PATH+self.chkpt_file) 67 | self.net.load_state_dict(ckpt['model_state_dict'], strict=False) 68 | print 'ckpt: ' + self.chkpt_file 69 | # load db 70 | if db_name is not None: 71 | self.load_track_db(db_name) 72 | 73 | 74 | def load_track_db(self, name): 75 | # load dataset 76 | self.track_dbnm = name 77 | 
self.track_path = self.cfg.db_info[name]['path'] 78 | self.track_dict = np.load(self.cfg.db_info[name]['dict'], allow_pickle=True).item() 79 | print 'dataset: ' + name 80 | 81 | 82 | def read_img_resize(self, imf): 83 | img_orig = imread_to_rgb(imf) 84 | h_orig, w_orig, _ = img_orig.shape 85 | MAX_H, MAX_W = self.cfg.im_size 86 | s_f = float(MAX_H) / float(h_orig) 87 | if float(w_orig)*s_f > MAX_W: 88 | s_f = float(MAX_W) / float(w_orig) 89 | img_mod = cv2.resize(img_orig, (int(w_orig*s_f), int(h_orig*s_f)) ) 90 | h_mod, w_mod, _ = img_mod.shape 91 | img_zero = np.zeros([MAX_H, MAX_W, 3]) 92 | img_zero[:h_mod, :w_mod, :] = img_mod 93 | return img_zero, s_f 94 | 95 | 96 | def run_track_seq(self, seq_name, seq_path, seq_imlist, seq_gt, save_res=False): 97 | # preliminary 98 | if ['got10k', 'trackingnet', 'uav123', 'uav20l', 'nuspro'].count(self.track_dbnm) == 0: 99 | seq_path = os.path.join(seq_path, 'img/') 100 | seq_path = os.path.join(self.track_path, seq_path) 101 | # results placeholder 102 | seq_len = len(seq_imlist) 103 | seq_res, seq_fps = [],[] 104 | # seq db 105 | seq_tdb = Seq_dataset(self.cfg, seq_path, seq_imlist) 106 | seq_tdl = DataLoader(seq_tdb, batch_size=self.cfg.batch_size, num_workers=_num_thread) 107 | # initial frame 108 | i = 0 109 | # init state = [xmin, ymin , width, height] 110 | state = seq_gt[0,:].copy().astype(float) 111 | seq_res.append(np.expand_dims(state.copy(),0)) 112 | # init frame 113 | im_frame, s_f = self.read_img_resize(os.path.join(seq_path, seq_imlist[0])) 114 | # convert state to [xmin, ymin, xmax, ymax]*scale_factor 115 | state_mod = np.array([state[0], state[1], state[0]+state[2], state[1]+state[3]])*s_f 116 | # init feats 117 | net_im = torch.Tensor(im_frame).unsqueeze(0).permute(0,3,1,2).repeat_interleave(self.cfg.batch_size,0).cuda() 118 | net_bb = [torch.Tensor(state_mod).unsqueeze(0).cuda()]*self.cfg.batch_size 119 | with torch.no_grad(): 120 | xfa = self.net.get_feats_xfa(net_im, net_bb) 121 | 122 | # tracking part 123 | for i, im_frame in enumerate(seq_tdl): 124 | sys.stdout.write("\r"+str((i)*self.cfg.batch_size)+'/'+str(seq_len)) 125 | # subsequent frames 126 | # read img 127 | tic = time.time() 128 | temp_sz = im_frame.shape[0] 129 | net_im = torch.zeros(self.cfg.batch_size, 3, self.cfg.im_size[0], self.cfg.im_size[1]) 130 | net_im[:temp_sz] = im_frame 131 | net_im = net_im.cuda() 132 | # find target 133 | with torch.no_grad(): 134 | net_out_bb, _, _ = self.net.forward_box(None,net_im, None, xfa=xfa, nbox=1) 135 | state_mod = net_out_bb.squeeze().detach().cpu().numpy() / s_f 136 | state = np.zeros_like(state_mod) 137 | state[:,0], state[:,1], state[:,2], state[:,3] = state_mod[:,0], state_mod[:,1], state_mod[:,2]-state_mod[:,0], state_mod[:,3]-state_mod[:,1] 138 | # store results 139 | seq_res.append(state.copy()) 140 | seq_fps.append((time.time()-tic)) 141 | 142 | # concat dims 143 | seq_res = np.concatenate(seq_res)[:seq_len] 144 | seq_fps = 1./(np.sum(seq_fps)/float(seq_len)) 145 | # save res 146 | if save_res: 147 | 148 | if self.track_dbnm == 'got10k': 149 | os.mkdir('output/'+seq_name) 150 | np.savetxt('output/'+seq_name+'/'+seq_name+'_001.txt', seq_res, fmt='%.4f', delimiter=',') 151 | else: 152 | np.savetxt('output/'+seq_name+'.txt', seq_res, fmt='%.4f', delimiter=',') 153 | 154 | return seq_res, seq_fps 155 | 156 | 157 | def run_track_db(self, seq_list=None, out_vid=False, calc_auc=True, save_res=False): 158 | # results placeholder 159 | db_res = dict() 160 | db_fps = [] 161 | db_auc = [] 162 | db_suc = [] 163 | # per-sequence 
operation 164 | seq_list = self.track_dict.keys() if seq_list is None else seq_list 165 | seq_nums = len(seq_list) 166 | for s_i, seq in enumerate(seq_list): 167 | # print seq name 168 | print '('+ str(s_i+1) +'/' + str(seq_nums) + '):' + seq 169 | # seq path+imlist+gt 170 | seq_dict = self.track_dict[seq] 171 | seq_path = seq if not seq_dict.has_key('path') else seq_dict['path'] 172 | seq_imlist = seq_dict['img'] 173 | seq_gt = seq_dict['gt'] 174 | # run tracking 175 | seq_res, seq_fps = self.run_track_seq(seq, seq_path, seq_imlist, seq_gt, save_res=save_res) 176 | db_res[seq] = seq_res 177 | db_fps.append(seq_fps.mean()) 178 | # calc and display auc 179 | if calc_auc: 180 | seq_iou = box_overlap_score(seq_res, self.track_dict[seq]['gt']) 181 | seq_suc = seq_iou>0.5 182 | seq_auc = result_curve(seq_iou, 21) 183 | db_auc.append(seq_auc) 184 | db_suc.append(seq_suc) 185 | print ', fps: ' + str(seq_fps.mean())[:6], 186 | print ', suc: ' + str(float(np.sum(seq_suc))/seq_res.shape[0])[:6], 187 | print ', auc: ' + str(np.mean(seq_auc))[:6] + ', mean_auc: ' + str(np.mean(db_auc))[:6] 188 | if out_vid: 189 | self.draw_vid_seq(seq_res, seq) 190 | 191 | # display overall results 192 | if calc_auc: 193 | print '\nmean fps: ' + str(np.mean(db_fps))[:6] 194 | print 'mean suc: ' + str(np.mean(np.concatenate(db_suc)))[:6] 195 | print 'mean auc: ' + str(np.mean(db_auc))[:6] 196 | 197 | return db_res, db_fps, db_auc 198 | 199 | 200 | def draw_vid_seq(self, seq_res, seq_name): 201 | print '> make video seq...', 202 | # preliminaries 203 | seq_dict = self.track_dict[seq_name] 204 | seq_path = seq_name if not seq_dict.has_key('path') else seq_dict['path'] 205 | if self.track_dbnm is not 'got10k': 206 | seq_path = os.path.join(seq_path, 'img/') 207 | seq_path = os.path.join(self.track_path, seq_path) 208 | seq_len = len(seq_dict['img']) 209 | # draw for all frames 210 | im_slist = [] 211 | for i, imf in enumerate(seq_dict['img']): 212 | # read img 213 | im_frame = imread_to_rgb(os.path.join(seq_path,imf)) 214 | # draw bb = [xmin, ymin, width, height] 215 | bb = seq_res[i].astype(int) 216 | im_frame = cv2.rectangle(im_frame, (bb[0], bb[1]), (bb[0]+bb[2], bb[1]+bb[3]), (1,0,0), 3) 217 | # fnum text 218 | fnum_str = str('%04d'%i) 219 | im_frame = cv2.putText(im_frame, fnum_str, (0,im_frame.shape[0]), cv2.FONT_HERSHEY_DUPLEX, im_frame.shape[0]/350., (1,1,0)) 220 | # save img 221 | im_sname = os.path.join('.temp/', seq_name +'_'+ fnum_str + '.jpg') 222 | im_slist.append(im_sname) 223 | plt.imsave(im_sname, im_frame) 224 | 225 | # encode video 226 | vid_clip = mpe.ImageSequenceClip(im_slist, fps=30) 227 | vid_clip.write_videofile('test.mp4', logger=None) 228 | print 'done' 229 | return 230 | 231 | 232 | def clean_temp_dir(self, temp_dir='.temp/'): 233 | flist = os.listdir(temp_dir) 234 | for f in flist: 235 | os.remove(os.path.join(temp_dir, f)) 236 | print '> cleaned cache folder' 237 | return 238 | 239 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import cv2,time 4 | 5 | 6 | def get_dtstr(sec=True): 7 | tst = time.localtime() 8 | if sec: 9 | outstr = str(tst.tm_year)[-2:] + str('%02d' % tst.tm_mon) + str('%02d' % tst.tm_mday) + str('%02d' % tst.tm_hour)+ str('%02d' % tst.tm_min)+ str('%02d' % tst.tm_sec) 10 | else: 11 | outstr = str(tst.tm_year)[-2:] + str('%02d' % tst.tm_mon) + str('%02d' % tst.tm_mday) + str('%02d' % 
tst.tm_hour)+ str('%02d' % tst.tm_min) 12 | return outstr 13 | 14 | def imread_to_rgb(path): 15 | img_in = np.flip(cv2.imread(path, flags=cv2.IMREAD_COLOR), 2)/255. 16 | return img_in 17 | 18 | def crop_img(I, x, y, w, h, center=False, mfill=False): 19 | im_h = I.shape[0] 20 | im_w = I.shape[1] 21 | 22 | if center: 23 | w0 = w // 2; w1 = w - w0 # w = w0+w1 24 | h0 = h // 2; h1 = h - h0 # h = h0+h1 25 | 26 | x_min = x - w0; x_max = x+w1-1; 27 | y_min = y - h0; y_max = y+h1-1; 28 | else: 29 | x_min = x; x_max = x+w-1; 30 | y_min = y; y_max = y+h-1; 31 | 32 | pad_l = 0; pad_r = 0; 33 | pad_u = 0; pad_d = 0; 34 | 35 | # bounds 36 | if x_min < 0: pad_l = -x_min; x_min = 0; 37 | if x_max > im_w-1: pad_r = x_max-(im_w-1); x_max = im_w-1; 38 | if y_min < 0: pad_u = -y_min; y_min = 0; 39 | if y_max > im_h-1: pad_d = y_max-(im_h-1); y_max = im_h-1; 40 | 41 | # crop & append 42 | J = I[y_min:y_max+1, x_min:x_max+1, :] 43 | 44 | # 0 size errors 45 | if J.shape[0] == 0 or J.shape[1] == 0: 46 | plt.imsave('crop_error_'+time.strftime('%y%m%d_%H%M%S',time.localtime())+'.png', I) 47 | print 'i: ',I.shape, (x,y,w,h),J.shape 48 | print 'i: ',(y_min,y_max+1),(x_min,x_max+1) 49 | # return black image for zero-dim images 50 | return np.zeros([h,w,3]) 51 | 52 | if mfill: 53 | rsel = np.linspace(0, J.shape[0], 8, endpoint=False, dtype=int) 54 | csel = np.linspace(0, J.shape[1], 8, endpoint=False, dtype=int) 55 | fill = np.mean(J[rsel][:,csel], axis=(0,1)) 56 | else: 57 | fill = (0,0,0) 58 | J = cv2.copyMakeBorder(J, pad_u,pad_d,pad_l,pad_r, cv2.BORDER_CONSTANT, value=fill) 59 | return J 60 | 61 | 62 | def draw_bb_img(img0, x_min,y_min,width,height, color, stroke): 63 | img = img0.copy() 64 | img_h = img.shape[0]; img_w = img.shape[1]; 65 | 66 | x_rng = np.array(range(width)) + x_min 67 | y_rng = np.array(range(height))+ y_min 68 | 69 | x_rng[x_rng> img_w-1-stroke] = img_w-1-stroke 70 | y_rng[y_rng> img_h-1-stroke] = img_h-1-stroke 71 | 72 | x_max = np.max(x_rng) 73 | y_max = np.max(y_rng) 74 | 75 | img[y_min:y_min+stroke][:, x_rng, :] = color # up 76 | img[y_max-stroke:y_max][:, x_rng, :] = color # down 77 | img[:, x_min:x_min+stroke, :][y_rng] = color # left 78 | img[:, x_max-stroke:x_max, :][y_rng] = color # right 79 | 80 | return img 81 | 82 | 83 | def dist_succ(v_pred, v_gt, batch_size): 84 | maxvals = v_pred.max(axis=1).max(axis=1) 85 | v_gt_mod = v_gt.copy() + 1. 86 | 87 | idxs = list(); gt_idxs = list(); 88 | for b_i in range(batch_size): 89 | maxpos = np.where(v_pred == maxvals[b_i])[1:3] 90 | if np.shape(maxpos)[1] > 1: 91 | maxpos = (np.array([maxpos[0][0]]), np.array([maxpos[1][0]])) 92 | idxs.append(maxpos) 93 | gt_idxs.append(center_of_mass(v_gt_mod[b_i])) 94 | 95 | idxs = np.array(idxs).reshape([batch_size, 2]).astype(float) 96 | gt_idxs = np.array(gt_idxs).reshape([batch_size, 2]) 97 | 98 | dist = np.sum( ( idxs - gt_idxs )**2, axis=1 ) 99 | dist = np.sqrt( dist ) 100 | succ = (dist <= np.sqrt(2.)) 101 | 102 | return dist, succ 103 | 104 | 105 | def down2n(x, n): 106 | # returns input length of x after n-times of pooling/strides of 2 107 | if n == 1: 108 | return np.ceil(x/2.).astype(int) 109 | else: 110 | return down2n(np.ceil(x/2.), n-1).astype(int) 111 | 112 | 113 | def gray2jet(I): 114 | # convert input gray image I to jet colormap image J 115 | # trapezoid func map [0,1]->[0,1] (rise:t0~t1, down:t2~t3) 116 | def tpz(xin, t0,t1,t2,t3): 117 | x = xin.copy() 118 | x[xin<=t0] = 0. 119 | x[(xin>t0)*(xin<=t1)] = (xin[(xin>t0)*(xin<=t1)] - t0) / (t1-t0) 120 | x[(xin>t1)*(xin<=t2)] = 1. 
121 | x[(xin>t2)*(xin<=t3)] = (xin[(xin>t2)*(xin<=t3)] - t3) / (t2-t3) 122 | x[xin>t3] = 0. 123 | return x 124 | 125 | # respective rgb channel mappings 126 | J_r = tpz(I, 0.375, 0.625, 0.875, 1.125) 127 | J_g = tpz(I, 0.125, 0.375, 0.625, 0.875) 128 | J_b = tpz(I, -0.125, 0.125, 0.375, 0.625) 129 | 130 | J = np.zeros([I.shape[0], I.shape[1], 3]) 131 | J[:,:,0] = J_r 132 | J[:,:,1] = J_g 133 | J[:,:,2] = J_b 134 | return J 135 | 136 | --------------------------------------------------------------------------------
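A minimal usage sketch for calling `run_track_seq()` from `tracker_batch.py` on an arbitrary sequence (it is not part of the repository code above): the sequence path, image list, and initial box below are placeholders, and `track_path`/`track_dbnm` are set by hand only so that the path joining inside `run_track_seq()` keeps `seq_path` as given.

```python
import os
import numpy as np
from cfgs import cfg_test as cfg
from tracker_batch import Tracker

# build the tracker without loading a dataset dictionary
# (network weights are loaded from cfg.CHKPT_PATH as in test_tracker.py)
tracker = Tracker(cfg=cfg)

# placeholder sequence description
seq_name   = 'my_sequence'
seq_path   = '/path/to/my_sequence'            # folder that directly contains the frames
seq_imlist = sorted(os.listdir(seq_path))      # e.g. ['0001.jpg', '0002.jpg', ...]
seq_gt     = np.array([[100., 150., 80., 60.]])  # initial box, [x_min, y_min, width, height]

# run_track_seq() prepends tracker.track_path to seq_path and, for dataset names
# other than those listed inside the function, appends an 'img/' sub-folder;
# the two assignments below keep seq_path unchanged for this arbitrary sequence
tracker.track_path = ''
tracker.track_dbnm = 'got10k'

seq_res, seq_fps = tracker.run_track_seq(seq_name, seq_path, seq_imlist, seq_gt, save_res=False)
# seq_res: [num_frames, 4] array of [x_min, y_min, width, height] predictions
# seq_fps: average tracking speed in frames per second
```

If per-frame ground truth in the same `[N, 4]` format is available, the per-sequence numbers printed by `run_track_db()` can be reproduced from `seq_res` with `box_overlap_score()` and `result_curve()` from `track_utils.py` (the mean of the 21-point curve is the reported AUC).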