├── .gitignore ├── LICENSE ├── README.md ├── assets ├── PRL-Track_zt.png └── qualitative_evaluation.png ├── experiments └── config.yaml ├── pretrained_models └── alexnet-bn.pth ├── pysot ├── __init__.py ├── core │ ├── __init__.py │ └── config.py ├── datasets │ ├── __init__.py │ ├── anchortarget.py │ ├── augmentation.py │ └── dataset.py ├── models │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── alexnet.py │ │ ├── mobile_v2.py │ │ ├── newalexnet.py │ │ └── resnet_atrous.py │ ├── loss.py │ ├── model_builder.py │ └── utile │ │ ├── coarse_module.py │ │ ├── pos_utils.py │ │ ├── tran.py │ │ ├── utile.py │ │ └── utils.py ├── tracker │ ├── __init__.py │ ├── base_tracker.py │ ├── prl_tracker.py │ └── utils.py └── utils │ ├── __init__.py │ ├── average_meter.py │ ├── bbox.py │ ├── distributed.py │ ├── location_grid.py │ ├── log_helper.py │ ├── lr_scheduler.py │ ├── misc.py │ ├── model_load.py │ └── xcorr.py ├── requirements.txt ├── setup.py ├── toolkit ├── __init__.py ├── datasets │ ├── __init__.py │ ├── dataset.py │ ├── dtb.py │ ├── uav.py │ ├── uav10fps.py │ ├── uav112.py │ ├── uav112l.py │ ├── uav20l.py │ ├── uavdt.py │ ├── video.py │ └── visdrone.py ├── evaluation │ ├── __init__.py │ └── ope_benchmark.py ├── utils │ ├── __init__.py │ ├── c_region.pxd │ ├── misc.py │ ├── region.c │ ├── region.cpython-38-x86_64-linux-gnu.so │ ├── region.pyx │ ├── src │ │ ├── buffer.h │ │ ├── region.c │ │ └── region.h │ └── statistics.py └── visualization │ ├── __init__.py │ ├── draw_success_precision.py │ └── draw_utils.py └── tools ├── eval.py ├── snapshot └── README.md ├── test.py └── train.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vision4robotics/PRL-Track/b89beed235b38f24c93f875276b5221c99f6fb29/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🏃‍♀️ PRL-Track: Progressive Representation Learning for Real-Time UAV Tracking (IROS 2024) 2 | Changhong Fu∗, Xiang Lei, Haobo Zuo, Liangliang Yao, Guangze Zheng, and Jia Pan 3 | 4 | \* Corresponding author. 5 | 6 | This is the official code for the paper "Progressive Representation Learning for Real-Time UAV Tracking". 7 | 8 | ![PRL-Track_zt](./assets/PRL-Track_zt.png) 9 | 10 | ## Abstract 11 | > Visual object tracking has significantly promoted autonomous applications for unmanned aerial vehicles (UAVs). However, learning robust object representations for UAV tracking is especially challenging in complex dynamic environments, when confronted with aspect ratio change and occlusion. These challenges severely alter the original information of the object. To handle the above issues, this work proposes a novel progressive representation learning framework for UAV tracking, i.e., PRL-Track. Specifically, PRL-Track is divided into coarse representation learning and fine representation learning. For coarse representation learning, an innovative appearance-aware regulator and a convenient semantic-aware regulator are designed to mitigate appearance interference and capture semantic information. Furthermore, for fine representation learning, a new hierarchical modeling generator is developed to intertwine coarse object representations. Exhaustive experiments demonstrate that the proposed PRL-Track delivers exceptional performance on three authoritative UAV tracking benchmarks. 
Real-world tests indicate that the proposed PRL-Track realizes superior tracking performance with 42.6 frames per second on a typical UAV platform equipped with an edge smart camera. The code, model, and demo videos are available here. 12 | 13 | ## 🎞️ Video Demo 14 | [![PRL-Track: Progressive Representation Learning for Real-Time UAV Tracking](https://res.cloudinary.com/marcomontalbano/image/upload/v1712205733/video_to_markdown/images/youtube--39lBuLESIFE-c05b58ac6eb4c4700831b2b3070cd403.jpg)](https://www.youtube.com/watch?v=39lBuLESIFE "PRL-Track: Progressive Representation Learning for Real-Time UAV Tracking") 15 | 16 | ## 🛠️ Setup 17 | 18 | ### Requirements 19 | This code has been tested on Ubuntu 18.04, Python 3.8.3, PyTorch 1.13.1, and CUDA 11.6. 20 | 21 | Please install the required libraries before running the code: 22 | 23 | ```bash 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ## 🚀 Getting started 28 | 29 | ### Training 30 | 31 | #### Prepare training datasets 32 | Download the datasets: 33 | * [COCO](http://cocodataset.org) 34 | * [GOT-10K](http://got-10k.aitestunion.com/downloads) 35 | * [LaSOT](http://vision.cs.stonybrook.edu/~lasot) 36 | 37 | Note: Crop the data following the instructions for [COCO](https://github.com/vision4robotics/HiFT/blob/main/training_dataset/coco/readme.md), [GOT-10k](https://github.com/vision4robotics/HiFT/blob/main/training_dataset/got10k/readme.md), and LaSOT. 38 | 39 | Then modify the corresponding dataset paths in `pysot/core/config.py`. 40 | 41 | ```bash 42 | python tools/train.py 43 | ``` 44 | 45 | ### Testing 46 | Download the pretrained model [PRL-Track](https://drive.google.com/drive/folders/1WYQf_zAMy9Xf1tLH1MRmQELe5ywwsB5d?usp=drive_link) and put it into the `tools/snapshot` directory. 47 | 48 | Download the testing datasets and put them into the `test_dataset` directory. If you want to test the tracker on a new dataset, please refer to [pysot-toolkit](https://github.com/StrangerZhang/pysot-toolkit) to set up `test_dataset`. 49 | 50 | ```bash 51 | python tools/test.py 52 | ``` 53 | 54 | The testing results will be saved in the `results/dataset_name/tracker_name` directory, and the experimental results in our paper can be downloaded from [Google Drive](https://drive.google.com/drive/folders/1WYQf_zAMy9Xf1tLH1MRmQELe5ywwsB5d?usp=drive_link). 55 | 56 | ### Evaluation 57 | We provide the tracking results of UAVTrack112, UAVTrack112_L, and UAV123. 58 | 59 | If you want to evaluate the tracker, please put those results into the `results` directory. 60 | 61 | ```bash 62 | python tools/eval.py 63 | ``` 64 | 65 | ## Qualitative Evaluation 66 | ![qualitative evaluation](./assets/qualitative_evaluation.png) 67 | 68 | ## 🥰 Acknowledgments 69 | The code is implemented based on [pysot](https://github.com/STVIR/pysot) and [HiFT](https://github.com/vision4robotics/HiFT). We would like to express our sincere thanks to the contributors. 70 | 71 | ## Contact 72 | If you have any questions, please contact Xiang Lei at [2053932@tongji.edu.cn](mailto:2053932@tongji.edu.cn).
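As an optional sanity check before running `python tools/test.py`, the sketch below (not part of the repo) builds the model from `pysot/models/model_builder.py` and pushes one template/search pair through it. It uses random weights and random images, so it only verifies that the environment, CUDA setup, and model definition work; `tools/test.py` is what actually loads the snapshot from `tools/snapshot` and runs the tracker on the benchmarks.

```python
# Sanity-check sketch: build the model and run one template/search pass.
# Random weights and random images; requires a CUDA device because
# ModelBuilder moves the backbone and grader to the GPU in __init__.
import torch
from pysot.core.config import cfg
from pysot.models.model_builder import ModelBuilder

cfg.merge_from_file("experiments/config.yaml")   # EXEMPLAR_SIZE=127, INSTANCE_SIZE=287

model = ModelBuilder().eval()
z = torch.randn(1, 3, cfg.TRACK.EXEMPLAR_SIZE, cfg.TRACK.EXEMPLAR_SIZE).cuda()
x = torch.randn(1, 3, cfg.TRACK.INSTANCE_SIZE, cfg.TRACK.INSTANCE_SIZE).cuda()

with torch.no_grad():
    model.template(z)        # caches the template features (self.zf)
    out = model.track(x)     # {"cls1": ..., "cls2": ..., "loc": ...}
print({k: tuple(v.shape) for k, v in out.items()})
```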
73 | -------------------------------------------------------------------------------- /assets/PRL-Track_zt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vision4robotics/PRL-Track/b89beed235b38f24c93f875276b5221c99f6fb29/assets/PRL-Track_zt.png -------------------------------------------------------------------------------- /assets/qualitative_evaluation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vision4robotics/PRL-Track/b89beed235b38f24c93f875276b5221c99f6fb29/assets/qualitative_evaluation.png -------------------------------------------------------------------------------- /experiments/config.yaml: -------------------------------------------------------------------------------- 1 | META_ARC: "PRLTrack_alexnet" 2 | 3 | BACKBONE: 4 | TYPE: "alexnet" 5 | PRETRAINED: 'alexnet-bn.pth' 6 | TRAIN_LAYERS: ['layer3','layer4','layer5'] 7 | TRAIN_EPOCH: 10 8 | LAYERS_LR: 0.1 9 | 10 | TRACK: 11 | TYPE: 'PRLTracker' 12 | EXEMPLAR_SIZE: 127 13 | INSTANCE_SIZE: 287 14 | CONTEXT_AMOUNT: 0.5 15 | STRIDE: 8 16 | PENALTY_K: 0.08 17 | WINDOW_INFLUENCE: 0.42 18 | LR: 0.30 19 | w2: 1.0 20 | w3: 1.0 21 | 22 | TRAIN: 23 | EPOCH: 100 24 | START_EPOCH: 0 25 | BATCH_SIZE: 128 26 | NUM_GPU: 2 27 | BASE_LR: 0.005 28 | RESUME: "" 29 | WEIGHT_DECAY : 0.0001 30 | PRETRAINED: 0 31 | OUTPUT_SIZE: 11 32 | NUM_WORKERS: 6 33 | LOC_WEIGHT: 2.4 34 | CLS_WEIGHT: 1.0 35 | w2: 1.0 36 | w3: 1.0 37 | w4: 1.0 38 | w5: 1.0 39 | 40 | POS_NUM : 16 41 | TOTAL_NUM : 64 42 | NEG_NUM : 16 43 | LARGER: 1.0 44 | range : 1.0 45 | LR: 46 | TYPE: 'log' 47 | KWARGS: 48 | start_lr: 0.01 49 | end_lr: 0.0001 50 | 51 | LR_WARMUP: 52 | TYPE: 'step' 53 | EPOCH: 5 54 | KWARGS: 55 | start_lr: 0.005 56 | end_lr: 0.01 57 | step: 1 58 | 59 | DATASET: 60 | NAMES: 61 | - 'COCO' 62 | - 'GOT' 63 | - 'LaSOT' 64 | 65 | TEMPLATE: 66 | SHIFT: 4 67 | SCALE: 0.05 68 | BLUR: 0.0 69 | FLIP: 0.0 70 | COLOR: 1.0 71 | 72 | SEARCH: 73 | SHIFT: 64 74 | SCALE: 0.18 75 | BLUR: 0.2 76 | FLIP: 0.0 77 | COLOR: 1.0 78 | 79 | NEG: 0.05 80 | GRAY: 0.0 81 | -------------------------------------------------------------------------------- /pretrained_models/alexnet-bn.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vision4robotics/PRL-Track/b89beed235b38f24c93f875276b5221c99f6fb29/pretrained_models/alexnet-bn.pth -------------------------------------------------------------------------------- /pysot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vision4robotics/PRL-Track/b89beed235b38f24c93f875276b5221c99f6fb29/pysot/__init__.py -------------------------------------------------------------------------------- /pysot/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vision4robotics/PRL-Track/b89beed235b38f24c93f875276b5221c99f6fb29/pysot/core/__init__.py -------------------------------------------------------------------------------- /pysot/core/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) SenseTime. All Rights Reserved. 
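# This module builds the global yacs configuration node `cfg` that other modules
# import via `from pysot.core.config import cfg`. The DATASET.*.ROOT and
# DATASET.*.ANNO entries below point at machine-specific /mnt/... locations;
# change them to your local cropped-dataset directories before training, as noted
# in the README ("Prepare training datasets").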
2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | from yacs.config import CfgNode as CN 9 | 10 | __C = CN() 11 | 12 | cfg = __C 13 | 14 | __C.META_ARC = "PRL_alexnet" 15 | 16 | __C.CUDA = True 17 | 18 | # ------------------------------------------------------------------------ # 19 | # Training options 20 | # ------------------------------------------------------------------------ # 21 | __C.TRAIN = CN() 22 | 23 | 24 | __C.TRAIN.THR_HIGH = 0.6 25 | 26 | __C.TRAIN.clsandlocchannel = 256 27 | 28 | __C.TRAIN.groupchannel = 32 29 | 30 | __C.TRAIN.THR_LOW = 0.3 31 | 32 | __C.TRAIN.NEG_NUM = 16 33 | 34 | __C.TRAIN.POS_NUM = 16 35 | 36 | __C.TRAIN.TOTAL_NUM = 64 37 | 38 | __C.TRAIN.PR = 1 39 | 40 | __C.TRAIN.CLS_WEIGHT = 1.0 41 | 42 | __C.TRAIN.LOC_WEIGHT = 3.0 43 | 44 | __C.TRAIN.SHAPE_WEIGHT = 2.0 45 | 46 | __C.TRAIN.EXEMPLAR_SIZE = 127 47 | 48 | __C.TRAIN.SEARCH_SIZE = 287 # 255 49 | 50 | __C.TRAIN.OUTPUTFEATURE_SIZE = 11 51 | 52 | __C.TRAIN.LABEL_RANGE = 4 53 | 54 | __C.TRAIN.BASE_SIZE = 8 55 | 56 | __C.TRAIN.OUTPUT_SIZE = 21 # 25 57 | 58 | __C.TRAIN.RESUME = "" 59 | 60 | __C.TRAIN.PRETRAINED = 1 61 | 62 | __C.TRAIN.LARGER = 2.0 63 | 64 | __C.TRAIN.LOG_DIR = "./logs" 65 | 66 | __C.TRAIN.SNAPSHOT_DIR = "./snapshot" 67 | 68 | __C.TRAIN.EPOCH = 30 69 | 70 | __C.TRAIN.START_EPOCH = 0 71 | 72 | __C.TRAIN.BATCH_SIZE = 100 73 | 74 | __C.TRAIN.NUM_GPU = 2 75 | 76 | __C.TRAIN.NUM_WORKERS = 1 77 | 78 | __C.TRAIN.MOMENTUM = 0.9 79 | 80 | __C.TRAIN.WEIGHT_DECAY = 0.0001 81 | 82 | __C.TRAIN.w1 = 1.0 83 | 84 | __C.TRAIN.w2 = 1.0 85 | 86 | __C.TRAIN.w3 = 1.0 87 | 88 | __C.TRAIN.w4 = 1.0 89 | 90 | __C.TRAIN.w5 = 1.0 91 | 92 | __C.TRAIN.range = 2.0 93 | 94 | 95 | __C.TRAIN.MASK_WEIGHT = 1 96 | 97 | __C.TRAIN.PRINT_FREQ = 20 98 | 99 | __C.TRAIN.LOG_GRADS = False 100 | 101 | __C.TRAIN.GRAD_CLIP = 10.0 102 | 103 | __C.TRAIN.BASE_LR = 0.005 104 | 105 | __C.TRAIN.LR = CN() 106 | 107 | __C.TRAIN.LR.TYPE = "log" 108 | 109 | __C.TRAIN.LR.KWARGS = CN(new_allowed=True) 110 | 111 | __C.TRAIN.LR_WARMUP = CN() 112 | 113 | __C.TRAIN.LR_WARMUP.WARMUP = True 114 | 115 | __C.TRAIN.LR_WARMUP.TYPE = "step" 116 | 117 | __C.TRAIN.LR_WARMUP.EPOCH = 5 118 | 119 | __C.TRAIN.LR_WARMUP.KWARGS = CN(new_allowed=True) 120 | 121 | # ------------------------------------------------------------------------ # 122 | # Dataset options 123 | # ------------------------------------------------------------------------ # 124 | __C.DATASET = CN(new_allowed=True) 125 | 126 | # Augmentation 127 | # for template 128 | __C.DATASET.TEMPLATE = CN() 129 | 130 | # for detail discussion 131 | __C.DATASET.TEMPLATE.SHIFT = 4 132 | 133 | __C.DATASET.TEMPLATE.SCALE = 0.05 134 | 135 | __C.DATASET.TEMPLATE.BLUR = 0.0 136 | 137 | __C.DATASET.TEMPLATE.FLIP = 0.0 138 | 139 | __C.DATASET.TEMPLATE.COLOR = 1.0 140 | 141 | __C.DATASET.SEARCH = CN() 142 | 143 | __C.DATASET.SEARCH.SHIFT = 64 144 | 145 | __C.DATASET.SEARCH.SCALE = 0.18 146 | 147 | __C.DATASET.SEARCH.BLUR = 0.0 148 | 149 | __C.DATASET.SEARCH.FLIP = 0.0 150 | 151 | __C.DATASET.SEARCH.COLOR = 1.0 152 | 153 | # for detail discussion 154 | __C.DATASET.NEG = 0.2 155 | 156 | __C.DATASET.GRAY = 0.0 157 | 158 | __C.DATASET.NAMES = ("VID", "COCO", "GOT", "YOUTUBEBB", "LaSOT") 159 | 160 | __C.DATASET.VID = CN() 161 | __C.DATASET.VID.ROOT = "/mnt/new_sdd/OriginDataset/vid/crop511" 162 | __C.DATASET.VID.ANNO = "/mnt/new_sdd/OriginDataset/vid/train.json" 163 | __C.DATASET.VID.FRAME_RANGE = 100 
164 | __C.DATASET.VID.NUM_USE = 100000 # repeat until reach NUM_USE 165 | 166 | __C.DATASET.YOUTUBEBB = CN() 167 | __C.DATASET.YOUTUBEBB.ROOT = "/mnt/sdg/Train_dataset/yt_bb/crop511" 168 | __C.DATASET.YOUTUBEBB.ANNO = "/mnt/sdg/Train_dataset/yt_bb/train.json" 169 | __C.DATASET.YOUTUBEBB.FRAME_RANGE = 3 170 | __C.DATASET.YOUTUBEBB.NUM_USE = -1 # use all not repeat 171 | 172 | __C.DATASET.COCO = CN() 173 | __C.DATASET.COCO.ROOT = "/mnt/new_sdd/OriginDataset/coco2017/crop511" 174 | __C.DATASET.COCO.ANNO = "/mnt/new_sdd/OriginDataset/coco2017/train2017.json" 175 | __C.DATASET.COCO.FRAME_RANGE = 1 176 | __C.DATASET.COCO.NUM_USE = -1 177 | 178 | 179 | __C.DATASET.GOT = CN() 180 | __C.DATASET.GOT.ROOT = "/mnt/new_sdd/OriginDataset/GOT10K/crop511" # GOT dataset path 181 | __C.DATASET.GOT.ANNO = "/mnt/new_sdd/OriginDataset/GOT10K/train.json" 182 | __C.DATASET.GOT.FRAME_RANGE = 50 183 | __C.DATASET.GOT.NUM_USE = 100000 184 | 185 | 186 | __C.DATASET.LaSOT = CN() 187 | __C.DATASET.LaSOT.ROOT = ( 188 | "/mnt/new_sdd/OriginDataset/LaSOT/crop511" # LaSOT dataset path 189 | ) 190 | __C.DATASET.LaSOT.ANNO = "/mnt/new_sdd/OriginDataset/LaSOT/train.json" 191 | __C.DATASET.LaSOT.FRAME_RANGE = 50 # 100 192 | __C.DATASET.LaSOT.NUM_USE = 100000 193 | 194 | __C.DATASET.VIDEOS_PER_EPOCH = 600000 195 | 196 | 197 | # ------------------------------------------------------------------------ # 198 | # Backbone options 199 | # ------------------------------------------------------------------------ # 200 | __C.BACKBONE = CN() 201 | 202 | # Backbone type, current only support resnet18,34,50;alexnet;mobilenet 203 | __C.BACKBONE.TYPE = "alexnet" 204 | 205 | __C.BACKBONE.KWARGS = CN(new_allowed=True) 206 | 207 | # Pretrained backbone weights 208 | __C.BACKBONE.PRETRAINED = "alexnet-bn.pth" 209 | 210 | # Train layers 211 | __C.BACKBONE.TRAIN_LAYERS = ["layer3", "layer4", "layer5"] 212 | 213 | # Layer LR 214 | __C.BACKBONE.LAYERS_LR = 0.1 215 | 216 | # Switch to train layer 217 | __C.BACKBONE.TRAIN_EPOCH = 10 218 | 219 | 220 | # # ------------------------------------------------------------------------ # 221 | # # Anchor options 222 | # # ------------------------------------------------------------------------ # 223 | __C.ANCHOR = CN() 224 | 225 | # # Anchor stride 226 | __C.ANCHOR.STRIDE = 16 227 | 228 | 229 | # ------------------------------------------------------------------------ # 230 | # Tracker options 231 | # ------------------------------------------------------------------------ # 232 | __C.TRACK = CN() 233 | 234 | __C.TRACK.TYPE = "PRLTracker" 235 | 236 | # Scale penalty 237 | __C.TRACK.PENALTY_K = 0.04 238 | 239 | # Window influence 240 | __C.TRACK.WINDOW_INFLUENCE = 0.44 241 | 242 | # Interpolation learning rate 243 | __C.TRACK.LR = 0.4 244 | 245 | __C.TRACK.w1 = 1.2 246 | 247 | __C.TRACK.w2 = 1.0 248 | 249 | __C.TRACK.w3 = 1.6 250 | 251 | __C.TRACK.LARGER = 1.4 252 | # Exemplar size 253 | __C.TRACK.EXEMPLAR_SIZE = 127 254 | 255 | # Instance size 256 | __C.TRACK.INSTANCE_SIZE = 255 257 | 258 | # Base size 259 | __C.TRACK.BASE_SIZE = 8 260 | 261 | __C.TRACK.STRIDE = 8 262 | 263 | # Context amount 264 | __C.TRACK.CONTEXT_AMOUNT = 0.5 265 | 266 | # Long term lost search size 267 | __C.TRACK.LOST_INSTANCE_SIZE = 831 268 | 269 | # Long term confidence low 270 | __C.TRACK.CONFIDENCE_LOW = 0.85 271 | 272 | # Long term confidence high 273 | __C.TRACK.CONFIDENCE_HIGH = 0.998 274 | 275 | # Mask threshold 276 | __C.TRACK.MASK_THERSHOLD = 0.30 277 | 278 | # Mask output size 279 | __C.TRACK.MASK_OUTPUT_SIZE = 127 280 | 
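As a usage sketch (not part of `config.py`): the defaults above are typically combined with `experiments/config.yaml` through the yacs API before training, and the machine-specific dataset paths can be overridden the same way. The `/data/...` locations below are placeholders, not paths from the repo.

```python
# Minimal sketch: merge the experiment YAML into the global cfg and point the
# dataset entries at local data. The /data/... paths are placeholders; replace
# them with your own crop511 directories and train.json annotation files.
from pysot.core.config import cfg

cfg.merge_from_file("experiments/config.yaml")   # yacs CfgNode API
cfg.merge_from_list([
    "DATASET.COCO.ROOT",  "/data/coco2017/crop511",
    "DATASET.COCO.ANNO",  "/data/coco2017/train2017.json",
    "DATASET.GOT.ROOT",   "/data/GOT10K/crop511",
    "DATASET.GOT.ANNO",   "/data/GOT10K/train.json",
    "DATASET.LaSOT.ROOT", "/data/LaSOT/crop511",
    "DATASET.LaSOT.ANNO", "/data/LaSOT/train.json",
])
print(cfg.TRAIN.SEARCH_SIZE, cfg.TRAIN.BATCH_SIZE, cfg.DATASET.NAMES)
```

`tools/train.py` presumably performs the YAML merge itself via a command-line flag; the snippet only illustrates how the two config sources relate.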
-------------------------------------------------------------------------------- /pysot/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vision4robotics/PRL-Track/b89beed235b38f24c93f875276b5221c99f6fb29/pysot/datasets/__init__.py -------------------------------------------------------------------------------- /pysot/datasets/anchortarget.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | from __future__ import unicode_literals 8 | 9 | import numpy as np 10 | import torch as t 11 | from pysot.core.config import cfg 12 | from pysot.utils.bbox import IoU 13 | 14 | 15 | 16 | class AnchorTarget(): 17 | def __init__(self): 18 | 19 | return 20 | def select(self,position, keep_num=16): 21 | num = position[0].shape[0] 22 | if num <= keep_num: 23 | return position, num 24 | slt = np.arange(num) 25 | np.random.shuffle(slt) 26 | slt = slt[:keep_num] 27 | return tuple(p[slt] for p in position), keep_num 28 | 29 | 30 | def get(self,bbox,size): 31 | 32 | labelcls1=np.zeros((1,size,size))-1 33 | 34 | pre=(16*(np.linspace(0,size-1,size))+63).reshape(-1,1)-cfg.TRAIN.SEARCH_SIZE//2 35 | pr=np.zeros((size**2,2)) 36 | pr[:,0]=np.maximum(0,np.tile(pre,(size)).T.reshape(-1)+cfg.TRAIN.SEARCH_SIZE//2) 37 | pr[:,1]=np.maximum(0,np.tile(pre,(size)).reshape(-1)+cfg.TRAIN.SEARCH_SIZE//2) 38 | 39 | labelxff=np.zeros((4, size, size), dtype=np.float32) 40 | 41 | labelcls2=np.zeros((1,size,size)) 42 | weightxff=np.zeros((1,size,size)) 43 | 44 | 45 | target=np.array([bbox.x1,bbox.y1,bbox.x2,bbox.y2]) 46 | 47 | index2=np.int32(np.minimum(size-1,np.maximum(0,(target-63)/16))) 48 | w=int(index2[2]-index2[0]+1) 49 | h=int(index2[3]-index2[1]+1) 50 | 51 | range=cfg.TRAIN.LABEL_RANGE 52 | for ii in np.arange(0,size): 53 | for jj in np.arange(0,size): 54 | weightxff[0,ii,jj]=(((ii-(index2[1]+index2[3])/2)*range)**2+((jj-(index2[0]+index2[2])/2)*range)**2) 55 | 56 | 57 | se=weightxff[np.where(weightxff<((w//2+h//2)*range/1.5)**2)] 58 | 59 | weightxff[np.where(weightxff<((w//2+h//2)*range/1.5)**2)]=1-((se-se.min())/(se.max()-se.min()+1e-4)) 60 | 61 | weightxff[np.where(weightxff>((w//2+h//2)*range/1.5)**2)]=0 62 | 63 | pos=np.where(weightxff.squeeze()>0.8) 64 | num=len(pos[0]) 65 | pos = self.select(pos, num//4) 66 | weightxff[:,pos[0][0],pos[0][1]] = 1.5 67 | 68 | 69 | 70 | index=np.int32(np.minimum(size-1,np.maximum(0,(target-63)/16))) 71 | w=int(index[2]-index[0]+1) 72 | h=int(index[3]-index[1]+1) 73 | 74 | 75 | 76 | for ii in np.arange(0,size): 77 | for jj in np.arange(0,size): 78 | labelcls2[0,ii,jj]=(((ii-(index[1]+index[3])/2)*range)**2+((jj-(index[0]+index[2])/2)*range)**2) 79 | 80 | 81 | see=labelcls2[np.where(labelcls2<((w//2+h//2)*range/1.2)**2)] 82 | 83 | labelcls2[np.where(labelcls2<((w//2+h//2)*range/1.2)**2)]=1-((see-see.min())/(see.max()-see.min()+1e-4)) 84 | weightcls3=np.zeros((1,size,size)) 85 | weightcls3[np.where(labelcls2<((w//2+h//2)*range/1.2)**2)]=1 86 | labelcls2=labelcls2*weightcls3 87 | 88 | 89 | def con(x): 90 | return (np.exp(x)-np.exp(-x))/(np.exp(x)+np.exp(-x)) 91 | def dcon(x): 92 | return (np.log(1+x)-np.log(1-x))/2 93 | 94 | labelxff[0,:,:]=(pr[:,0]-target[0]).reshape(cfg.TRAIN.OUTPUTFEATURE_SIZE,cfg.TRAIN.OUTPUTFEATURE_SIZE) 95 | labelxff[1,:,:]=(target[2]-pr[:,0]).reshape(cfg.TRAIN.OUTPUTFEATURE_SIZE,cfg.TRAIN.OUTPUTFEATURE_SIZE) 96 | 
labelxff[2,:,:]=(pr[:,1]-target[1]).reshape(cfg.TRAIN.OUTPUTFEATURE_SIZE,cfg.TRAIN.OUTPUTFEATURE_SIZE) 97 | labelxff[3,:,:]=(target[3]-pr[:,1]).reshape(cfg.TRAIN.OUTPUTFEATURE_SIZE,cfg.TRAIN.OUTPUTFEATURE_SIZE) 98 | labelxff=con(labelxff/(cfg.TRAIN.SEARCH_SIZE//2)) 99 | 100 | 101 | 102 | 103 | labelcls1[0,index[1]-h//4:index[3]+1+h//4,index[0]-w//4:index[2]+1+w//4]=-2 104 | labelcls1[0,index[1]+h//4:index[3]+1-h//4,index[0]+w//4:index[2]+1-w//4]=1 105 | 106 | neg2=np.where(labelcls1.squeeze()==-1) 107 | neg2 = self.select(neg2, int(len(np.where(labelcls1==1)[0])*2.5)) 108 | labelcls1[:,neg2[0][0],neg2[0][1]] = 0 109 | 110 | 111 | return labelcls1,labelxff,labelcls2,weightxff 112 | 113 | 114 | -------------------------------------------------------------------------------- /pysot/datasets/augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) SenseTime. All Rights Reserved. 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | import numpy as np 9 | import cv2 10 | 11 | from pysot.utils.bbox import corner2center, \ 12 | Center, center2corner, Corner 13 | 14 | 15 | class Augmentation: 16 | def __init__(self, shift, scale, blur, flip, color): 17 | self.shift = shift 18 | self.scale = scale 19 | self.blur = blur 20 | self.flip = flip 21 | self.color = color 22 | self.rgbVar = np.array( 23 | [[-0.55919361, 0.98062831, - 0.41940627], 24 | [1.72091413, 0.19879334, - 1.82968581], 25 | [4.64467907, 4.73710203, 4.88324118]], dtype=np.float32) 26 | 27 | @staticmethod 28 | def random(): 29 | return np.random.random() * 2 - 1.0 30 | 31 | def _crop_roi(self, image, bbox, out_sz, padding=(0, 0, 0)): 32 | bbox = [float(x) for x in bbox] 33 | a = (out_sz-1) / (bbox[2]-bbox[0]) 34 | b = (out_sz-1) / (bbox[3]-bbox[1]) 35 | c = -a * bbox[0] 36 | d = -b * bbox[1] 37 | mapping = np.array([[a, 0, c], 38 | [0, b, d]]).astype(np.float) 39 | crop = cv2.warpAffine(image, mapping, (out_sz, out_sz), 40 | borderMode=cv2.BORDER_CONSTANT, 41 | borderValue=padding) 42 | return crop 43 | 44 | def _blur_aug(self, image): 45 | def rand_kernel(): 46 | sizes = np.arange(5, 46, 2) 47 | size = np.random.choice(sizes) 48 | kernel = np.zeros((size, size)) 49 | c = int(size/2) 50 | wx = np.random.random() 51 | kernel[:, c] += 1. / size * wx 52 | kernel[c, :] += 1. 
/ size * (1-wx) 53 | return kernel 54 | kernel = rand_kernel() 55 | image = cv2.filter2D(image, -1, kernel) 56 | return image 57 | 58 | def _color_aug(self, image): 59 | offset = np.dot(self.rgbVar, np.random.randn(3, 1)) 60 | offset = offset[::-1] # bgr 2 rgb 61 | offset = offset.reshape(3) 62 | image = image - offset 63 | return image 64 | 65 | def _gray_aug(self, image): 66 | grayed = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 67 | image = cv2.cvtColor(grayed, cv2.COLOR_GRAY2BGR) 68 | return image 69 | 70 | def _shift_scale_aug(self, image, bbox, crop_bbox, size): 71 | im_h, im_w = image.shape[:2] 72 | 73 | # adjust crop bounding box 74 | crop_bbox_center = corner2center(crop_bbox) 75 | if self.scale: 76 | scale_x = (1.0 + Augmentation.random() * self.scale) 77 | scale_y = (1.0 + Augmentation.random() * self.scale) 78 | h, w = crop_bbox_center.h, crop_bbox_center.w 79 | scale_x = min(scale_x, float(im_w) / w) 80 | scale_y = min(scale_y, float(im_h) / h) 81 | crop_bbox_center = Center(crop_bbox_center.x, 82 | crop_bbox_center.y, 83 | crop_bbox_center.w * scale_x, 84 | crop_bbox_center.h * scale_y) 85 | 86 | crop_bbox = center2corner(crop_bbox_center) 87 | if self.shift: 88 | sx = Augmentation.random() * self.shift 89 | sy = Augmentation.random() * self.shift 90 | 91 | x1, y1, x2, y2 = crop_bbox 92 | 93 | sx = max(-x1, min(im_w - 1 - x2, sx)) 94 | sy = max(-y1, min(im_h - 1 - y2, sy)) 95 | 96 | crop_bbox = Corner(x1 + sx, y1 + sy, x2 + sx, y2 + sy) 97 | 98 | # adjust target bounding box 99 | x1, y1 = crop_bbox.x1, crop_bbox.y1 100 | bbox = Corner(bbox.x1 - x1, bbox.y1 - y1, 101 | bbox.x2 - x1, bbox.y2 - y1) 102 | 103 | if self.scale: 104 | bbox = Corner(bbox.x1 / scale_x, bbox.y1 / scale_y, 105 | bbox.x2 / scale_x, bbox.y2 / scale_y) 106 | 107 | image = self._crop_roi(image, crop_bbox, size) 108 | return image, bbox 109 | 110 | def _flip_aug(self, image, bbox): 111 | image = cv2.flip(image, 1) 112 | width = image.shape[1] 113 | bbox = Corner(width - 1 - bbox.x2, bbox.y1, 114 | width - 1 - bbox.x1, bbox.y2) 115 | return image, bbox 116 | 117 | def __call__(self, image, bbox, size, gray=False): 118 | shape = image.shape 119 | crop_bbox = center2corner(Center(shape[0]//2, shape[1]//2, 120 | size-1, size-1)) 121 | # gray augmentation 122 | if gray: 123 | image = self._gray_aug(image) 124 | 125 | # shift scale augmentation 126 | image, bbox = self._shift_scale_aug(image, bbox, crop_bbox, size) 127 | 128 | # color augmentation 129 | if self.color > np.random.random(): 130 | image = self._color_aug(image) 131 | 132 | # blur augmentation 133 | if self.blur > np.random.random(): 134 | image = self._blur_aug(image) 135 | 136 | # flip augmentation 137 | if self.flip and self.flip > np.random.random(): 138 | image, bbox = self._flip_aug(image, bbox) 139 | 140 | return image, bbox 141 | -------------------------------------------------------------------------------- /pysot/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vision4robotics/PRL-Track/b89beed235b38f24c93f875276b5221c99f6fb29/pysot/models/__init__.py -------------------------------------------------------------------------------- /pysot/models/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) SenseTime. All Rights Reserved. 
2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | from pysot.models.backbone.alexnet import alexnetlegacy, alexnet 9 | from pysot.models.backbone.mobile_v2 import mobilenetv2 10 | from pysot.models.backbone.resnet_atrous import resnet18, resnet34, resnet50 11 | 12 | BACKBONES = { 13 | "alexnetlegacy": alexnetlegacy, 14 | "mobilenetv2": mobilenetv2, 15 | "resnet18": resnet18, 16 | "resnet34": resnet34, 17 | "resnet50": resnet50, 18 | "alexnet": alexnet, 19 | } 20 | 21 | 22 | def get_backbone(name, **kwargs): 23 | return BACKBONES[name](**kwargs) 24 | -------------------------------------------------------------------------------- /pysot/models/backbone/alexnet.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | 6 | import torch.nn as nn 7 | 8 | 9 | class AlexNetLegacy(nn.Module): 10 | configs = [3, 96, 256, 384, 384, 256] 11 | 12 | def __init__(self, width_mult=1): 13 | configs = list( 14 | map(lambda x: 3 if x == 3 else int(x * width_mult), AlexNet.configs) 15 | ) 16 | super(AlexNetLegacy, self).__init__() 17 | self.features = nn.Sequential( 18 | nn.Conv2d(configs[0], configs[1], kernel_size=11, stride=2), 19 | nn.BatchNorm2d(configs[1]), 20 | nn.MaxPool2d(kernel_size=3, stride=2), 21 | nn.ReLU(inplace=True), 22 | nn.Conv2d(configs[1], configs[2], kernel_size=5), 23 | nn.BatchNorm2d(configs[2]), 24 | nn.MaxPool2d(kernel_size=3, stride=2), 25 | nn.ReLU(inplace=True), 26 | nn.Conv2d(configs[2], configs[3], kernel_size=3), 27 | nn.BatchNorm2d(configs[3]), 28 | nn.ReLU(inplace=True), 29 | nn.Conv2d(configs[3], configs[4], kernel_size=3), 30 | nn.BatchNorm2d(configs[4]), 31 | nn.ReLU(inplace=True), 32 | nn.Conv2d(configs[4], configs[5], kernel_size=3), 33 | nn.BatchNorm2d(configs[5]), 34 | ) 35 | self.feature_size = configs[5] 36 | 37 | def forward(self, x): 38 | x = self.features(x) 39 | return x 40 | 41 | 42 | class AlexNet(nn.Module): 43 | configs = [3, 96, 256, 384, 384, 256] 44 | 45 | def __init__(self, width_mult=1): 46 | configs = list( 47 | map(lambda x: 3 if x == 3 else int(x * width_mult), AlexNet.configs) 48 | ) 49 | super(AlexNet, self).__init__() 50 | 51 | self.layer1 = nn.Sequential( 52 | nn.Conv2d(configs[0], configs[1], kernel_size=11, stride=2), 53 | nn.BatchNorm2d(configs[1]), 54 | nn.MaxPool2d(kernel_size=3, stride=2), 55 | nn.ReLU(inplace=True), 56 | ) 57 | self.layer2 = nn.Sequential( 58 | nn.Conv2d(configs[1], configs[2], kernel_size=5), 59 | nn.BatchNorm2d(configs[2]), 60 | nn.MaxPool2d(kernel_size=3, stride=2), 61 | nn.ReLU(inplace=True), 62 | ) 63 | self.layer3 = nn.Sequential( 64 | nn.Conv2d(configs[2], configs[3], kernel_size=3), 65 | nn.BatchNorm2d(configs[3]), 66 | nn.ReLU(inplace=True), 67 | ) 68 | self.layer4 = nn.Sequential( 69 | nn.Conv2d(configs[3], configs[4], kernel_size=3), 70 | nn.BatchNorm2d(configs[4]), 71 | nn.ReLU(inplace=True), 72 | ) 73 | self.layer5 = nn.Sequential( 74 | nn.Conv2d(configs[4], configs[5], kernel_size=3), 75 | nn.BatchNorm2d(configs[5]), 76 | ) 77 | self.feature_size = configs[5] 78 | 79 | def forward(self, x): 80 | x = self.layer1(x) 81 | x = self.layer2(x) 82 | x = self.layer3(x) 83 | x = self.layer4(x) 84 | x = self.layer5(x) 85 | return x 86 | 87 | 88 | def alexnetlegacy(**kwargs): 89 | return 
AlexNetLegacy(**kwargs) 90 | 91 | 92 | def alexnet(**kwargs): 93 | return AlexNet(**kwargs) 94 | -------------------------------------------------------------------------------- /pysot/models/backbone/mobile_v2.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | def conv_bn(inp, oup, stride, padding=1): 11 | return nn.Sequential( 12 | nn.Conv2d(inp, oup, 3, stride, padding, bias=False), 13 | nn.BatchNorm2d(oup), 14 | nn.ReLU6(inplace=True), 15 | ) 16 | 17 | 18 | def conv_1x1_bn(inp, oup): 19 | return nn.Sequential( 20 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 21 | nn.BatchNorm2d(oup), 22 | nn.ReLU6(inplace=True), 23 | ) 24 | 25 | 26 | class InvertedResidual(nn.Module): 27 | def __init__(self, inp, oup, stride, expand_ratio, dilation=1): 28 | super(InvertedResidual, self).__init__() 29 | self.stride = stride 30 | 31 | self.use_res_connect = self.stride == 1 and inp == oup 32 | 33 | padding = 2 - stride 34 | if dilation > 1: 35 | padding = dilation 36 | 37 | self.conv = nn.Sequential( 38 | # pw 39 | nn.Conv2d(inp, inp * expand_ratio, 1, 1, 0, bias=False), 40 | nn.BatchNorm2d(inp * expand_ratio), 41 | nn.ReLU6(inplace=True), 42 | # dw 43 | nn.Conv2d( 44 | inp * expand_ratio, 45 | inp * expand_ratio, 46 | 3, 47 | stride, 48 | padding, 49 | dilation=dilation, 50 | groups=inp * expand_ratio, 51 | bias=False, 52 | ), 53 | nn.BatchNorm2d(inp * expand_ratio), 54 | nn.ReLU6(inplace=True), 55 | # pw-linear 56 | nn.Conv2d(inp * expand_ratio, oup, 1, 1, 0, bias=False), 57 | nn.BatchNorm2d(oup), 58 | ) 59 | 60 | def forward(self, x): 61 | if self.use_res_connect: 62 | return x + self.conv(x) 63 | else: 64 | return self.conv(x) 65 | 66 | 67 | class MobileNetV2(nn.Sequential): 68 | def __init__(self, width_mult=1.0, used_layers=[3, 5, 7]): 69 | super(MobileNetV2, self).__init__() 70 | 71 | self.interverted_residual_setting = [ 72 | # t, c, n, s 73 | [1, 16, 1, 1, 1], 74 | [6, 24, 2, 2, 1], 75 | [6, 32, 3, 2, 1], 76 | [6, 64, 4, 2, 1], 77 | [6, 96, 3, 1, 1], 78 | [6, 160, 3, 2, 1], 79 | [6, 320, 1, 1, 1], 80 | ] 81 | # 0,2,3,4,6 82 | 83 | self.interverted_residual_setting = [ 84 | # t, c, n, s 85 | [1, 16, 1, 1, 1], 86 | [6, 24, 2, 2, 1], 87 | [6, 32, 3, 2, 1], 88 | [6, 64, 4, 1, 2], 89 | [6, 96, 3, 1, 2], 90 | [6, 160, 3, 1, 4], 91 | [6, 320, 1, 1, 4], 92 | ] 93 | 94 | self.channels = [24, 32, 96, 320] 95 | self.channels = [int(c * width_mult) for c in self.channels] 96 | 97 | input_channel = int(32 * width_mult) 98 | self.last_channel = int(1280 * width_mult) if width_mult > 1.0 else 1280 99 | 100 | self.add_module("layer0", conv_bn(3, input_channel, 2, 0)) 101 | 102 | last_dilation = 1 103 | 104 | self.used_layers = used_layers 105 | 106 | for idx, (t, c, n, s, d) in enumerate( 107 | self.interverted_residual_setting, start=1 108 | ): 109 | output_channel = int(c * width_mult) 110 | 111 | layers = [] 112 | 113 | for i in range(n): 114 | if i == 0: 115 | if d == last_dilation: 116 | dd = d 117 | else: 118 | dd = max(d // 2, 1) 119 | layers.append( 120 | InvertedResidual(input_channel, output_channel, s, t, dd) 121 | ) 122 | else: 123 | layers.append( 124 | InvertedResidual(input_channel, output_channel, 1, t, d) 125 | ) 126 | input_channel = output_channel 127 | 128 | last_dilation = d 129 | 130 | self.add_module("layer%d" % (idx), 
nn.Sequential(*layers)) 131 | 132 | def forward(self, x): 133 | outputs = [] 134 | for idx in range(8): 135 | name = "layer%d" % idx 136 | x = getattr(self, name)(x) 137 | outputs.append(x) 138 | p0, p1, p2, p3, p4 = [outputs[i] for i in [1, 2, 3, 5, 7]] 139 | out = [outputs[i] for i in self.used_layers] 140 | return out 141 | 142 | 143 | def mobilenetv2(**kwargs): 144 | model = MobileNetV2(**kwargs) 145 | return model 146 | 147 | 148 | if __name__ == "__main__": 149 | net = mobilenetv2() 150 | 151 | print(net) 152 | 153 | from torch.autograd import Variable 154 | 155 | tensor = Variable(torch.Tensor(1, 3, 255, 255)).cuda() 156 | 157 | net = net.cuda() 158 | 159 | out = net(tensor) 160 | 161 | for i, p in enumerate(out): 162 | print(i, p.size()) 163 | -------------------------------------------------------------------------------- /pysot/models/backbone/newalexnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch.nn as nn 4 | 5 | 6 | class Bottleneck(nn.Module): 7 | expansion = 4 8 | 9 | def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1): 10 | super(Bottleneck, self).__init__() 11 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 12 | self.bn1 = nn.BatchNorm2d(planes) 13 | padding = 2 - stride 14 | if downsample is not None and dilation > 1: 15 | dilation = dilation // 2 16 | padding = dilation 17 | 18 | assert ( 19 | stride == 1 or dilation == 1 20 | ), "stride and dilation must have one equals to zero at least" 21 | 22 | if dilation > 1: 23 | padding = dilation 24 | self.conv2 = nn.Conv2d( 25 | planes, 26 | planes, 27 | kernel_size=3, 28 | stride=stride, 29 | padding=padding, 30 | bias=False, 31 | dilation=dilation, 32 | ) 33 | self.bn2 = nn.BatchNorm2d(planes) 34 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 35 | self.bn3 = nn.BatchNorm2d(planes * 4) 36 | self.relu = nn.ReLU(inplace=True) 37 | self.downsample = downsample 38 | self.stride = stride 39 | 40 | def forward(self, x): 41 | residual = x 42 | 43 | out = self.conv1(x) 44 | out = self.bn1(out) 45 | out = self.relu(out) 46 | 47 | out = self.conv2(out) 48 | out = self.bn2(out) 49 | out = self.relu(out) 50 | 51 | out = self.conv3(out) 52 | out = self.bn3(out) 53 | 54 | if self.downsample is not None: 55 | residual = self.downsample(x) 56 | 57 | out += residual 58 | 59 | out = self.relu(out) 60 | 61 | return out 62 | 63 | 64 | class ResNet(nn.Module): 65 | def __init__(self, block, layers, used_layers): 66 | self.inplanes = 64 67 | super(ResNet, self).__init__() 68 | self.conv1 = nn.Conv2d( 69 | 3, 64, kernel_size=7, stride=2, padding=0, bias=False # 3 70 | ) 71 | self.bn1 = nn.BatchNorm2d(64) 72 | self.relu = nn.ReLU(inplace=True) 73 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 74 | self.layer1 = self._make_layer(block, 64, layers[0]) 75 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 76 | 77 | self.feature_size = 128 * block.expansion 78 | 79 | layer3 = True if 3 in used_layers else False 80 | layer4 = True if 4 in used_layers else False 81 | 82 | if layer3: 83 | self.layer3 = self._make_layer( 84 | block, 256, layers[2], stride=1, dilation=2 85 | ) # 15x15, 7x7 86 | self.feature_size = (256 + 128) * block.expansion 87 | else: 88 | self.layer3 = lambda x: x # identity 89 | 90 | if layer4: 91 | self.layer4 = self._make_layer( 92 | block, 512, layers[3], stride=1, dilation=4 93 | ) # 7x7, 3x3 94 | self.feature_size = 512 * block.expansion 95 | else: 96 | 
self.layer4 = lambda x: x # identity 97 | 98 | for m in self.modules(): 99 | if isinstance(m, nn.Conv2d): 100 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 101 | m.weight.data.normal_(0, math.sqrt(2.0 / n)) 102 | elif isinstance(m, nn.BatchNorm2d): 103 | m.weight.data.fill_(1) 104 | m.bias.data.zero_() 105 | 106 | def _make_layer(self, block, planes, blocks, stride=1, dilation=1): 107 | downsample = None 108 | dd = dilation 109 | if stride != 1 or self.inplanes != planes * block.expansion: 110 | if stride == 1 and dilation == 1: 111 | downsample = nn.Sequential( 112 | nn.Conv2d( 113 | self.inplanes, 114 | planes * block.expansion, 115 | kernel_size=1, 116 | stride=stride, 117 | bias=False, 118 | ), 119 | nn.BatchNorm2d(planes * block.expansion), 120 | ) 121 | else: 122 | if dilation > 1: 123 | dd = dilation // 2 124 | padding = dd 125 | else: 126 | dd = 1 127 | padding = 0 128 | downsample = nn.Sequential( 129 | nn.Conv2d( 130 | self.inplanes, 131 | planes * block.expansion, 132 | kernel_size=3, 133 | stride=stride, 134 | bias=False, 135 | padding=padding, 136 | dilation=dd, 137 | ), 138 | nn.BatchNorm2d(planes * block.expansion), 139 | ) 140 | 141 | layers = [] 142 | layers.append( 143 | block(self.inplanes, planes, stride, downsample, dilation=dilation) 144 | ) 145 | self.inplanes = planes * block.expansion 146 | for i in range(1, blocks): 147 | layers.append(block(self.inplanes, planes, dilation=dilation)) 148 | 149 | return nn.Sequential(*layers) 150 | 151 | def forward(self, x): 152 | x = self.conv1(x) 153 | x = self.bn1(x) 154 | x_ = self.relu(x) 155 | x = self.maxpool(x_) 156 | 157 | p1 = self.layer1(x) 158 | p2 = self.layer2(p1) 159 | p3 = self.layer3(p2) 160 | p4 = self.layer4(p3) 161 | 162 | return p3, p4 163 | 164 | 165 | def resnet50(): 166 | """Constructs a ResNet-50 model.""" 167 | model = ResNet(Bottleneck, [3, 4, 6, 3], [2, 3, 4]) 168 | return model 169 | 170 | 171 | class AlexNet(nn.Module): 172 | configs = [3, 96, 256, 384, 384, 256] 173 | 174 | def __init__(self, width_mult=1): 175 | configs = list( 176 | map(lambda x: 3 if x == 3 else int(x * width_mult), AlexNet.configs) 177 | ) 178 | super(AlexNet, self).__init__() 179 | self.layer1 = nn.Sequential( 180 | nn.Conv2d(configs[0], configs[1], kernel_size=11, stride=2), 181 | nn.BatchNorm2d(configs[1]), 182 | nn.MaxPool2d(kernel_size=3, stride=2), 183 | nn.ReLU(inplace=True), 184 | ) 185 | self.layer2 = nn.Sequential( 186 | nn.Conv2d(configs[1], configs[2], kernel_size=5), 187 | nn.BatchNorm2d(configs[2]), 188 | nn.MaxPool2d(kernel_size=3, stride=2), 189 | nn.ReLU(inplace=True), 190 | ) 191 | self.layer3 = nn.Sequential( 192 | nn.Conv2d(configs[2], configs[3], kernel_size=3), 193 | nn.BatchNorm2d(configs[3]), 194 | nn.ReLU(inplace=True), 195 | ) 196 | self.layer4 = nn.Sequential( 197 | nn.Conv2d(configs[3], configs[4], kernel_size=3), 198 | nn.BatchNorm2d(configs[4]), 199 | nn.ReLU(inplace=True), 200 | ) 201 | 202 | self.layer5 = nn.Sequential( 203 | nn.Conv2d(configs[4], configs[5], kernel_size=3), 204 | nn.BatchNorm2d(configs[5]), 205 | ) 206 | self.feature_size = configs[5] 207 | for param in self.layer1.parameters(): 208 | param.requires_grad = False 209 | for param in self.layer2.parameters(): 210 | param.requires_grad = False 211 | 212 | def forward(self, x): 213 | x1 = self.layer1(x) 214 | x2 = self.layer2(x1) 215 | x3 = self.layer3(x2) 216 | x4 = self.layer4(x3) 217 | x5 = self.layer5(x4) 218 | return x1, x2, x3, x4, x5 219 | 
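A quick shape check of the backbone above (an illustrative sketch, not part of the repo): this modified AlexNet returns all five stage outputs, its first two stages are frozen, and it runs on CPU with random input at the template/search resolutions used in the config.

```python
# Minimal sketch: inspect the five stage outputs (x1..x5) returned by the
# backbone in pysot/models/backbone/newalexnet.py. 127 and 287 match the
# EXEMPLAR_SIZE / SEARCH_SIZE used elsewhere in the config.
import torch
from pysot.models.backbone.newalexnet import AlexNet

net = AlexNet().eval()   # layer1/layer2 parameters are frozen in __init__
with torch.no_grad():
    for name, size in [("template", 127), ("search", 287)]:
        feats = net(torch.randn(1, 3, size, size))
        print(name, [tuple(f.shape[1:]) for f in feats])
```

The frozen early layers are consistent with `BACKBONE.TRAIN_LAYERS` in the config, which only lists `layer3`-`layer5` for fine-tuning after `BACKBONE.TRAIN_EPOCH`.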
-------------------------------------------------------------------------------- /pysot/models/loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) SenseTime. All Rights Reserved. 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | import torch 9 | from torch import nn 10 | import torch.nn.functional as F 11 | 12 | 13 | def get_cls_loss(pred, label, select): 14 | if len(select.size()) == 0 or select.size() == torch.Size([0]): 15 | return 0 16 | pred = torch.index_select(pred, 0, select) 17 | label = torch.index_select(label, 0, select) 18 | label = label.long() 19 | return F.nll_loss(pred, label) 20 | 21 | 22 | def select_cross_entropy_loss(pred, label): 23 | pred = pred.view(-1, 2) 24 | label = label.view(-1) 25 | pos = label.data.eq(1).nonzero(as_tuple=False).squeeze().cuda() 26 | neg = label.data.eq(0).nonzero(as_tuple=False).squeeze().cuda() 27 | loss_pos = get_cls_loss(pred, label, pos) 28 | loss_neg = get_cls_loss(pred, label, neg) 29 | return loss_pos * 0.5 + loss_neg * 0.5 30 | 31 | 32 | class IOULoss(nn.Module): 33 | def forward(self, pred, target, weight=None): 34 | pred_left = pred[:, :, 0] 35 | pred_top = pred[:, :, 1] 36 | pred_right = pred[:, :, 2] 37 | pred_bottom = pred[:, :, 3] 38 | 39 | target_left = target[:, :, 0] 40 | target_top = target[:, :, 1] 41 | target_right = target[:, :, 2] 42 | target_bottom = target[:, :, 3] 43 | 44 | target_aera = (target_right - target_left) * (target_bottom - target_top) 45 | pred_aera = (pred_right - pred_left) * (pred_bottom - pred_top) 46 | 47 | w_intersect = torch.min(pred_right, target_right) - torch.max( 48 | pred_left, target_left 49 | ) 50 | w_intersect = w_intersect.clamp(min=0) 51 | h_intersect = torch.min(pred_bottom, target_bottom) - torch.max( 52 | pred_top, target_top 53 | ) 54 | h_intersect = h_intersect.clamp(min=0) 55 | area_intersect = w_intersect * h_intersect 56 | area_union = target_aera + pred_aera - area_intersect 57 | ious = ((area_intersect) / (area_union + 1e-6)).clamp(min=0) + 1e-6 58 | 59 | losses = 1 - ious 60 | weight = weight.view(losses.size()) 61 | if weight.sum() > 0: 62 | return (losses * weight).sum() / (weight.sum() + 1e-6) 63 | else: 64 | return (losses * weight).sum() 65 | -------------------------------------------------------------------------------- /pysot/models/model_builder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | 6 | import torch as t 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import numpy as np 10 | 11 | from pysot.core.config import cfg 12 | from pysot.models.utile.utile import PRL 13 | from pysot.models.backbone.newalexnet import AlexNet 14 | from pysot.models.loss import select_cross_entropy_loss, IOULoss 15 | 16 | 17 | class ModelBuilder(nn.Module): 18 | def __init__(self): 19 | super(ModelBuilder, self).__init__() 20 | self.backbone = AlexNet().cuda() 21 | self.grader = PRL(cfg).cuda() 22 | self.cls2loss = nn.BCEWithLogitsLoss() 23 | self.IOULoss = IOULoss() 24 | 25 | def template(self, z): 26 | with t.no_grad(): 27 | zf = self.backbone(z) 28 | self.zf = zf 29 | 30 | def track(self, x): 31 | with t.no_grad(): 32 | 33 | xf = self.backbone(x) 34 | loc, cls1, cls2 = self.grader(xf, 
self.zf) 35 | 36 | return {"cls1": cls1, "cls2": cls2, "loc": loc} 37 | 38 | def log_softmax(self, cls): 39 | b, a2, h, w = cls.size() 40 | cls = cls.view(b, 2, a2 // 2, h, w) 41 | cls = cls.permute(0, 2, 3, 4, 1).contiguous() 42 | cls = F.log_softmax(cls, dim=4) 43 | 44 | return cls 45 | 46 | def getcentercuda(self, mapp): 47 | 48 | def con(x): 49 | return x * 143 50 | 51 | def dcon(x): 52 | x[t.where(x <= -1)] = -0.99 53 | x[t.where(x >= 1)] = 0.99 54 | return (t.log(1 + x) - t.log(1 - x)) / 2 55 | 56 | size = mapp.size()[3] 57 | # location 58 | x = t.Tensor( 59 | np.tile( 60 | (16 * (np.linspace(0, size - 1, size)) + 63) 61 | - cfg.TRAIN.SEARCH_SIZE // 2, 62 | size, 63 | ).reshape(-1) 64 | ).cuda() 65 | y = t.Tensor( 66 | np.tile( 67 | (16 * (np.linspace(0, size - 1, size)) + 63).reshape(-1, 1) 68 | - cfg.TRAIN.SEARCH_SIZE // 2, 69 | size, 70 | ).reshape(-1) 71 | ).cuda() 72 | 73 | shap = dcon(mapp) * 143 74 | 75 | xx = np.int16(np.tile(np.linspace(0, size - 1, size), size).reshape(-1)) 76 | yy = np.int16( 77 | np.tile(np.linspace(0, size - 1, size).reshape(-1, 1), size).reshape(-1) 78 | ) 79 | 80 | w = shap[:, 0, yy, xx] + shap[:, 1, yy, xx] 81 | h = shap[:, 2, yy, xx] + shap[:, 3, yy, xx] 82 | x = x - shap[:, 0, yy, xx] + w / 2 + cfg.TRAIN.SEARCH_SIZE // 2 83 | y = y - shap[:, 2, yy, xx] + h / 2 + cfg.TRAIN.SEARCH_SIZE // 2 84 | 85 | anchor = t.zeros((cfg.TRAIN.BATCH_SIZE // cfg.TRAIN.NUM_GPU, size**2, 4)).cuda() 86 | 87 | anchor[:, :, 0] = x - w / 2 88 | anchor[:, :, 1] = y - h / 2 89 | anchor[:, :, 2] = x + w / 2 90 | anchor[:, :, 3] = y + h / 2 91 | return anchor 92 | 93 | def forward(self, data): 94 | """only used in training""" 95 | 96 | template = data["template"].cuda() 97 | search = data["search"].cuda() 98 | bbox = data["bbox"].cuda() 99 | labelcls1 = data["labelcls1"].cuda() 100 | labelxff = data["labelxff"].cuda() 101 | labelcls2 = data["labelcls2"].cuda() 102 | weightxff = data["weightxff"].cuda() 103 | 104 | zf = self.backbone(template) 105 | xf = self.backbone(search) 106 | loc, cls1, cls2 = self.grader(xf, zf) 107 | 108 | cls1 = self.log_softmax(cls1) 109 | 110 | cls_loss1 = select_cross_entropy_loss(cls1, labelcls1) 111 | cls_loss2 = self.cls2loss(cls2, labelcls2) 112 | 113 | pre_bbox = self.getcentercuda(loc) 114 | bbo = self.getcentercuda(labelxff) 115 | 116 | loc_loss = cfg.TRAIN.w3 * self.IOULoss(pre_bbox, bbo, weightxff) 117 | 118 | cls_loss = cfg.TRAIN.w4 * cls_loss1 + cfg.TRAIN.w5 * cls_loss2 119 | 120 | outputs = {} 121 | outputs["total_loss"] = ( 122 | cfg.TRAIN.LOC_WEIGHT * loc_loss + cfg.TRAIN.CLS_WEIGHT * cls_loss 123 | ) 124 | outputs["cls_loss"] = cls_loss 125 | outputs["loc_loss"] = loc_loss 126 | 127 | return outputs 128 | -------------------------------------------------------------------------------- /pysot/models/utile/coarse_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def lip2d(x, logit, kernel=3, stride=2, padding=1): 7 | """LIP: Local Importance-Based Pooling""" 8 | weight = logit.exp() 9 | return F.avg_pool2d(x * weight, kernel, stride, padding) / ( 10 | F.avg_pool2d(weight, kernel, stride, padding) + 1e-8 11 | ) 12 | 13 | 14 | class SELayer(nn.Module): 15 | def __init__(self, channel, reduction=16): 16 | super(SELayer, self).__init__() 17 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 18 | self.fc = nn.Sequential( 19 | nn.Linear(channel, channel // reduction, bias=False), 20 | nn.ReLU(inplace=True), 21 | 
nn.Linear(channel // reduction, channel, bias=False), 22 | nn.Sigmoid(), 23 | ) 24 | 25 | def init_weights(self): 26 | for m in self.modules(): 27 | if isinstance(m, nn.Conv2d): 28 | init.kaiming_normal_(m.weight, mode="fan_out") 29 | if m.bias is not None: 30 | init.constant_(m.bias, 0) 31 | elif isinstance(m, nn.BatchNorm2d): 32 | init.constant_(m.weight, 1) 33 | init.constant_(m.bias, 0) 34 | elif isinstance(m, nn.Linear): 35 | init.normal_(m.weight, std=0.001) 36 | if m.bias is not None: 37 | init.constant_(m.bias, 0) 38 | 39 | def forward(self, x): 40 | b, c, _, _ = x.size() 41 | y = self.avg_pool(x).view(b, c) 42 | y = self.fc(y).view(b, c, 1, 1) 43 | return x * y.expand_as(x) 44 | 45 | 46 | class AR(nn.Module): 47 | def __init__(self, in_channels1, in_channels2): 48 | super(AR, self).__init__() 49 | self.conv1 = nn.Conv2d(in_channels1, 1, kernel_size=1) 50 | self.conv2 = nn.Conv2d(in_channels2, 1, kernel_size=1) 51 | self.conv_concat = nn.Conv2d(2, 1, kernel_size=1) 52 | self.bn = nn.BatchNorm2d(1) 53 | 54 | self.relu = nn.ReLU(inplace=True) 55 | self.se = SELayer(channel=192) 56 | self.conv = nn.Sequential( 57 | nn.Conv2d(384, 192, kernel_size=3, bias=False, stride=2, padding=1), 58 | nn.BatchNorm2d(192), 59 | nn.ReLU(inplace=True), 60 | ) 61 | 62 | def forward(self, x, g1, g2): 63 | g1_conv = self.conv1(g1) # torch.Size([128, 1, 41, 41]) 64 | g2_conv = self.conv2(g2) # torch.Size([128, 1, 21, 21]) 65 | g1_conv_norm = self.bn(g1_conv) 66 | g1_conv_downsampled = lip2d(g1_conv_norm, logit=g1_conv_norm) 67 | concatenated_features = torch.cat([g1_conv_downsampled, g2_conv], dim=1) 68 | 69 | g12 = self.conv_concat(concatenated_features) 70 | g12 = self.relu(g12) 71 | 72 | gating = torch.mul(x, g12) 73 | 74 | w = x + gating 75 | result = self.conv(w) 76 | result = self.se(result) 77 | 78 | return result 79 | 80 | 81 | class SR(nn.Module): 82 | def __init__(self, x_channels1, t_channels): 83 | super(SR, self).__init__() 84 | self.conv1 = nn.Conv2d(t_channels, 1, kernel_size=1) 85 | self.sigmoid = nn.Sigmoid() 86 | 87 | self.se = SELayer(channel=192) 88 | 89 | self.conv2 = nn.Sequential( 90 | nn.Conv2d(x_channels1, 192, kernel_size=3, bias=False, stride=2, padding=1), 91 | nn.BatchNorm2d(192), 92 | nn.ReLU(inplace=True), 93 | ) 94 | 95 | def forward(self, x, t): 96 | t_upsampled = F.interpolate(t, size=x.size()[2:], mode="nearest") 97 | t = self.conv1(t_upsampled) 98 | special = torch.mul(x, t) 99 | 100 | w = x + special 101 | result = self.conv2(w) 102 | result = self.se(result) 103 | 104 | return result 105 | -------------------------------------------------------------------------------- /pysot/models/utile/pos_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # -------------------------------------------------------- 7 | # Position embedding utils 8 | # -------------------------------------------------------- 9 | 10 | import numpy as np 11 | 12 | import torch 13 | 14 | 15 | # -------------------------------------------------------- 16 | # 2D sine-cosine position embedding 17 | # References: 18 | # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py 19 | # MoCo v3: https://github.com/facebookresearch/moco-v3 20 | # -------------------------------------------------------- 21 | def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): 22 | """ 23 | grid_size: int of the grid height and width 24 | return: 25 | pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) 26 | """ 27 | grid_h = np.arange(grid_size, dtype=np.float32) 28 | grid_w = np.arange(grid_size, dtype=np.float32) 29 | grid = np.meshgrid(grid_w, grid_h) # here w goes first 30 | grid = np.stack(grid, axis=0) 31 | 32 | grid = grid.reshape([2, 1, grid_size, grid_size]) 33 | pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) 34 | if cls_token: 35 | pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) 36 | return pos_embed 37 | 38 | 39 | def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): 40 | assert embed_dim % 2 == 0 41 | 42 | # use half of dimensions to encode grid_h 43 | emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) 44 | emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) 45 | 46 | emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) 47 | return emb 48 | 49 | 50 | def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): 51 | """ 52 | embed_dim: output dimension for each position 53 | pos: a list of positions to be encoded: size (M,) 54 | out: (M, D) 55 | """ 56 | assert embed_dim % 2 == 0 57 | omega = np.arange(embed_dim // 2, dtype=np.float) 58 | omega /= embed_dim / 2.0 59 | omega = 1.0 / 10000**omega # (D/2,) 60 | 61 | pos = pos.reshape(-1) # (M,) 62 | out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product 63 | 64 | emb_sin = np.sin(out) # (M, D/2) 65 | emb_cos = np.cos(out) # (M, D/2) 66 | 67 | emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) 68 | return emb 69 | 70 | 71 | # -------------------------------------------------------- 72 | # Interpolate position embeddings for high-resolution 73 | # References: 74 | # DeiT: https://github.com/facebookresearch/deit 75 | # -------------------------------------------------------- 76 | def interpolate_pos_embed(model, checkpoint_model): 77 | if "pos_embed" in checkpoint_model: 78 | pos_embed_checkpoint = checkpoint_model["pos_embed"] 79 | embedding_size = pos_embed_checkpoint.shape[-1] 80 | num_patches = model.patch_embed.num_patches 81 | num_extra_tokens = model.pos_embed.shape[-2] - num_patches 82 | # height (== width) for the checkpoint position embedding 83 | orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) 84 | # height (== width) for the new position embedding 85 | new_size = int(num_patches**0.5) 86 | # class_token and dist_token are kept unchanged 87 | if orig_size != new_size: 88 | print( 89 | "Position interpolate from %dx%d to %dx%d" 90 | % (orig_size, orig_size, new_size, new_size) 91 | ) 92 | extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] 93 | # only the position tokens are interpolated 94 | pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] 95 | pos_tokens 
= pos_tokens.reshape( 96 | -1, orig_size, orig_size, embedding_size 97 | ).permute(0, 3, 1, 2) 98 | pos_tokens = torch.nn.functional.interpolate( 99 | pos_tokens, 100 | size=(new_size, new_size), 101 | mode="bicubic", 102 | align_corners=False, 103 | ) 104 | pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) 105 | new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) 106 | checkpoint_model["pos_embed"] = new_pos_embed 107 | -------------------------------------------------------------------------------- /pysot/models/utile/tran.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import timm.models.vision_transformer 5 | from timm.models.layers import DropPath, Mlp 6 | 7 | from pysot.models.utile.utils import to_2tuple 8 | from pysot.models.utile.pos_utils import get_2d_sincos_pos_embed 9 | 10 | 11 | class PatchEmbed(nn.Module): 12 | 13 | def __init__( 14 | self, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True 15 | ): 16 | super().__init__() 17 | patch_size = to_2tuple(patch_size) 18 | self.flatten = flatten 19 | 20 | self.proj = nn.Conv2d( 21 | in_chans, embed_dim, kernel_size=patch_size, stride=patch_size 22 | ) 23 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 24 | 25 | def forward(self, x): 26 | x = self.proj(x) 27 | if self.flatten: 28 | x = x.flatten(2).transpose(1, 2).contiguous() # BCHW -> BNC 29 | x = self.norm(x) 30 | return x 31 | 32 | 33 | def split_channels(tensor, num_heads, dim=3): 34 | # Assuming the channels are the last dimension (dim=3 by default) 35 | channel_size = tensor.shape[dim] // num_heads 36 | return [ 37 | tensor[:, :, :, i * channel_size : (i + 1) * channel_size] 38 | for i in range(num_heads) 39 | ] 40 | 41 | 42 | class Attention(nn.Module): 43 | def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0): 44 | super().__init__() 45 | assert dim % num_heads == 0, "dim should be divisible by num_heads" 46 | self.num_heads = num_heads 47 | head_dim = dim // num_heads 48 | self.scale = head_dim**-0.5 49 | 50 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 51 | self.attn_drop = nn.Dropout(attn_drop) 52 | self.proj = nn.Linear(dim, dim) 53 | self.proj_drop = nn.Dropout(proj_drop) 54 | 55 | self.qkv_mem = None 56 | 57 | def forward(self, x): 58 | B, N, C = x.shape 59 | qkv = ( 60 | self.qkv(x) 61 | .reshape(B, N, 3, self.num_heads, C // self.num_heads) 62 | .permute(2, 0, 3, 1, 4) 63 | ) 64 | q, k, v = qkv.unbind(0) 65 | 66 | q3, q4, q5 = torch.split( 67 | q, [q.shape[3] // 3, q.shape[3] // 3, q.shape[3] // 3], dim=3 68 | ) 69 | k3, k4, k5 = torch.split( 70 | k, [k.shape[3] // 3, k.shape[3] // 3, k.shape[3] // 3], dim=3 71 | ) 72 | v3, v4, v5 = torch.split( 73 | v, [v.shape[3] // 3, v.shape[3] // 3, v.shape[3] // 3], dim=3 74 | ) 75 | 76 | # 3 and 4 77 | attn = (q4 @ torch.cat([k3, k4], dim=2).transpose(-2, -1)) * self.scale 78 | attn = attn.softmax(dim=-1) 79 | attn = self.attn_drop(attn) 80 | x3 = (attn @ torch.cat([v3, v4], dim=2)).transpose(1, 2).reshape(B, N, C // 3) 81 | 82 | # 3 and 5 83 | attn = (q5 @ torch.cat([k3, k5], dim=2).transpose(-2, -1)) * self.scale 84 | 85 | attn = attn.softmax(dim=-1) 86 | attn = self.attn_drop(attn) 87 | x4 = (attn @ torch.cat([v3, v5], dim=2)).transpose(1, 2).reshape(B, N, C // 3) 88 | 89 | # 4 and 5 90 | attn = (q5 @ torch.cat([k4, k5], dim=2).transpose(-2, -1)) * self.scale 91 | attn = attn.softmax(dim=-1) 92 | attn = 
self.attn_drop(attn) 93 | x5 = (attn @ torch.cat([v4, v5], dim=2)).transpose(1, 2).reshape(B, N, C // 3) 94 | 95 | x = torch.cat([x3, x4, x5], dim=2) 96 | x = self.proj(x) 97 | x = self.proj_drop(x) 98 | 99 | return x 100 | 101 | 102 | class Block(nn.Module): 103 | def __init__( 104 | self, 105 | dim, 106 | num_heads, 107 | mlp_ratio=4.0, 108 | qkv_bias=False, 109 | drop=0.0, 110 | attn_drop=0.0, 111 | drop_path=0.0, 112 | act_layer=nn.GELU, 113 | norm_layer=nn.LayerNorm, 114 | ): 115 | super().__init__() 116 | self.norm1 = norm_layer(dim * 3) 117 | self.attn = Attention( 118 | dim * 3, 119 | num_heads=num_heads, 120 | qkv_bias=qkv_bias, 121 | attn_drop=attn_drop, 122 | proj_drop=drop, 123 | ) 124 | self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() 125 | 126 | self.norm2 = norm_layer(dim * 3) 127 | mlp_hidden_dim = int(dim * mlp_ratio) 128 | self.mlp = Mlp( 129 | in_features=dim * 3, 130 | hidden_features=mlp_hidden_dim, 131 | act_layer=act_layer, 132 | drop=drop, 133 | ) 134 | self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() 135 | 136 | def forward(self, x): 137 | x = x + self.drop_path1(self.attn(self.norm1(x))) 138 | x = x + self.drop_path2(self.mlp(self.norm2(x))) 139 | return x 140 | 141 | 142 | class VisionTransformer(timm.models.vision_transformer.VisionTransformer): 143 | def __init__( 144 | self, 145 | img_size=256, 146 | patch_size=16, 147 | in_chans=3, 148 | num_classes=1000, 149 | embed_dim=768, 150 | depth=12, 151 | num_heads=12, 152 | mlp_ratio=4.0, 153 | qkv_bias=True, 154 | drop_rate=0.0, 155 | attn_drop_rate=0.0, 156 | drop_path_rate=0.0, 157 | weight_init="", 158 | embed_layer=PatchEmbed, 159 | norm_layer=None, 160 | act_layer=None, 161 | ): 162 | super(VisionTransformer, self).__init__( 163 | img_size=256, 164 | patch_size=patch_size, 165 | in_chans=in_chans, 166 | num_classes=num_classes, 167 | embed_dim=embed_dim, 168 | depth=depth, 169 | num_heads=num_heads, 170 | mlp_ratio=mlp_ratio, 171 | qkv_bias=qkv_bias, 172 | drop_rate=drop_rate, 173 | attn_drop_rate=attn_drop_rate, 174 | drop_path_rate=drop_path_rate, 175 | weight_init=weight_init, 176 | norm_layer=norm_layer, 177 | act_layer=act_layer, 178 | ) 179 | 180 | self.patch_embed = embed_layer( 181 | patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim 182 | ) 183 | dpr = [ 184 | x.item() for x in torch.linspace(0, drop_path_rate, depth) 185 | ] # stochastic depth decay rule 186 | self.blocks = nn.Sequential( 187 | *[ 188 | Block( 189 | dim=embed_dim, 190 | num_heads=num_heads, 191 | mlp_ratio=mlp_ratio, 192 | qkv_bias=qkv_bias, 193 | drop=drop_rate, 194 | attn_drop=attn_drop_rate, 195 | drop_path=dpr[i], 196 | norm_layer=norm_layer, 197 | ) 198 | for i in range(depth) 199 | ] 200 | ) 201 | 202 | self.grid_size = img_size // patch_size 203 | self.num_patches = self.grid_size**2 204 | self.pos_embed1 = nn.Parameter( 205 | torch.zeros(1, self.num_patches, embed_dim), requires_grad=False 206 | ) 207 | 208 | self.pos_embed2 = nn.Parameter( 209 | torch.zeros(1, self.num_patches, embed_dim), requires_grad=False 210 | ) 211 | 212 | self.pos_embed3 = nn.Parameter( 213 | torch.zeros(1, self.num_patches, embed_dim), requires_grad=False 214 | ) 215 | 216 | self.init_pos_embed() 217 | 218 | if weight_init != "skip": 219 | self.init_weights(weight_init) 220 | 221 | def init_pos_embed(self): 222 | # initialize (and freeze) pos_embed by sin-cos embedding 223 | pos_embed1 = get_2d_sincos_pos_embed( 224 | self.pos_embed1.shape[-1], int(self.num_patches**0.5), 
cls_token=False 225 | ) 226 | self.pos_embed1.data.copy_(torch.from_numpy(pos_embed1).float().unsqueeze(0)) 227 | 228 | pos_embed2 = get_2d_sincos_pos_embed( 229 | self.pos_embed2.shape[-1], int(self.num_patches**0.5), cls_token=False 230 | ) 231 | self.pos_embed2.data.copy_(torch.from_numpy(pos_embed2).float().unsqueeze(0)) 232 | 233 | pos_embed3 = get_2d_sincos_pos_embed( 234 | self.pos_embed3.shape[-1], int(self.num_patches**0.5), cls_token=False 235 | ) 236 | self.pos_embed3.data.copy_(torch.from_numpy(pos_embed3).float().unsqueeze(0)) 237 | 238 | def forward(self, x3, x4, x5): 239 | x3 = self.patch_embed(x3) # BCHW-->BNC 240 | x4 = self.patch_embed(x4) 241 | x5 = self.patch_embed(x5) 242 | B, C = x3.size(0), x3.size(-1) 243 | H = W = self.grid_size 244 | 245 | x3 = x3 + self.pos_embed1 246 | x4 = x4 + self.pos_embed2 247 | x5 = x5 + self.pos_embed3 248 | x = torch.cat([x3, x4, x5], dim=2) 249 | x = self.pos_drop(x) 250 | 251 | for blk in self.blocks: 252 | x = blk(x) 253 | 254 | x3, x4, x5 = torch.split(x, x.shape[2] // 3, dim=2) 255 | 256 | x5_2d = x5.transpose(1, 2).reshape(B, C, H, W) 257 | 258 | return x5_2d 259 | -------------------------------------------------------------------------------- /pysot/models/utile/utile.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | from functools import partial 5 | from pysot.models.utile.coarse_module import AR, SR 6 | from pysot.models.utile.tran import VisionTransformer 7 | 8 | 9 | class PRL(nn.Module): 10 | 11 | def __init__(self, cfg): 12 | super(PRL, self).__init__() 13 | 14 | self.ar = AR(96, 256) 15 | self.sr1 = SR(384, 192) 16 | self.sr2 = SR(256, 192) 17 | 18 | channel = 192 19 | 20 | self.convloc = nn.Sequential( 21 | nn.Conv2d(channel, channel, kernel_size=3, stride=1, padding=1), 22 | nn.GroupNorm(cfg.TRAIN.groupchannel, channel), 23 | nn.ReLU(inplace=True), 24 | nn.Conv2d(channel, channel, kernel_size=3, stride=1, padding=1), 25 | nn.GroupNorm(cfg.TRAIN.groupchannel, channel), 26 | nn.ReLU(inplace=True), 27 | nn.Conv2d(channel, channel, kernel_size=3, stride=1, padding=1), 28 | nn.GroupNorm(cfg.TRAIN.groupchannel, channel), 29 | nn.ReLU(inplace=True), 30 | nn.Conv2d(channel, 4, kernel_size=3, stride=1, padding=1), 31 | ) 32 | 33 | self.convcls = nn.Sequential( 34 | nn.Conv2d(channel, channel, kernel_size=3, stride=1, padding=1), 35 | nn.GroupNorm(cfg.TRAIN.groupchannel, channel), 36 | nn.ReLU(inplace=True), 37 | nn.Conv2d(channel, channel, kernel_size=3, stride=1, padding=1), 38 | nn.GroupNorm(cfg.TRAIN.groupchannel, channel), 39 | nn.ReLU(inplace=True), 40 | nn.Conv2d(channel, channel, kernel_size=3, stride=1, padding=1), 41 | nn.GroupNorm(cfg.TRAIN.groupchannel, channel), 42 | nn.ReLU(inplace=True), 43 | ) 44 | 45 | self.vit = VisionTransformer( 46 | img_size=11, 47 | patch_size=1, 48 | in_chans=192, 49 | embed_dim=192, 50 | depth=2, 51 | num_heads=12, 52 | mlp_ratio=4, 53 | qkv_bias=True, 54 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 55 | drop_path_rate=0.1, 56 | ) 57 | 58 | self.cls1 = nn.Conv2d(channel, 2, kernel_size=3, stride=1, padding=1) 59 | self.cls2 = nn.Conv2d(channel, 1, kernel_size=3, stride=1, padding=1) 60 | 61 | def xcorr_depthwise(self, x, kernel): 62 | """depthwise cross correlation""" 63 | batch = kernel.size(0) 64 | channel = kernel.size(1) 65 | x = x.view(1, batch * channel, x.size(2), x.size(3)) 66 | kernel = kernel.view(batch * channel, 1, kernel.size(2), kernel.size(3)) 67 | out = F.conv2d(x, 
kernel, groups=batch * channel) 68 | out = out.view(batch, channel, out.size(2), out.size(3)) 69 | return out 70 | 71 | def forward(self, x, z): 72 | s1 = self.xcorr_depthwise(x[0], z[0]) 73 | s2 = self.xcorr_depthwise(x[1], z[1]) 74 | s3 = self.xcorr_depthwise(x[2], z[2]) 75 | s4 = self.xcorr_depthwise(x[3], z[3]) 76 | s5 = self.xcorr_depthwise(x[4], z[4]) 77 | 78 | res1 = self.ar(s3, s1, s2) 79 | res2 = self.sr1(s4, res1) 80 | res3 = self.sr2(s5, res2) 81 | 82 | res = self.vit(res1, res2, res3) 83 | 84 | loc = self.convloc(res) 85 | acls = self.convcls(res) 86 | 87 | cls1 = self.cls1(acls) 88 | cls2 = self.cls2(acls) 89 | 90 | return loc, cls1, cls2 91 | -------------------------------------------------------------------------------- /pysot/models/utile/utils.py: -------------------------------------------------------------------------------- 1 | import collections.abc 2 | from itertools import repeat 3 | 4 | import torch 5 | 6 | 7 | def _ntuple(n): 8 | def parse(x): 9 | if isinstance(x, collections.abc.Iterable): 10 | return x 11 | return tuple(repeat(x, n)) 12 | 13 | return parse 14 | 15 | 16 | to_2tuple = _ntuple(2) 17 | 18 | 19 | class FrozenBatchNorm2d(torch.nn.Module): 20 | """ 21 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 22 | 23 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 24 | without which any other models than torchvision.models.resnet[18,34,50,101] 25 | produce nans. 26 | """ 27 | 28 | def __init__(self, n): 29 | super(FrozenBatchNorm2d, self).__init__() 30 | self.register_buffer("weight", torch.ones(n)) 31 | self.register_buffer("bias", torch.zeros(n)) 32 | self.register_buffer("running_mean", torch.zeros(n)) 33 | self.register_buffer("running_var", torch.ones(n)) 34 | 35 | def _load_from_state_dict( 36 | self, 37 | state_dict, 38 | prefix, 39 | local_metadata, 40 | strict, 41 | missing_keys, 42 | unexpected_keys, 43 | error_msgs, 44 | ): 45 | num_batches_tracked_key = prefix + "num_batches_tracked" 46 | if num_batches_tracked_key in state_dict: 47 | del state_dict[num_batches_tracked_key] 48 | 49 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 50 | state_dict, 51 | prefix, 52 | local_metadata, 53 | strict, 54 | missing_keys, 55 | unexpected_keys, 56 | error_msgs, 57 | ) 58 | 59 | def forward(self, x): 60 | # move reshapes to the beginning 61 | # to make it fuser-friendly 62 | w = self.weight.reshape(1, -1, 1, 1) 63 | b = self.bias.reshape(1, -1, 1, 1) 64 | rv = self.running_var.reshape(1, -1, 1, 1) 65 | rm = self.running_mean.reshape(1, -1, 1, 1) 66 | eps = 1e-5 67 | scale = w * (rv + eps).rsqrt() # rsqrt(x): 1/sqrt(x), r: reciprocal 68 | bias = b - rm * scale 69 | return x * scale + bias 70 | -------------------------------------------------------------------------------- /pysot/tracker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vision4robotics/PRL-Track/b89beed235b38f24c93f875276b5221c99f6fb29/pysot/tracker/__init__.py -------------------------------------------------------------------------------- /pysot/tracker/base_tracker.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) SenseTime. All Rights Reserved. 
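# Illustrative usage sketch for SiameseTracker.get_subwindow defined below
# (not from the original file; the frame, sizes, and `tracker` instance are
# hypothetical, with `tracker` standing for any SiameseTracker subclass such
# as PRLTrack). get_subwindow crops a square context window of side
# `original_sz` centred at `pos`, pads out-of-frame pixels with the
# per-channel mean `avg_chans`, resizes the patch to `model_sz`, and returns
# a (1, 3, model_sz, model_sz) float tensor (moved to GPU when cfg.CUDA is set):
#
#   import numpy as np
#   frame = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)  # BGR frame
#   avg = np.mean(frame, axis=(0, 1))                                 # channel average
#   patch = tracker.get_subwindow(frame, pos=(320, 240), model_sz=127,
#                                 original_sz=200, avg_chans=avg)
#   # patch.shape == torch.Size([1, 3, 127, 127])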
2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | import cv2 9 | import torch 10 | import numpy as np 11 | 12 | from pysot.core.config import cfg 13 | 14 | 15 | class BaseTracker(object): 16 | """Base tracker of single objec tracking""" 17 | 18 | def init(self, img, bbox): 19 | """ 20 | args: 21 | img(np.ndarray): BGR image 22 | bbox(list): [x, y, width, height] 23 | x, y need to be 0-based 24 | """ 25 | raise NotImplementedError 26 | 27 | def track(self, img): 28 | """ 29 | args: 30 | img(np.ndarray): BGR image 31 | return: 32 | bbox(list):[x, y, width, height] 33 | """ 34 | raise NotImplementedError 35 | 36 | 37 | class SiameseTracker(BaseTracker): 38 | def get_subwindow(self, im, pos, model_sz, original_sz, avg_chans): 39 | """ 40 | args: 41 | im: bgr based image 42 | pos: center position 43 | model_sz: exemplar size 44 | s_z: original size 45 | avg_chans: channel average 46 | """ 47 | if isinstance(pos, float): 48 | pos = [pos, pos] 49 | sz = original_sz 50 | im_sz = im.shape 51 | c = (original_sz + 1) / 2 52 | context_xmin = np.floor(pos[0] - c + 0.5) 53 | context_xmax = context_xmin + sz - 1 54 | context_ymin = np.floor(pos[1] - c + 0.5) 55 | context_ymax = context_ymin + sz - 1 56 | left_pad = int(max(0.0, -context_xmin)) 57 | top_pad = int(max(0.0, -context_ymin)) 58 | right_pad = int(max(0.0, context_xmax - im_sz[1] + 1)) 59 | bottom_pad = int(max(0.0, context_ymax - im_sz[0] + 1)) 60 | 61 | context_xmin = context_xmin + left_pad 62 | context_xmax = context_xmax + left_pad 63 | context_ymin = context_ymin + top_pad 64 | context_ymax = context_ymax + top_pad 65 | 66 | r, c, k = im.shape 67 | if any([top_pad, bottom_pad, left_pad, right_pad]): 68 | size = (r + top_pad + bottom_pad, c + left_pad + right_pad, k) 69 | te_im = np.zeros(size, np.uint8) 70 | te_im[top_pad : top_pad + r, left_pad : left_pad + c, :] = im 71 | if top_pad: 72 | te_im[0:top_pad, left_pad : left_pad + c, :] = avg_chans 73 | if bottom_pad: 74 | te_im[r + top_pad :, left_pad : left_pad + c, :] = avg_chans 75 | if left_pad: 76 | te_im[:, 0:left_pad, :] = avg_chans 77 | if right_pad: 78 | te_im[:, c + left_pad :, :] = avg_chans 79 | im_patch = te_im[ 80 | int(context_ymin) : int(context_ymax + 1), 81 | int(context_xmin) : int(context_xmax + 1), 82 | :, 83 | ] 84 | else: 85 | im_patch = im[ 86 | int(context_ymin) : int(context_ymax + 1), 87 | int(context_xmin) : int(context_xmax + 1), 88 | :, 89 | ] 90 | 91 | if not np.array_equal(model_sz, original_sz): 92 | im_patch = cv2.resize(im_patch, (model_sz, model_sz)) 93 | im_patch = im_patch.transpose(2, 0, 1) 94 | im_patch = im_patch[np.newaxis, :, :, :] 95 | im_patch = im_patch.astype(np.float32) 96 | im_patch = torch.from_numpy(im_patch) 97 | if cfg.CUDA: 98 | im_patch = im_patch.cuda() 99 | return im_patch 100 | -------------------------------------------------------------------------------- /pysot/tracker/prl_tracker.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | 6 | import numpy as np 7 | import torch.nn.functional as F 8 | 9 | from pysot.core.config import cfg 10 | from pysot.tracker.base_tracker import SiameseTracker 11 | 12 | 13 | class PRLTrack(SiameseTracker): 14 | def __init__(self, model): 15 | super(PRLTrack, self).__init__() 16 
| 17 | self.score_size = cfg.TRAIN.OUTPUT_SIZE 18 | self.anchor_num = 1 19 | hanning = np.hanning(self.score_size) 20 | window = np.outer(hanning, hanning) 21 | self.window = np.tile(window.flatten(), self.anchor_num) 22 | self.model = model 23 | self.model.eval() 24 | 25 | def generate_anchor(self, mapp): 26 | def dcon(x): 27 | x[np.where(x <= -1)] = -0.99 28 | x[np.where(x >= 1)] = 0.99 29 | return (np.log(1 + x) - np.log(1 - x)) / 2 30 | 31 | size = cfg.TRAIN.OUTPUT_SIZE 32 | x = np.tile( 33 | (cfg.ANCHOR.STRIDE * (np.linspace(0, size - 1, size)) + 63) 34 | - cfg.TRAIN.SEARCH_SIZE // 2, 35 | size, 36 | ).reshape(-1) 37 | y = np.tile( 38 | (cfg.ANCHOR.STRIDE * (np.linspace(0, size - 1, size)) + 63).reshape(-1, 1) 39 | - cfg.TRAIN.SEARCH_SIZE // 2, 40 | size, 41 | ).reshape(-1) 42 | shap = (dcon(mapp[0].cpu().detach().numpy())) * 143 43 | xx = np.int16(np.tile(np.linspace(0, size - 1, size), size).reshape(-1)) 44 | yy = np.int16( 45 | np.tile(np.linspace(0, size - 1, size).reshape(-1, 1), size).reshape(-1) 46 | ) 47 | w = shap[0, yy, xx] + shap[1, yy, xx] 48 | h = shap[2, yy, xx] + shap[3, yy, xx] 49 | x = x - shap[0, yy, xx] + w / 2 50 | y = y - shap[2, yy, xx] + h / 2 51 | 52 | anchor = np.zeros((size**2, 4)) 53 | 54 | anchor[:, 0] = x 55 | anchor[:, 1] = y 56 | anchor[:, 2] = w 57 | anchor[:, 3] = h 58 | return anchor 59 | 60 | def _convert_bbox(self, delta, anchor): 61 | delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1) 62 | delta = delta.data.cpu().numpy() 63 | 64 | delta[0, :] = delta[0, :] * anchor[:, 2] + anchor[:, 0] 65 | delta[1, :] = delta[1, :] * anchor[:, 3] + anchor[:, 1] 66 | delta[2, :] = np.exp(delta[2, :]) * anchor[:, 2] 67 | delta[3, :] = np.exp(delta[3, :]) * anchor[:, 3] 68 | return delta 69 | 70 | def _convert_score(self, score): 71 | score = score.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0) 72 | score = F.softmax(score, dim=1).data[:, 1].cpu().numpy() 73 | return score 74 | 75 | def _bbox_clip(self, cx, cy, width, height, boundary): 76 | cx = max(0, min(cx, boundary[1])) 77 | cy = max(0, min(cy, boundary[0])) 78 | width = max(10, min(width, boundary[1])) 79 | height = max(10, min(height, boundary[0])) 80 | return cx, cy, width, height 81 | 82 | def init(self, img, bbox): 83 | """ 84 | args: 85 | img(np.ndarray): BGR image 86 | bbox: (x, y, w, h) bbox 87 | """ 88 | self.image = img 89 | 90 | self.center_pos = np.array( 91 | [bbox[0] + (bbox[2] - 1) / 2, bbox[1] + (bbox[3] - 1) / 2] 92 | ) 93 | 94 | self.size = np.array([bbox[2], bbox[3]]) 95 | self.firstbbox = np.concatenate((self.center_pos, self.size)) 96 | # calculate z crop size 97 | w_z = self.size[0] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size) 98 | h_z = self.size[1] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size) 99 | s_z = round(np.sqrt(w_z * h_z)) 100 | self.scaleaa = s_z 101 | 102 | # calculate channle average 103 | self.channel_average = np.mean(img, axis=(0, 1)) 104 | 105 | # get crop 106 | z_crop = self.get_subwindow( 107 | img, self.center_pos, cfg.TRACK.EXEMPLAR_SIZE, s_z, self.channel_average 108 | ) 109 | self.template = z_crop 110 | 111 | self.model.template(z_crop) 112 | 113 | def con(self, x): 114 | return x * (cfg.TRAIN.SEARCH_SIZE // 2) 115 | 116 | def track(self, img): 117 | """ 118 | args: 119 | img(np.ndarray): BGR image 120 | return: 121 | bbox(list):[x, y, width, height] 122 | """ 123 | 124 | w_z = self.size[0] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size) 125 | h_z = self.size[1] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size) 126 | s_z = np.sqrt(w_z * h_z) 127 | 
if self.size[0] * self.size[1] > 0.5 * img.shape[0] * img.shape[1]: 128 | s_z = self.scaleaa 129 | scale_z = cfg.TRAIN.EXEMPLAR_SIZE / s_z 130 | 131 | s_x = s_z * (cfg.TRACK.INSTANCE_SIZE / cfg.TRACK.EXEMPLAR_SIZE) 132 | 133 | x_crop = self.get_subwindow( 134 | img, 135 | self.center_pos, 136 | cfg.TRACK.INSTANCE_SIZE, 137 | round(s_x), 138 | self.channel_average, 139 | ) 140 | 141 | outputs = self.model.track(x_crop) 142 | pred_bbox = self.generate_anchor(outputs["loc"]).transpose() 143 | score1 = self._convert_score(outputs["cls1"]) * cfg.TRACK.w2 144 | score2 = (outputs["cls2"]).view(-1).cpu().detach().numpy() * cfg.TRACK.w3 145 | score = (score1 + score2) / 2 146 | 147 | def change(r): 148 | 149 | return np.maximum(r, 1.0 / (r + 1e-5)) 150 | 151 | def sz(w, h): 152 | pad = (w + h) * 0.5 153 | return np.sqrt((w + pad) * (h + pad)) 154 | 155 | # scale penalty 156 | s_c = change( 157 | sz(pred_bbox[2, :], pred_bbox[3, :]) 158 | / (sz(self.size[0] * scale_z, self.size[1] * scale_z)) 159 | ) 160 | 161 | # aspect ratio penalty 162 | r_c = change( 163 | (self.size[0] / (self.size[1] + 1e-5)) 164 | / (pred_bbox[2, :] / (pred_bbox[3, :] + 1e-5)) 165 | ) 166 | penalty = np.exp(-(r_c * s_c - 1) * cfg.TRACK.PENALTY_K) 167 | pscore = penalty * score 168 | 169 | # window penalty 170 | pscore = ( 171 | pscore * (1 - cfg.TRACK.WINDOW_INFLUENCE) 172 | + self.window * cfg.TRACK.WINDOW_INFLUENCE 173 | ) 174 | best_idx = np.argmax(pscore) 175 | 176 | bbox = pred_bbox[:, best_idx] / scale_z 177 | 178 | lr = penalty[best_idx] * score[best_idx] * cfg.TRACK.LR 179 | 180 | cx = bbox[0] + self.center_pos[0] 181 | cy = bbox[1] + self.center_pos[1] 182 | 183 | # smooth bbox 184 | width = self.size[0] * (1 - lr) + bbox[2] * lr 185 | height = self.size[1] * (1 - lr) + bbox[3] * lr 186 | 187 | # clip boundary 188 | cx, cy, width, height = self._bbox_clip(cx, cy, width, height, img.shape[:2]) 189 | 190 | # udpate state 191 | self.center_pos = np.array([cx, cy]) 192 | self.size = np.array([width, height]) 193 | 194 | bbox = [cx - width / 2, cy - height / 2, width, height] 195 | 196 | best_score = score[best_idx] 197 | 198 | return { 199 | "bbox": bbox, 200 | "best_score": best_score, 201 | } 202 | -------------------------------------------------------------------------------- /pysot/tracker/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | import numpy as np 4 | 5 | 6 | def visualize_cam(mask, img): 7 | """Make heatmap from mask and synthesize GradCAM result image using heatmap and img. 8 | Args: 9 | mask (torch.tensor): mask shape of (1, 1, H, W) and each element has value in range [0, 1] 10 | img (torch.tensor): img shape of (1, 3, H, W) and each pixel value is in range [0, 1] 11 | 12 | Return: 13 | heatmap (torch.tensor): heatmap img shape of (3, H, W) 14 | result (torch.tensor): synthesized GradCAM result of same shape with heatmap. 
15 | """ 16 | if img.max() > 1: 17 | img = img.div(255) 18 | mask = mask.cpu() 19 | heatmap = cv2.applyColorMap(np.uint8(255 * mask.squeeze()), cv2.COLORMAP_JET) 20 | heatmap = torch.from_numpy(heatmap).permute(2, 0, 1).float().div(255) 21 | b, g, r = heatmap.split(1) 22 | heatmap = torch.cat([r, g, b]) 23 | 24 | result = heatmap + img.cpu() 25 | result = result.div(result.max()).squeeze() 26 | 27 | return heatmap, result 28 | 29 | 30 | def find_resnet_layer(arch, target_layer_name): 31 | """Find resnet layer to calculate GradCAM and GradCAM++ 32 | 33 | Args: 34 | arch: default torchvision densenet models 35 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 36 | target_layer_name = 'conv1' 37 | target_layer_name = 'layer1' 38 | target_layer_name = 'layer1_basicblock0' 39 | target_layer_name = 'layer1_basicblock0_relu' 40 | target_layer_name = 'layer1_bottleneck0' 41 | target_layer_name = 'layer1_bottleneck0_conv1' 42 | target_layer_name = 'layer1_bottleneck0_downsample' 43 | target_layer_name = 'layer1_bottleneck0_downsample_0' 44 | target_layer_name = 'avgpool' 45 | target_layer_name = 'fc' 46 | 47 | Return: 48 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 49 | """ 50 | if "layer" in target_layer_name: 51 | hierarchy = target_layer_name.split("_") 52 | layer_num = int(hierarchy[0].lstrip("layer")) 53 | if layer_num == 1: 54 | target_layer = arch.layer1 55 | elif layer_num == 2: 56 | target_layer = arch.layer2 57 | elif layer_num == 3: 58 | target_layer = arch.layer3 59 | elif layer_num == 4: 60 | target_layer = arch.layer4 61 | else: 62 | raise ValueError("unknown layer : {}".format(target_layer_name)) 63 | 64 | if len(hierarchy) >= 2: 65 | bottleneck_num = int( 66 | hierarchy[1].lower().lstrip("bottleneck").lstrip("basicblock") 67 | ) 68 | target_layer = target_layer[bottleneck_num] 69 | 70 | if len(hierarchy) >= 3: 71 | target_layer = target_layer._modules[hierarchy[2]] 72 | 73 | if len(hierarchy) == 4: 74 | target_layer = target_layer._modules[hierarchy[3]] 75 | 76 | else: 77 | target_layer = arch._modules[target_layer_name] 78 | 79 | return target_layer 80 | 81 | 82 | def find_densenet_layer(arch, target_layer_name): 83 | """Find densenet layer to calculate GradCAM and GradCAM++ 84 | 85 | Args: 86 | arch: default torchvision densenet models 87 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 88 | target_layer_name = 'features' 89 | target_layer_name = 'features_transition1' 90 | target_layer_name = 'features_transition1_norm' 91 | target_layer_name = 'features_denseblock2_denselayer12' 92 | target_layer_name = 'features_denseblock2_denselayer12_norm1' 93 | target_layer_name = 'features_denseblock2_denselayer12_norm1' 94 | target_layer_name = 'classifier' 95 | 96 | Return: 97 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 
98 | """ 99 | 100 | hierarchy = target_layer_name.split("_") 101 | target_layer = arch._modules[hierarchy[0]] 102 | 103 | if len(hierarchy) >= 2: 104 | target_layer = target_layer._modules[hierarchy[1]] 105 | 106 | if len(hierarchy) >= 3: 107 | target_layer = target_layer._modules[hierarchy[2]] 108 | 109 | if len(hierarchy) == 4: 110 | target_layer = target_layer._modules[hierarchy[3]] 111 | 112 | return target_layer 113 | 114 | 115 | def find_vgg_layer(arch, target_layer_name): 116 | """Find vgg layer to calculate GradCAM and GradCAM++ 117 | 118 | Args: 119 | arch: default torchvision densenet models 120 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 121 | target_layer_name = 'features' 122 | target_layer_name = 'features_42' 123 | target_layer_name = 'classifier' 124 | target_layer_name = 'classifier_0' 125 | 126 | Return: 127 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 128 | """ 129 | hierarchy = target_layer_name.split("_") 130 | 131 | if len(hierarchy) >= 1: 132 | target_layer = arch.features 133 | 134 | if len(hierarchy) == 2: 135 | target_layer = target_layer[int(hierarchy[1])] 136 | 137 | return target_layer 138 | 139 | 140 | def find_alexnet_layer(arch, target_layer_name): 141 | """Find alexnet layer to calculate GradCAM and GradCAM++ 142 | 143 | Args: 144 | arch: default torchvision densenet models 145 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 146 | target_layer_name = 'features' 147 | target_layer_name = 'features_0' 148 | target_layer_name = 'classifier' 149 | target_layer_name = 'classifier_0' 150 | 151 | Return: 152 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 153 | """ 154 | hierarchy = target_layer_name.split("_") 155 | 156 | if len(hierarchy) >= 1: 157 | target_layer = arch.features 158 | 159 | if len(hierarchy) == 2: 160 | target_layer = target_layer[int(hierarchy[1])] 161 | 162 | return target_layer 163 | 164 | 165 | def find_squeezenet_layer(arch, target_layer_name): 166 | """Find squeezenet layer to calculate GradCAM and GradCAM++ 167 | 168 | Args: 169 | arch: default torchvision densenet models 170 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 171 | target_layer_name = 'features_12' 172 | target_layer_name = 'features_12_expand3x3' 173 | target_layer_name = 'features_12_expand3x3_activation' 174 | 175 | Return: 176 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 
177 | """ 178 | hierarchy = target_layer_name.split("_") 179 | target_layer = arch._modules[hierarchy[0]] 180 | 181 | if len(hierarchy) >= 2: 182 | target_layer = target_layer._modules[hierarchy[1]] 183 | 184 | if len(hierarchy) == 3: 185 | target_layer = target_layer._modules[hierarchy[2]] 186 | 187 | elif len(hierarchy) == 4: 188 | target_layer = target_layer._modules[hierarchy[2] + "_" + hierarchy[3]] 189 | 190 | return target_layer 191 | 192 | 193 | def denormalize(tensor, mean, std): 194 | if not tensor.ndimension() == 4: 195 | raise TypeError("tensor should be 4D") 196 | 197 | mean = torch.FloatTensor(mean).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device) 198 | std = torch.FloatTensor(std).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device) 199 | 200 | return tensor.mul(std).add(mean) 201 | 202 | 203 | def normalize(tensor, mean, std): 204 | if not tensor.ndimension() == 4: 205 | raise TypeError("tensor should be 4D") 206 | 207 | mean = torch.FloatTensor(mean).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device) 208 | std = torch.FloatTensor(std).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device) 209 | 210 | return tensor.sub(mean).div(std) 211 | 212 | 213 | class Normalize(object): 214 | def __init__(self, mean, std): 215 | self.mean = mean 216 | self.std = std 217 | 218 | def __call__(self, tensor): 219 | return self.do(tensor) 220 | 221 | def do(self, tensor): 222 | return normalize(tensor, self.mean, self.std) 223 | 224 | def undo(self, tensor): 225 | return denormalize(tensor, self.mean, self.std) 226 | 227 | def __repr__(self): 228 | return self.__class__.__name__ + "(mean={0}, std={1})".format( 229 | self.mean, self.std 230 | ) 231 | -------------------------------------------------------------------------------- /pysot/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vision4robotics/PRL-Track/b89beed235b38f24c93f875276b5221c99f6fb29/pysot/utils/__init__.py -------------------------------------------------------------------------------- /pysot/utils/average_meter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) SenseTime. All Rights Reserved. 
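# Illustrative training-loop sketch for the AverageMeter defined below
# (not from the original file; the loop and loss values are placeholders).
# update() accepts arbitrary keywords, keeps the last `num` samples per key,
# and attribute access returns a Meter printed as "current (running average)":
#
#   meter = AverageMeter(num=100)
#   for step in range(1, 201):
#       meter.update(batch_time=0.12, cls_loss=0.5 / step)
#       if step % 50 == 0:
#           print("step {}: {}".format(step, meter))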
2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | 9 | class Meter(object): 10 | def __init__(self, name, val, avg): 11 | self.name = name 12 | self.val = val 13 | self.avg = avg 14 | 15 | def __repr__(self): 16 | return "{name}: {val:.6f} ({avg:.6f})".format( 17 | name=self.name, val=self.val, avg=self.avg 18 | ) 19 | 20 | def __format__(self, *tuples, **kwargs): 21 | return self.__repr__() 22 | 23 | 24 | class AverageMeter: 25 | """Computes and stores the average and current value""" 26 | 27 | def __init__(self, num=100): 28 | self.num = num 29 | self.reset() 30 | 31 | def reset(self): 32 | self.val = {} 33 | self.sum = {} 34 | self.count = {} 35 | self.history = {} 36 | 37 | def update(self, batch=1, **kwargs): 38 | val = {} 39 | for k in kwargs: 40 | val[k] = kwargs[k] / float(batch) 41 | self.val.update(val) 42 | for k in kwargs: 43 | if k not in self.sum: 44 | self.sum[k] = 0 45 | self.count[k] = 0 46 | self.history[k] = [] 47 | self.sum[k] += kwargs[k] 48 | self.count[k] += batch 49 | for _ in range(batch): 50 | self.history[k].append(val[k]) 51 | 52 | if self.num <= 0: 53 | # < 0, average all 54 | self.history[k] = [] 55 | 56 | # == 0: no average 57 | if self.num == 0: 58 | self.sum[k] = self.val[k] 59 | self.count[k] = 1 60 | 61 | elif len(self.history[k]) > self.num: 62 | pop_num = len(self.history[k]) - self.num 63 | for _ in range(pop_num): 64 | self.sum[k] -= self.history[k][0] 65 | del self.history[k][0] 66 | self.count[k] -= 1 67 | 68 | def __repr__(self): 69 | s = "" 70 | for k in self.sum: 71 | s += self.format_str(k) 72 | return s 73 | 74 | def format_str(self, attr): 75 | return "{name}: {val:.6f} ({avg:.6f}) ".format( 76 | name=attr, 77 | val=float(self.val[attr]), 78 | avg=float(self.sum[attr]) / self.count[attr], 79 | ) 80 | 81 | def __getattr__(self, attr): 82 | if attr in self.__dict__: 83 | return super(AverageMeter, self).__getattr__(attr) 84 | if attr not in self.sum: 85 | print("invalid key '{}'".format(attr)) 86 | return Meter(attr, 0, 0) 87 | return Meter(attr, self.val[attr], self.avg(attr)) 88 | 89 | def avg(self, attr): 90 | return float(self.sum[attr]) / self.count[attr] 91 | 92 | 93 | if __name__ == "__main__": 94 | avg1 = AverageMeter(10) 95 | avg2 = AverageMeter(0) 96 | avg3 = AverageMeter(-1) 97 | 98 | for i in range(20): 99 | avg1.update(s=i) 100 | avg2.update(s=i) 101 | avg3.update(s=i) 102 | 103 | print("iter {}".format(i)) 104 | print(avg1.s) 105 | print(avg2.s) 106 | print(avg3.s) 107 | -------------------------------------------------------------------------------- /pysot/utils/bbox.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) SenseTime. All Rights Reserved. 
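# Illustrative worked example for the box utilities defined below
# (not from the original file; the coordinates are arbitrary):
#
#   c = corner2center(Corner(0, 0, 4, 4))   # -> Center(x=2.0, y=2.0, w=4, h=4)
#   r = center2corner(c)                    # -> Corner(0.0, 0.0, 4.0, 4.0)
#   iou = IoU((0, 0, 4, 4), (2, 2, 6, 6))   # overlap 2*2=4, union 16+16-4=28 -> ~0.143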
2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | from collections import namedtuple 9 | 10 | import numpy as np 11 | 12 | 13 | Corner = namedtuple("Corner", "x1 y1 x2 y2") 14 | # alias 15 | BBox = Corner 16 | Center = namedtuple("Center", "x y w h") 17 | 18 | 19 | def corner2center(corner): 20 | """convert (x1, y1, x2, y2) to (cx, cy, w, h) 21 | Args: 22 | conrner: Corner or np.array (4*N) 23 | Return: 24 | Center or np.array (4 * N) 25 | """ 26 | if isinstance(corner, Corner): 27 | x1, y1, x2, y2 = corner 28 | return Center((x1 + x2) * 0.5, (y1 + y2) * 0.5, (x2 - x1), (y2 - y1)) 29 | else: 30 | x1, y1, x2, y2 = corner[0], corner[1], corner[2], corner[3] 31 | x = (x1 + x2) * 0.5 32 | y = (y1 + y2) * 0.5 33 | w = x2 - x1 34 | h = y2 - y1 35 | return x, y, w, h 36 | 37 | 38 | def center2corner(center): 39 | """convert (cx, cy, w, h) to (x1, y1, x2, y2) 40 | Args: 41 | center: Center or np.array (4 * N) 42 | Return: 43 | center or np.array (4 * N) 44 | """ 45 | if isinstance(center, Center): 46 | x, y, w, h = center 47 | return Corner(x - w * 0.5, y - h * 0.5, x + w * 0.5, y + h * 0.5) 48 | else: 49 | x, y, w, h = center[0], center[1], center[2], center[3] 50 | x1 = x - w * 0.5 51 | y1 = y - h * 0.5 52 | x2 = x + w * 0.5 53 | y2 = y + h * 0.5 54 | return x1, y1, x2, y2 55 | 56 | 57 | def IoU(rect1, rect2): 58 | """caculate interection over union 59 | Args: 60 | rect1: (x1, y1, x2, y2) 61 | rect2: (x1, y1, x2, y2) 62 | Returns: 63 | iou 64 | """ 65 | # overlap 66 | x1, y1, x2, y2 = rect1[0], rect1[1], rect1[2], rect1[3] 67 | tx1, ty1, tx2, ty2 = ( 68 | rect2[0], 69 | rect2[1], 70 | rect2[2], 71 | rect2[3], 72 | ) ##rect2[0], rect2[1], rect2[2], rect2[3] 73 | 74 | xx1 = np.maximum(tx1, x1) 75 | yy1 = np.maximum(ty1, y1) 76 | xx2 = np.minimum(tx2, x2) 77 | yy2 = np.minimum(ty2, y2) 78 | 79 | ww = np.maximum(0, xx2 - xx1) 80 | hh = np.maximum(0, yy2 - yy1) 81 | 82 | area = (x2 - x1) * (y2 - y1) 83 | target_a = (tx2 - tx1) * (ty2 - ty1) 84 | inter = ww * hh 85 | iou = inter / (area + target_a - inter) 86 | return iou 87 | 88 | 89 | def cxy_wh_2_rect(pos, sz): 90 | """convert (cx, cy, w, h) to (x1, y1, w, h), 0-index""" 91 | return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]]) 92 | 93 | 94 | def rect_2_cxy_wh(rect): 95 | """convert (x1, y1, w, h) to (cx, cy, w, h), 0-index""" 96 | return np.array([rect[0] + rect[2] / 2, rect[1] + rect[3] / 2]), np.array( 97 | [rect[2], rect[3]] 98 | ) 99 | 100 | 101 | def cxy_wh_2_rect1(pos, sz): 102 | """convert (cx, cy, w, h) to (x1, y1, w, h), 1-index""" 103 | return np.array([pos[0] - sz[0] / 2 + 1, pos[1] - sz[1] / 2 + 1, sz[0], sz[1]]) 104 | 105 | 106 | def rect1_2_cxy_wh(rect): 107 | """convert (x1, y1, w, h) to (cx, cy, w, h), 1-index""" 108 | return np.array([rect[0] + rect[2] / 2 - 1, rect[1] + rect[3] / 2 - 1]), np.array( 109 | [rect[2], rect[3]] 110 | ) 111 | 112 | 113 | def get_axis_aligned_bbox(region): 114 | """convert region to (cx, cy, w, h) that represent by axis aligned box""" 115 | nv = region.size 116 | if nv == 8: 117 | cx = np.mean(region[0::2]) 118 | cy = np.mean(region[1::2]) 119 | x1 = min(region[0::2]) 120 | x2 = max(region[0::2]) 121 | y1 = min(region[1::2]) 122 | y2 = max(region[1::2]) 123 | A1 = np.linalg.norm(region[0:2] - region[2:4]) * np.linalg.norm( 124 | region[2:4] - region[4:6] 125 | ) 126 | A2 = (x2 - x1) * (y2 - y1) 127 | s = np.sqrt(A1 / A2) 128 | w = s * (x2 - x1) + 1 129 | h = s 
* (y2 - y1) + 1 130 | else: 131 | x = region[0] 132 | y = region[1] 133 | w = region[2] 134 | h = region[3] 135 | cx = x + w / 2 136 | cy = y + h / 2 137 | return cx, cy, w, h 138 | 139 | 140 | def get_min_max_bbox(region): 141 | """convert region to (cx, cy, w, h) that represent by mim-max box""" 142 | nv = region.size 143 | if nv == 8: 144 | cx = np.mean(region[0::2]) 145 | cy = np.mean(region[1::2]) 146 | x1 = min(region[0::2]) 147 | x2 = max(region[0::2]) 148 | y1 = min(region[1::2]) 149 | y2 = max(region[1::2]) 150 | w = x2 - x1 151 | h = y2 - y1 152 | else: 153 | x = region[0] 154 | y = region[1] 155 | w = region[2] 156 | h = region[3] 157 | cx = x + w / 2 158 | cy = y + h / 2 159 | return cx, cy, w, h 160 | -------------------------------------------------------------------------------- /pysot/utils/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) SenseTime. All Rights Reserved. 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | import os 9 | import socket 10 | import logging 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.distributed as dist 15 | 16 | from pysot.utils.log_helper import log_once 17 | 18 | logger = logging.getLogger("global") 19 | 20 | 21 | def average_reduce(v): 22 | if get_world_size() == 1: 23 | return v 24 | tensor = torch.cuda.FloatTensor(1) 25 | tensor[0] = v 26 | dist.all_reduce(tensor) 27 | v = tensor[0] / get_world_size() 28 | return v 29 | 30 | 31 | class DistModule(nn.Module): 32 | def __init__(self, module, bn_method=0): 33 | super(DistModule, self).__init__() 34 | self.module = module 35 | self.bn_method = bn_method 36 | if get_world_size() > 1: 37 | broadcast_params(self.module) 38 | else: 39 | self.bn_method = 0 # single proccess 40 | 41 | def forward(self, *args, **kwargs): 42 | broadcast_buffers(self.module, self.bn_method) 43 | return self.module(*args, **kwargs) 44 | 45 | def train(self, mode=True): 46 | super(DistModule, self).train(mode) 47 | self.module.train(mode) 48 | return self 49 | 50 | 51 | def broadcast_params(model): 52 | """broadcast model parameters""" 53 | for p in model.state_dict().values(): 54 | dist.broadcast(p, 0) 55 | 56 | 57 | def broadcast_buffers(model, method=0): 58 | """broadcast model buffers""" 59 | if method == 0: 60 | return 61 | 62 | world_size = get_world_size() 63 | 64 | for b in model._all_buffers(): 65 | if method == 1: # broadcast from main proccess 66 | dist.broadcast(b, 0) 67 | elif method == 2: # average 68 | dist.all_reduce(b) 69 | b /= world_size 70 | else: 71 | raise Exception("Invalid buffer broadcast code {}".format(method)) 72 | 73 | 74 | inited = False 75 | 76 | 77 | def _dist_init(): 78 | """ 79 | if guess right: 80 | ntasks: world_size (process num) 81 | proc_id: rank 82 | """ 83 | # rank = int(os.environ['RANK']) 84 | rank = 0 85 | num_gpus = torch.cuda.device_count() 86 | torch.cuda.set_device(rank % num_gpus) 87 | dist.init_process_group(backend="nccl") 88 | world_size = dist.get_world_size() 89 | return rank, world_size 90 | 91 | 92 | def _get_local_ip(): 93 | try: 94 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 95 | s.connect(("8.8.8.8", 80)) 96 | ip = s.getsockname()[0] 97 | finally: 98 | s.close() 99 | return ip 100 | 101 | 102 | def dist_init(): 103 | global rank, world_size, inited 104 | # try: 105 | # rank, world_size = _dist_init() 106 | # except RuntimeError as e: 107 | 
# if 'public' in e.args[0]: 108 | # logger.info(e) 109 | # logger.info('Warning: use single process') 110 | # rank, world_size = 0, 1 111 | # else: 112 | # raise RuntimeError(*e.args) 113 | rank, world_size = 0, 1 114 | inited = True 115 | return rank, world_size 116 | 117 | 118 | def get_rank(): 119 | if not inited: 120 | raise (Exception("dist not inited")) 121 | return rank 122 | 123 | 124 | def get_world_size(): 125 | if not inited: 126 | raise (Exception("dist not inited")) 127 | return world_size 128 | 129 | 130 | def reduce_gradients(model, _type="sum"): 131 | types = ["sum", "avg"] 132 | assert _type in types, 'gradients method must be in "{}"'.format(types) 133 | log_once("gradients method is {}".format(_type)) 134 | if get_world_size() > 1: 135 | for param in model.parameters(): 136 | if param.requires_grad: 137 | dist.all_reduce(param.grad.data) 138 | if _type == "avg": 139 | param.grad.data /= get_world_size() 140 | else: 141 | return None 142 | -------------------------------------------------------------------------------- /pysot/utils/location_grid.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def compute_locations(features, stride): 5 | h, w = features.size()[-2:] 6 | locations_per_level = compute_locations_per_level(h, w, stride, features.device) 7 | return locations_per_level 8 | 9 | 10 | def compute_locations_per_level(h, w, stride, device): 11 | shifts_x = torch.arange( 12 | 0, w * stride, step=stride, dtype=torch.float32, device=device 13 | ) 14 | shifts_y = torch.arange( 15 | 0, h * stride, step=stride, dtype=torch.float32, device=device 16 | ) 17 | shift_y, shift_x = torch.meshgrid((shifts_y, shifts_x)) 18 | shift_x = shift_x.reshape(-1) 19 | shift_y = shift_y.reshape(-1) 20 | # locations = torch.stack((shift_x, shift_y), dim=1) + stride + 3*stride # (size_z-1)/2*size_z 28 21 | # locations = torch.stack((shift_x, shift_y), dim=1) + stride 22 | locations = torch.stack((shift_x, shift_y), dim=1) + 32 # alex:48 // 32 23 | return locations 24 | -------------------------------------------------------------------------------- /pysot/utils/log_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) SenseTime. All Rights Reserved. 
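# Illustrative usage sketch for the logging helpers defined below
# (not from the original file; the log file path is a placeholder).
# init_log("global") is already called at import time further down; callers
# typically add a file handler and then log through the standard logging API:
#
#   import logging
#   add_file_handler("global", "./train.log")     # mirror console output to a file
#   logger = logging.getLogger("global")
#   logger.info("training started")
#   log_once("printed only once per call site")   # deduplicated by file/line/caller
#   print_speed(100, 0.25, 10000)                 # progress, s/iter and remaining ETA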
2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | import os 9 | import logging 10 | import math 11 | import sys 12 | 13 | 14 | if hasattr(sys, "frozen"): # support for py2exe 15 | _srcfile = "logging%s__init__%s" % (os.sep, __file__[-4:]) 16 | elif __file__[-4:].lower() in [".pyc", ".pyo"]: 17 | _srcfile = __file__[:-4] + ".py" 18 | else: 19 | _srcfile = __file__ 20 | _srcfile = os.path.normcase(_srcfile) 21 | 22 | logs = set() 23 | 24 | 25 | class Filter: 26 | def __init__(self, flag): 27 | self.flag = flag 28 | 29 | def filter(self, x): 30 | return self.flag 31 | 32 | 33 | class Dummy: 34 | def __init__(self, *arg, **kwargs): 35 | pass 36 | 37 | def __getattr__(self, arg): 38 | def dummy(*args, **kwargs): 39 | pass 40 | 41 | return dummy 42 | 43 | 44 | def get_format(logger, level): 45 | if "RANK" in os.environ: 46 | rank = int(os.environ["RANK"]) 47 | 48 | if level == logging.INFO: 49 | logger.addFilter(Filter(rank == 0)) 50 | else: 51 | rank = 0 52 | format_str = "[%(asctime)s-rk{}-%(filename)s#%(lineno)3d] %(message)s".format(rank) 53 | formatter = logging.Formatter(format_str) 54 | return formatter 55 | 56 | 57 | def get_format_custom(logger, level): 58 | if "RANK" in os.environ: 59 | rank = int(os.environ["RANK"]) 60 | if level == logging.INFO: 61 | logger.addFilter(Filter(rank == 0)) 62 | else: 63 | rank = 0 64 | format_str = "[%(asctime)s-rk{}-%(message)s".format(rank) 65 | formatter = logging.Formatter(format_str) 66 | return formatter 67 | 68 | 69 | def init_log(name, level=logging.INFO, format_func=get_format): 70 | if (name, level) in logs: 71 | return 72 | logs.add((name, level)) 73 | logger = logging.getLogger(name) 74 | logger.setLevel(level) 75 | ch = logging.StreamHandler() 76 | ch.setLevel(level) 77 | formatter = format_func(logger, level) 78 | ch.setFormatter(formatter) 79 | logger.addHandler(ch) 80 | return logger 81 | 82 | 83 | def add_file_handler(name, log_file, level=logging.INFO): 84 | logger = logging.getLogger(name) 85 | fh = logging.FileHandler(log_file) 86 | fh.setFormatter(get_format(logger, level)) 87 | logger.addHandler(fh) 88 | 89 | 90 | init_log("global") 91 | 92 | 93 | def print_speed(i, i_time, n): 94 | """print_speed(index, index_time, total_iteration)""" 95 | logger = logging.getLogger("global") 96 | average_time = i_time 97 | remaining_time = (n - i) * average_time 98 | remaining_day = math.floor(remaining_time / 86400) 99 | remaining_hour = math.floor(remaining_time / 3600 - remaining_day * 24) 100 | remaining_min = math.floor( 101 | remaining_time / 60 - remaining_day * 1440 - remaining_hour * 60 102 | ) 103 | logger.info( 104 | "Progress: %d / %d [%d%%], Speed: %.3f s/iter, ETA %d:%02d:%02d (D:H:M)\n" 105 | % ( 106 | i, 107 | n, 108 | i / n * 100, 109 | average_time, 110 | remaining_day, 111 | remaining_hour, 112 | remaining_min, 113 | ) 114 | ) 115 | 116 | 117 | def find_caller(): 118 | def current_frame(): 119 | try: 120 | raise Exception 121 | except: 122 | return sys.exc_info()[2].tb_frame.f_back 123 | 124 | f = current_frame() 125 | if f is not None: 126 | f = f.f_back 127 | rv = "(unknown file)", 0, "(unknown function)" 128 | while hasattr(f, "f_code"): 129 | co = f.f_code 130 | filename = os.path.normcase(co.co_filename) 131 | rv = (co.co_filename, f.f_lineno, co.co_name) 132 | if filename == _srcfile: 133 | f = f.f_back 134 | continue 135 | break 136 | rv = list(rv) 137 | rv[0] = os.path.basename(rv[0]) 138 | 
return rv 139 | 140 | 141 | class LogOnce: 142 | def __init__(self): 143 | self.logged = set() 144 | self.logger = init_log("log_once", format_func=get_format_custom) 145 | 146 | def log(self, strings): 147 | fn, lineno, caller = find_caller() 148 | key = (fn, lineno, caller, strings) 149 | if key in self.logged: 150 | return 151 | self.logged.add(key) 152 | message = "{filename:s}<{caller}>#{lineno:3d}] {strings}".format( 153 | filename=fn, lineno=lineno, strings=strings, caller=caller 154 | ) 155 | self.logger.info(message) 156 | 157 | 158 | once_logger = LogOnce() 159 | 160 | 161 | def log_once(strings): 162 | once_logger.log(strings) 163 | 164 | 165 | def main(): 166 | for i, lvl in enumerate( 167 | [logging.DEBUG, logging.INFO, logging.WARNING, logging.ERROR, logging.CRITICAL] 168 | ): 169 | log_name = str(lvl) 170 | init_log(log_name, lvl) 171 | logger = logging.getLogger(log_name) 172 | print("****cur lvl:{}".format(lvl)) 173 | logger.debug("debug") 174 | logger.info("info") 175 | logger.warning("warning") 176 | logger.error("error") 177 | logger.critical("critiacal") 178 | 179 | 180 | if __name__ == "__main__": 181 | main() 182 | for i in range(10): 183 | log_once("xxx") 184 | -------------------------------------------------------------------------------- /pysot/utils/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) SenseTime. All Rights Reserved. 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | import math 9 | 10 | import numpy as np 11 | from torch.optim.lr_scheduler import _LRScheduler 12 | 13 | from pysot.core.config import cfg 14 | 15 | 16 | class LRScheduler(_LRScheduler): 17 | def __init__(self, optimizer, last_epoch=-1): 18 | if "lr_spaces" not in self.__dict__: 19 | raise Exception('lr_spaces must be set in "LRSchduler"') 20 | super(LRScheduler, self).__init__(optimizer, last_epoch) 21 | 22 | def get_cur_lr(self): 23 | return self.lr_spaces[self.last_epoch] 24 | 25 | def get_lr(self): 26 | epoch = self.last_epoch 27 | return [ 28 | self.lr_spaces[epoch] * pg["initial_lr"] / self.start_lr 29 | for pg in self.optimizer.param_groups 30 | ] 31 | 32 | def __repr__(self): 33 | return "({}) lr spaces: \n{}".format(self.__class__.__name__, self.lr_spaces) 34 | 35 | 36 | class LogScheduler(LRScheduler): 37 | def __init__( 38 | self, optimizer, start_lr=0.03, end_lr=5e-4, epochs=50, last_epoch=-1, **kwargs 39 | ): 40 | self.start_lr = start_lr 41 | self.end_lr = end_lr 42 | self.epochs = epochs 43 | self.lr_spaces = np.logspace(math.log10(start_lr), math.log10(end_lr), epochs) 44 | 45 | super(LogScheduler, self).__init__(optimizer, last_epoch) 46 | 47 | 48 | class StepScheduler(LRScheduler): 49 | def __init__( 50 | self, 51 | optimizer, 52 | start_lr=0.01, 53 | end_lr=None, 54 | step=10, 55 | mult=0.1, 56 | epochs=50, 57 | last_epoch=-1, 58 | **kwargs 59 | ): 60 | if end_lr is not None: 61 | if start_lr is None: 62 | start_lr = end_lr / (mult ** (epochs // step)) 63 | else: # for warm up policy 64 | mult = math.pow(end_lr / start_lr, 1.0 / (epochs // step)) 65 | self.start_lr = start_lr 66 | self.lr_spaces = self.start_lr * (mult ** (np.arange(epochs) // step)) 67 | self.mult = mult 68 | self._step = step 69 | 70 | super(StepScheduler, self).__init__(optimizer, last_epoch) 71 | 72 | 73 | class MultiStepScheduler(LRScheduler): 74 | def __init__( 75 | self, 76 | optimizer, 77 | 
start_lr=0.01, 78 | end_lr=None, 79 | steps=[10, 20, 30, 40], 80 | mult=0.5, 81 | epochs=50, 82 | last_epoch=-1, 83 | **kwargs 84 | ): 85 | if end_lr is not None: 86 | if start_lr is None: 87 | start_lr = end_lr / (mult ** (len(steps))) 88 | else: 89 | mult = math.pow(end_lr / start_lr, 1.0 / len(steps)) 90 | self.start_lr = start_lr 91 | self.lr_spaces = self._build_lr(start_lr, steps, mult, epochs) 92 | self.mult = mult 93 | self.steps = steps 94 | 95 | super(MultiStepScheduler, self).__init__(optimizer, last_epoch) 96 | 97 | def _build_lr(self, start_lr, steps, mult, epochs): 98 | lr = [0] * epochs 99 | lr[0] = start_lr 100 | for i in range(1, epochs): 101 | lr[i] = lr[i - 1] 102 | if i in steps: 103 | lr[i] *= mult 104 | return np.array(lr, dtype=np.float32) 105 | 106 | 107 | class LinearStepScheduler(LRScheduler): 108 | def __init__( 109 | self, optimizer, start_lr=0.01, end_lr=0.005, epochs=50, last_epoch=-1, **kwargs 110 | ): 111 | self.start_lr = start_lr 112 | self.end_lr = end_lr 113 | self.lr_spaces = np.linspace(start_lr, end_lr, epochs) 114 | super(LinearStepScheduler, self).__init__(optimizer, last_epoch) 115 | 116 | 117 | class CosStepScheduler(LRScheduler): 118 | def __init__( 119 | self, optimizer, start_lr=0.01, end_lr=0.005, epochs=50, last_epoch=-1, **kwargs 120 | ): 121 | self.start_lr = start_lr 122 | self.end_lr = end_lr 123 | self.lr_spaces = self._build_lr(start_lr, end_lr, epochs) 124 | 125 | super(CosStepScheduler, self).__init__(optimizer, last_epoch) 126 | 127 | def _build_lr(self, start_lr, end_lr, epochs): 128 | index = np.arange(epochs).astype(np.float32) 129 | lr = end_lr + (start_lr - end_lr) * (1.0 + np.cos(index * np.pi / epochs)) * 0.5 130 | return lr.astype(np.float32) 131 | 132 | 133 | class WarmUPScheduler(LRScheduler): 134 | def __init__(self, optimizer, warmup, normal, epochs=50, last_epoch=-1): 135 | warmup = warmup.lr_spaces # [::-1] 136 | normal = normal.lr_spaces 137 | self.lr_spaces = np.concatenate([warmup, normal]) 138 | self.start_lr = normal[0] 139 | 140 | super(WarmUPScheduler, self).__init__(optimizer, last_epoch) 141 | 142 | 143 | LRs = { 144 | "log": LogScheduler, 145 | "step": StepScheduler, 146 | "multi-step": MultiStepScheduler, 147 | "linear": LinearStepScheduler, 148 | "cos": CosStepScheduler, 149 | } 150 | 151 | 152 | def _build_lr_scheduler(optimizer, config, epochs=50, last_epoch=-1): 153 | return LRs[config.TYPE]( 154 | optimizer, last_epoch=last_epoch, epochs=epochs, **config.KWARGS 155 | ) 156 | 157 | 158 | def _build_warm_up_scheduler(optimizer, epochs=50, last_epoch=-1): 159 | warmup_epoch = cfg.TRAIN.LR_WARMUP.EPOCH 160 | sc1 = _build_lr_scheduler(optimizer, cfg.TRAIN.LR_WARMUP, warmup_epoch, last_epoch) 161 | sc2 = _build_lr_scheduler( 162 | optimizer, cfg.TRAIN.LR, epochs - warmup_epoch, last_epoch 163 | ) 164 | return WarmUPScheduler(optimizer, sc1, sc2, epochs, last_epoch) 165 | 166 | 167 | def build_lr_scheduler(optimizer, epochs=50, last_epoch=-1): 168 | if cfg.TRAIN.LR_WARMUP.WARMUP: 169 | return _build_warm_up_scheduler(optimizer, epochs, last_epoch) 170 | else: 171 | return _build_lr_scheduler(optimizer, cfg.TRAIN.LR, epochs, last_epoch) 172 | 173 | 174 | if __name__ == "__main__": 175 | import torch.nn as nn 176 | from torch.optim import SGD 177 | 178 | class Net(nn.Module): 179 | def __init__(self): 180 | super(Net, self).__init__() 181 | self.conv = nn.Conv2d(10, 10, kernel_size=3) 182 | 183 | net = Net().parameters() 184 | optimizer = SGD(net, lr=0.01) 185 | 186 | # test1 187 | step = {"type": "step", 
"start_lr": 0.01, "step": 10, "mult": 0.1} 188 | lr = build_lr_scheduler(optimizer, step) 189 | print(lr) 190 | 191 | log = { 192 | "type": "log", 193 | "start_lr": 0.03, 194 | "end_lr": 5e-4, 195 | } 196 | lr = build_lr_scheduler(optimizer, log) 197 | 198 | print(lr) 199 | 200 | log = {"type": "multi-step", "start_lr": 0.01, "mult": 0.1, "steps": [10, 15, 20]} 201 | lr = build_lr_scheduler(optimizer, log) 202 | print(lr) 203 | 204 | cos = { 205 | "type": "cos", 206 | "start_lr": 0.01, 207 | "end_lr": 0.0005, 208 | } 209 | lr = build_lr_scheduler(optimizer, cos) 210 | print(lr) 211 | 212 | step = { 213 | "type": "step", 214 | "start_lr": 0.001, 215 | "end_lr": 0.03, 216 | "step": 1, 217 | } 218 | 219 | warmup = log.copy() 220 | warmup["warmup"] = step 221 | warmup["warmup"]["epoch"] = 5 222 | lr = build_lr_scheduler(optimizer, warmup, epochs=55) 223 | print(lr) 224 | 225 | lr.step() 226 | print(lr.last_epoch) 227 | 228 | lr.step(5) 229 | print(lr.last_epoch) 230 | -------------------------------------------------------------------------------- /pysot/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) SenseTime. All Rights Reserved. 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | import os 9 | import numpy as np 10 | import torch 11 | 12 | from colorama import Fore, Style 13 | 14 | 15 | __all__ = ["commit", "describe"] 16 | 17 | 18 | def _exec(cmd): 19 | f = os.popen(cmd, "r", 1) 20 | return f.read().strip() 21 | 22 | 23 | def _bold(s): 24 | return "\033[1m%s\033[0m" % s 25 | 26 | 27 | def _color(s): 28 | # return f'{Fore.RED}{s}{Style.RESET_ALL}' 29 | return "{}{}{}".format(Fore.RED, s, Style.RESET_ALL) 30 | 31 | 32 | def _describe(model, lines=None, spaces=0): 33 | head = " " * spaces 34 | for name, p in model.named_parameters(): 35 | if "." 
in name: 36 | continue 37 | if p.requires_grad: 38 | name = _color(name) 39 | line = "{head}- {name}".format(head=head, name=name) 40 | lines.append(line) 41 | 42 | for name, m in model.named_children(): 43 | space_num = len(name) + spaces + 1 44 | if m.training: 45 | name = _color(name) 46 | line = "{head}.{name} ({type})".format( 47 | head=head, name=name, type=m.__class__.__name__ 48 | ) 49 | lines.append(line) 50 | _describe(m, lines, space_num) 51 | 52 | 53 | def commit(): 54 | root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 55 | cmd = "cd {}; git log | head -n1 | awk '{{print $2}}'".format(root) 56 | commit = _exec(cmd) 57 | cmd = "cd {}; git log --oneline | head -n1".format(root) 58 | commit_log = _exec(cmd) 59 | return "commit : {}\n log : {}".format(commit, commit_log) 60 | 61 | 62 | def describe(net, name=None): 63 | num = 0 64 | lines = [] 65 | if name is not None: 66 | lines.append(name) 67 | num = len(name) 68 | _describe(net, lines, num) 69 | return "\n".join(lines) 70 | 71 | 72 | def bbox_clip(x, min_value, max_value): 73 | new_x = max(min_value, min(x, max_value)) 74 | return new_x 75 | -------------------------------------------------------------------------------- /pysot/utils/model_load.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | 6 | import os 7 | import logging 8 | 9 | import torch 10 | 11 | log_dir = "logs" 12 | os.makedirs(log_dir, exist_ok=True) 13 | script_name = os.path.splitext(os.path.basename(__file__))[0] 14 | log_filename = os.path.join(log_dir, script_name + ".txt") 15 | logging.basicConfig( 16 | filename=log_filename, 17 | level=logging.INFO, 18 | format="%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s", 19 | datefmt="%a, %d %b %Y %H:%M:%S", 20 | ) 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def check_keys(model, pretrained_state_dict): 25 | ckpt_keys = set(pretrained_state_dict.keys()) 26 | model_keys = set(model.state_dict().keys()) 27 | used_pretrained_keys = model_keys & ckpt_keys 28 | unused_pretrained_keys = ckpt_keys - model_keys 29 | missing_keys = model_keys - ckpt_keys 30 | missing_keys = [x for x in missing_keys if not x.endswith("num_batches_tracked")] 31 | 32 | if len(missing_keys) > 0: 33 | logger.warning("[Warning] missing keys: {}".format(missing_keys)) 34 | logger.info("missing keys:{}".format(len(missing_keys))) 35 | if len(unused_pretrained_keys) > 0: 36 | logger.warning( 37 | "[Warning] unused_pretrained_keys: {}".format(unused_pretrained_keys) 38 | ) 39 | logger.info("unused checkpoint keys:{}".format(len(unused_pretrained_keys))) 40 | 41 | logger.info("used keys:{}".format(len(used_pretrained_keys))) 42 | assert len(used_pretrained_keys) > 0, "load NONE from pretrained checkpoint" 43 | return True 44 | 45 | 46 | def remove_prefix(state_dict, prefix): 47 | """Old style model is stored with all names of parameters 48 | share common prefix 'module.'""" 49 | logger.info("remove prefix '{}'".format(prefix)) 50 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 51 | return {f(key): value for key, value in state_dict.items()} 52 | 53 | 54 | def load_pretrain(model, pretrained_path): 55 | logger.info("load pretrained model from {}".format(pretrained_path)) 56 | device = torch.cuda.current_device() 57 | pretrained_dict = torch.load( 58 | pretrained_path, 
map_location=lambda storage, loc: storage.cuda(device) 59 | ) 60 | if "state_dict" in pretrained_dict.keys(): 61 | pretrained_dict = remove_prefix(pretrained_dict["state_dict"], "module.") 62 | else: 63 | pretrained_dict = remove_prefix(pretrained_dict, "module.") 64 | 65 | try: 66 | check_keys(model, pretrained_dict) 67 | except: 68 | logger.info( 69 | '[Warning]: using pretrain as features.\ 70 | Adding "features." as prefix' 71 | ) 72 | new_dict = {} 73 | for k, v in pretrained_dict.items(): 74 | k = "features." + k 75 | new_dict[k] = v 76 | pretrained_dict = new_dict 77 | check_keys(model, pretrained_dict) 78 | model.load_state_dict(pretrained_dict, strict=False) 79 | return model 80 | 81 | 82 | def load_pretrain2(model, pretrained_path): 83 | ckpt = torch.load(pretrained_path, map_location="cpu")["model"] 84 | new_dict = {} 85 | for k, v in ckpt.items(): 86 | if "pos_embed" not in k and "mask_token" not in k: # use fixed pos embed 87 | new_dict[k] = v 88 | model.load_state_dict(new_dict, strict=False) 89 | return model 90 | 91 | 92 | def restore_from(model, optimizer, ckpt_path): 93 | device = torch.cuda.current_device() 94 | ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage.cuda(device)) 95 | print(ckpt) 96 | epoch = ckpt["epoch"] 97 | 98 | ckpt_model_dict = remove_prefix(ckpt["state_dict"], "module.") 99 | check_keys(model, ckpt_model_dict) 100 | model.load_state_dict(ckpt_model_dict, strict=False) 101 | 102 | # check_keys(optimizer, ckpt['optimizer']) 103 | # optimizer.load_state_dict(ckpt['optimizer']) 104 | 105 | # return model, optimizer, epoch 106 | return model, 0, epoch 107 | -------------------------------------------------------------------------------- /pysot/utils/xcorr.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) SenseTime. All Rights Reserved. 
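# Illustrative shape sketch for xcorr_depthwise defined below
# (not from the original file; the tensor sizes are hypothetical).
# Each search-region channel is correlated with the matching template channel
# via a grouped conv2d, so the channel count is preserved and the spatial size
# shrinks by the kernel size minus one:
#
#   import torch
#   x = torch.randn(2, 256, 26, 26)   # search features  (B, C, Hx, Wx)
#   z = torch.randn(2, 256, 6, 6)     # template features (B, C, Hk, Wk)
#   out = xcorr_depthwise(x, z)       # -> (2, 256, 21, 21): (B, C, Hx-Hk+1, Wx-Wk+1)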
2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def xcorr_slow(x, kernel): 13 | """for loop to calculate cross correlation, slow version""" 14 | batch = x.size()[0] 15 | out = [] 16 | for i in range(batch): 17 | px = x[i] 18 | pk = kernel[i] 19 | px = px.view(1, px.size()[0], px.size()[1], px.size()[2]) 20 | pk = pk.view(-1, px.size()[1], pk.size()[1], pk.size()[2]) 21 | po = F.conv2d(px, pk) 22 | out.append(po) 23 | out = torch.cat(out, 0) 24 | return out 25 | 26 | 27 | def xcorr_fast(x, kernel): 28 | """group conv2d to calculate cross correlation, fast version""" 29 | batch = kernel.size()[0] 30 | pk = kernel.view(-1, x.size()[1], kernel.size()[2], kernel.size()[3]) 31 | px = x.view(1, -1, x.size()[2], x.size()[3]) 32 | po = F.conv2d(px, pk, groups=batch) 33 | po = po.view(batch, -1, po.size()[2], po.size()[3]) 34 | return po 35 | 36 | 37 | def xcorr_depthwise(x, kernel): 38 | """depthwise cross correlation""" 39 | batch = kernel.size(0) 40 | channel = kernel.size(1) 41 | x = x.view(1, batch * channel, x.size(2), x.size(3)) 42 | kernel = kernel.view(batch * channel, 1, kernel.size(2), kernel.size(3)) 43 | out = F.conv2d(x, kernel, groups=batch * channel) 44 | out = out.view(batch, channel, out.size(2), out.size(3)) 45 | return out 46 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | colorama==0.4.3 2 | Cython==0.29.21 3 | matplotlib==3.7.2 4 | numpy==1.23.4 5 | opencv_python==4.7.0.68 6 | Pillow==9.4.0 7 | tensorboardX==2.6.2.2 8 | timm==0.4.12 9 | torch==1.13.1+cu116 10 | tqdm==4.65.0 11 | yacs==0.1.8 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from distutils.extension import Extension 3 | from Cython.Build import cythonize 4 | 5 | ext_modules = [ 6 | Extension( 7 | name='toolkit.utils.region', 8 | sources=[ 9 | 'toolkit/utils/region.pyx', 10 | 'toolkit/utils/src/region.c', 11 | ], 12 | include_dirs=[ 13 | 'toolkit/utils/src' 14 | ] 15 | ) 16 | ] 17 | 18 | setup( 19 | name='toolkit', 20 | packages=['toolkit'], 21 | ext_modules=cythonize(ext_modules) 22 | ) 23 | -------------------------------------------------------------------------------- /toolkit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vision4robotics/PRL-Track/b89beed235b38f24c93f875276b5221c99f6fb29/toolkit/__init__.py -------------------------------------------------------------------------------- /toolkit/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .uav import UAVDataset 2 | from .dtb import DTBDataset 3 | from .uav10fps import UAV10Dataset 4 | from .uav20l import UAV20Dataset 5 | from .uav112l import UAV112LDataset 6 | from .uav112 import UAV112Dataset 7 | from .uavdt import UAVDTDataset 8 | from .visdrone import VISDRONEDDataset 9 | 10 | 11 | class DatasetFactory(object): 12 | @staticmethod 13 | def create_dataset(**kwargs): 14 | """ 15 | Args: 16 | name: dataset name 17 | dataset_root: dataset root 18 | load_img: wether to load image 19 | Return: 20 | dataset 21 | """ 22 | 
assert "name" in kwargs, "should provide dataset name" 23 | name = kwargs["name"] 24 | if "DTB70" in name: 25 | dataset = DTBDataset(**kwargs) 26 | elif "UAV123_10fps" in name: 27 | dataset = UAV10Dataset(**kwargs) 28 | elif "UAV123_20L" in name: 29 | dataset = UAV20Dataset(**kwargs) 30 | elif "UAV123" in name: 31 | dataset = UAVDataset(**kwargs) 32 | elif "UAVTrack112_L" in name: 33 | dataset = UAV112LDataset(**kwargs) 34 | elif "UAVTrack112" in name: 35 | dataset = UAV112Dataset(**kwargs) 36 | elif "UAVDT" in name: 37 | dataset = UAVDTDataset(**kwargs) 38 | elif "VISDRONED" in name: 39 | dataset = VISDRONEDDataset(**kwargs) 40 | else: 41 | raise Exception("unknow dataset {}".format(kwargs["name"])) 42 | return dataset 43 | -------------------------------------------------------------------------------- /toolkit/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | 4 | class Dataset(object): 5 | def __init__(self, name, dataset_root): 6 | self.name = name 7 | self.dataset_root = dataset_root 8 | self.videos = None 9 | 10 | def __getitem__(self, idx): 11 | if isinstance(idx, str): 12 | return self.videos[idx] 13 | elif isinstance(idx, int): 14 | return self.videos[sorted(list(self.videos.keys()))[idx]] 15 | 16 | def __len__(self): 17 | return len(self.videos) 18 | 19 | def __iter__(self): 20 | keys = sorted(list(self.videos.keys())) 21 | for key in keys: 22 | yield self.videos[key] 23 | 24 | def set_tracker(self, path, tracker_names): 25 | """ 26 | Args: 27 | path: path to tracker results, 28 | tracker_names: list of tracker name 29 | """ 30 | self.tracker_path = path 31 | self.tracker_names = tracker_names 32 | # for video in tqdm(self.videos.values(), 33 | # desc='loading tacker result', ncols=100): 34 | # video.load_tracker(path, tracker_names) 35 | -------------------------------------------------------------------------------- /toolkit/datasets/dtb.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | 5 | from PIL import Image 6 | from tqdm import tqdm 7 | from glob import glob 8 | 9 | from .dataset import Dataset 10 | from .video import Video 11 | 12 | 13 | class DTBVideo(Video): 14 | """ 15 | Args: 16 | name: video name 17 | root: dataset root 18 | video_dir: video directory 19 | init_rect: init rectangle 20 | img_names: image names 21 | gt_rect: groundtruth rectangle 22 | attr: attribute of video 23 | """ 24 | 25 | def __init__( 26 | self, name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img=False 27 | ): 28 | super(DTBVideo, self).__init__( 29 | name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img 30 | ) 31 | 32 | def load_tracker(self, path, tracker_names=None, store=True): 33 | """ 34 | Args: 35 | path(str): path to result 36 | tracker_name(list): name of tracker 37 | """ 38 | if not tracker_names: 39 | tracker_names = [x.split("/")[-1] for x in glob(path) if os.path.isdir(x)] 40 | if isinstance(tracker_names, str): 41 | tracker_names = [tracker_names] 42 | for name in tracker_names: 43 | traj_file = os.path.join(path, name, self.name + ".txt") 44 | if not os.path.exists(traj_file): 45 | if self.name == "FleetFace": 46 | txt_name = "fleetface.txt" 47 | elif self.name == "Jogging-1": 48 | txt_name = "jogging_1.txt" 49 | elif self.name == "Jogging-2": 50 | txt_name = "jogging_2.txt" 51 | elif self.name == "Skating2-1": 52 | txt_name = "skating2_1.txt" 53 | elif self.name == 
"Skating2-2": 54 | txt_name = "skating2_2.txt" 55 | elif self.name == "FaceOcc1": 56 | txt_name = "faceocc1.txt" 57 | elif self.name == "FaceOcc2": 58 | txt_name = "faceocc2.txt" 59 | elif self.name == "Human4-2": 60 | txt_name = "human4_2.txt" 61 | else: 62 | txt_name = self.name[0].lower() + self.name[1:] + ".txt" 63 | traj_file = os.path.join(path, name, txt_name) 64 | if os.path.exists(traj_file): 65 | with open(traj_file, "r") as f: 66 | pred_traj = [ 67 | list(map(float, x.strip().split(","))) for x in f.readlines() 68 | ] 69 | if len(pred_traj) != len(self.gt_traj): 70 | print(name, len(pred_traj), len(self.gt_traj), self.name) 71 | if store: 72 | self.pred_trajs[name] = pred_traj 73 | else: 74 | return pred_traj 75 | else: 76 | print(traj_file) 77 | self.tracker_names = list(self.pred_trajs.keys()) 78 | 79 | 80 | def ca(): 81 | 82 | path = "./test_dataset/DTB70" 83 | txt = "groundtruth_rect.txt" 84 | name_list = os.listdir(path) 85 | name_list.sort() 86 | a = len(name_list) 87 | b = [] 88 | for i in range(a): 89 | b.append(name_list[i]) 90 | c = [] 91 | 92 | for jj in range(a): 93 | imgs = path + "/" + str(name_list[jj]) + "/img" 94 | txt = path + "/" + str(name_list[jj]) + "/groundtruth_rect.txt" 95 | bbox = [] 96 | f = open(txt) # 返回一个文件对象 97 | file = f.readlines() 98 | li = os.listdir(imgs) 99 | li.sort() 100 | for ii in range(len(file)): 101 | li[ii] = name_list[jj] + "/img/" + li[ii] 102 | 103 | line = file[ii].strip("\n").split(",") 104 | 105 | try: 106 | line[0] = int(line[0]) 107 | except: 108 | line[0] = float(line[0]) 109 | try: 110 | line[1] = int(line[1]) 111 | except: 112 | line[1] = float(line[1]) 113 | try: 114 | line[2] = int(line[2]) 115 | except: 116 | line[2] = float(line[2]) 117 | try: 118 | line[3] = int(line[3]) 119 | except: 120 | line[3] = float(line[3]) 121 | bbox.append(line) 122 | 123 | if len(bbox) != len(li): 124 | print(jj) 125 | f.close() 126 | c.append( 127 | { 128 | "attr": [], 129 | "gt_rect": bbox, 130 | "img_names": li, 131 | "init_rect": bbox[0], 132 | "video_dir": name_list[jj], 133 | } 134 | ) 135 | 136 | d = dict(zip(b, c)) 137 | 138 | return d 139 | 140 | 141 | class DTBDataset(Dataset): 142 | """ 143 | Args: 144 | name: dataset name, should be 'OTB100', 'CVPR13', 'OTB50' 145 | dataset_root: dataset root 146 | load_img: wether to load all imgs 147 | """ 148 | 149 | def __init__(self, name, dataset_root, load_img=False): 150 | super(DTBDataset, self).__init__(name, dataset_root) 151 | # with open(os.path.join(dataset_root, name+'.json'), 'r') as f: 152 | # meta_data = json.load(f) 153 | meta_data = ca() 154 | # load videos 155 | pbar = tqdm(meta_data.keys(), desc="loading " + name, ncols=100) 156 | self.videos = {} 157 | for video in pbar: 158 | pbar.set_postfix_str(video) 159 | self.videos[video] = DTBVideo( 160 | video, 161 | dataset_root, 162 | meta_data[video]["video_dir"], 163 | meta_data[video]["init_rect"], 164 | meta_data[video]["img_names"], 165 | meta_data[video]["gt_rect"], 166 | meta_data[video]["attr"], 167 | load_img, 168 | ) 169 | 170 | # set attr 171 | attr = [] 172 | for x in self.videos.values(): 173 | attr += x.attr 174 | attr = set(attr) 175 | self.attr = {} 176 | self.attr["ALL"] = list(self.videos.keys()) 177 | for x in attr: 178 | self.attr[x] = [] 179 | for k, v in self.videos.items(): 180 | for attr_ in v.attr: 181 | self.attr[attr_].append(k) 182 | -------------------------------------------------------------------------------- /toolkit/datasets/uav.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | from tqdm import tqdm 5 | from glob import glob 6 | 7 | from .dataset import Dataset 8 | from .video import Video 9 | 10 | 11 | class UAVVideo(Video): 12 | """ 13 | Args: 14 | name: video name 15 | root: dataset root 16 | video_dir: video directory 17 | init_rect: init rectangle 18 | img_names: image names 19 | gt_rect: groundtruth rectangle 20 | attr: attribute of video 21 | """ 22 | 23 | def __init__( 24 | self, name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img=False 25 | ): 26 | super(UAVVideo, self).__init__( 27 | name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img 28 | ) 29 | 30 | 31 | class UAVDataset(Dataset): 32 | """ 33 | Args: 34 | name: dataset name, should be 'UAV123', 'UAV20L' 35 | dataset_root: dataset root 36 | load_img: wether to load all imgs 37 | """ 38 | 39 | def __init__(self, name, dataset_root, load_img=False): 40 | super(UAVDataset, self).__init__(name, dataset_root) 41 | with open(os.path.join(dataset_root, name + ".json"), "r") as f: 42 | meta_data = json.load(f) 43 | 44 | # load videos 45 | pbar = tqdm(meta_data.keys(), desc="loading " + name, ncols=100) 46 | self.videos = {} 47 | for video in pbar: 48 | pbar.set_postfix_str(video) 49 | self.videos[video] = UAVVideo( 50 | video, 51 | dataset_root + "/data_seq/UAV123", 52 | meta_data[video]["video_dir"], 53 | meta_data[video]["init_rect"], 54 | meta_data[video]["img_names"], 55 | meta_data[video]["gt_rect"], 56 | meta_data[video]["attr"], 57 | ) 58 | 59 | # set attr 60 | attr = [] 61 | for x in self.videos.values(): 62 | attr += x.attr 63 | attr = set(attr) 64 | self.attr = {} 65 | self.attr["ALL"] = list(self.videos.keys()) 66 | for x in attr: 67 | self.attr[x] = [] 68 | for k, v in self.videos.items(): 69 | for attr_ in v.attr: 70 | self.attr[attr_].append(k) 71 | -------------------------------------------------------------------------------- /toolkit/datasets/uav10fps.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | 5 | from PIL import Image 6 | from tqdm import tqdm 7 | from glob import glob 8 | 9 | from .dataset import Dataset 10 | from .video import Video 11 | 12 | 13 | def ca(): 14 | path = "./test_dataset/UAV123_10fps" 15 | 16 | name_list = os.listdir(path + "/data_seq") 17 | name_list.sort() 18 | a = len(name_list) 19 | b = [] 20 | for i in range(a): 21 | b.append(name_list[i]) 22 | c = [] 23 | 24 | for jj in range(a): 25 | imgs = path + "/data_seq/" + str(name_list[jj]) 26 | txt = path + "/anno/" + str(name_list[jj]) + ".txt" 27 | bbox = [] 28 | f = open(txt) # 返回一个文件对象 29 | file = f.readlines() 30 | li = os.listdir(imgs) 31 | li.sort() 32 | for ii in range(len(file)): 33 | li[ii] = name_list[jj] + "/" + li[ii] 34 | 35 | line = file[ii].strip("\n").split(",") 36 | 37 | try: 38 | line[0] = int(line[0]) 39 | except: 40 | line[0] = float(line[0]) 41 | try: 42 | line[1] = int(line[1]) 43 | except: 44 | line[1] = float(line[1]) 45 | try: 46 | line[2] = int(line[2]) 47 | except: 48 | line[2] = float(line[2]) 49 | try: 50 | line[3] = int(line[3]) 51 | except: 52 | line[3] = float(line[3]) 53 | bbox.append(line) 54 | 55 | if len(bbox) != len(li): 56 | print(jj) 57 | f.close() 58 | c.append( 59 | { 60 | "attr": [], 61 | "gt_rect": bbox, 62 | "img_names": li, 63 | "init_rect": bbox[0], 64 | "video_dir": name_list[jj], 65 | } 66 | ) 67 | 68 | d = dict(zip(b, c)) 69 | 
return d 70 | 71 | 72 | class UAVVideo(Video): 73 | """ 74 | Args: 75 | name: video name 76 | root: dataset root 77 | video_dir: video directory 78 | init_rect: init rectangle 79 | img_names: image names 80 | gt_rect: groundtruth rectangle 81 | attr: attribute of video 82 | """ 83 | 84 | def __init__( 85 | self, name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img=False 86 | ): 87 | super(UAVVideo, self).__init__( 88 | name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img 89 | ) 90 | 91 | 92 | class UAV10Dataset(Dataset): 93 | """ 94 | Args: 95 | name: dataset name, should be 'UAV123', 'UAV20L' 96 | dataset_root: dataset root 97 | load_img: wether to load all imgs 98 | """ 99 | 100 | def __init__(self, name, dataset_root, load_img=False): 101 | super(UAV10Dataset, self).__init__(name, dataset_root) 102 | meta_data = ca() 103 | 104 | # load videos 105 | pbar = tqdm(meta_data.keys(), desc="loading " + name, ncols=100) 106 | self.videos = {} 107 | for video in pbar: 108 | pbar.set_postfix_str(video) 109 | self.videos[video] = UAVVideo( 110 | video, 111 | dataset_root + "/data_seq", # 图片对应的路径 112 | meta_data[video]["video_dir"], 113 | meta_data[video]["init_rect"], 114 | meta_data[video]["img_names"], 115 | meta_data[video]["gt_rect"], 116 | meta_data[video]["attr"], 117 | ) 118 | 119 | # set attr 120 | attr = [] 121 | for x in self.videos.values(): 122 | attr += x.attr 123 | attr = set(attr) 124 | self.attr = {} 125 | self.attr["ALL"] = list(self.videos.keys()) 126 | for x in attr: 127 | self.attr[x] = [] 128 | for k, v in self.videos.items(): 129 | for attr_ in v.attr: 130 | self.attr[attr_].append(k) 131 | -------------------------------------------------------------------------------- /toolkit/datasets/uav112.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from tqdm import tqdm 4 | from glob import glob 5 | 6 | from .dataset import Dataset 7 | from .video import Video 8 | 9 | 10 | def loaddata(dataset_root): 11 | 12 | name_list = os.listdir(dataset_root + "/data_seq") 13 | name_list.sort() 14 | 15 | b = [] 16 | for i in range(len(name_list)): 17 | b.append(name_list[i]) 18 | c = [] 19 | 20 | for jj in range(len(name_list)): 21 | imgs = dataset_root + "/data_seq/" + str(name_list[jj]) 22 | txt = dataset_root + "/anno/" + str(name_list[jj]) + ".txt" 23 | bbox = [] 24 | f = open(txt) 25 | file = f.readlines() 26 | li = os.listdir(imgs) 27 | li.sort() 28 | for ii in range(len(file)): 29 | try: 30 | li[ii] = name_list[jj] + "/" + li[ii] 31 | except: 32 | a = 1 33 | 34 | line = file[ii].strip("\n").split(" ") 35 | 36 | if len(line) != 4: 37 | line = file[ii].strip("\n").split(",") 38 | if len(line) != 4: 39 | line = file[ii].strip("\n").split("\t") 40 | 41 | try: 42 | line[0] = int(line[0]) 43 | except: 44 | line[0] = float(line[0]) 45 | try: 46 | line[1] = int(line[1]) 47 | except: 48 | line[1] = float(line[1]) 49 | try: 50 | line[2] = int(line[2]) 51 | except: 52 | line[2] = float(line[2]) 53 | try: 54 | line[3] = int(line[3]) 55 | except: 56 | line[3] = float(line[3]) 57 | bbox.append(line) 58 | 59 | if len(bbox) != len(li): 60 | print(jj) 61 | f.close() 62 | c.append( 63 | { 64 | "attr": [], 65 | "gt_rect": bbox, 66 | "img_names": li, 67 | "init_rect": bbox[0], 68 | "video_dir": name_list[jj], 69 | } 70 | ) 71 | 72 | d = dict(zip(b, c)) 73 | 74 | return d 75 | 76 | 77 | class UAVVideo(Video): 78 | """ 79 | Args: 80 | name: video name 81 | root: dataset root 82 | video_dir: video directory 83 | 
init_rect: init rectangle 84 | img_names: image names 85 | gt_rect: groundtruth rectangle 86 | attr: attribute of video 87 | """ 88 | 89 | def __init__( 90 | self, name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img=False 91 | ): 92 | super(UAVVideo, self).__init__( 93 | name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img 94 | ) 95 | 96 | 97 | class UAV112Dataset(Dataset): 98 | """ 99 | Args: 100 | name: dataset name, should be 'UAV123', 'UAV20L' 101 | dataset_root: dataset root 102 | load_img: wether to load all imgs 103 | """ 104 | 105 | def __init__(self, name, dataset_root, load_img=False): 106 | super(UAV112Dataset, self).__init__(name, dataset_root) 107 | meta_data = loaddata(dataset_root) 108 | 109 | # load videos 110 | pbar = tqdm(meta_data.keys(), desc="loading " + name, ncols=100) 111 | self.videos = {} 112 | for video in pbar: 113 | pbar.set_postfix_str(video) 114 | self.videos[video] = UAVVideo( 115 | video, 116 | dataset_root + "/data_seq", 117 | meta_data[video]["video_dir"], 118 | meta_data[video]["init_rect"], 119 | meta_data[video]["img_names"], 120 | meta_data[video]["gt_rect"], 121 | meta_data[video]["attr"], 122 | ) 123 | 124 | # set attr 125 | attr = [] 126 | for x in self.videos.values(): 127 | attr += x.attr 128 | attr = set(attr) 129 | self.attr = {} 130 | self.attr["ALL"] = list(self.videos.keys()) 131 | for x in attr: 132 | self.attr[x] = [] 133 | for k, v in self.videos.items(): 134 | for attr_ in v.attr: 135 | self.attr[attr_].append(k) 136 | -------------------------------------------------------------------------------- /toolkit/datasets/uav112l.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | 5 | from PIL import Image 6 | from tqdm import tqdm 7 | from glob import glob 8 | 9 | from .dataset import Dataset 10 | from .video import Video 11 | 12 | 13 | def ca(): 14 | path = "./test_dataset/UAVTrack112_L" 15 | 16 | name_list = os.listdir(path + "/anno_l") 17 | name_list = [i.split(".")[0] for i in name_list] 18 | name_list.sort() 19 | a = len(name_list) 20 | b = [] 21 | for i in range(a): 22 | b.append(name_list[i]) 23 | c = [] 24 | 25 | for jj in range(a): 26 | imgs = path + "/data_seq/" + str(name_list[jj]) 27 | txt = path + "/anno_l/" + str(name_list[jj]) + ".txt" 28 | bbox = [] 29 | f = open(txt) # 返回一个文件对象 30 | file = f.readlines() 31 | li = os.listdir(imgs) 32 | li.sort() 33 | for ii in range(len(file)): 34 | li[ii] = name_list[jj] + "/" + li[ii] 35 | 36 | line = file[ii].strip("\n").split(",") 37 | 38 | try: 39 | line[0] = int(line[0]) 40 | except: 41 | line[0] = float(line[0]) 42 | try: 43 | line[1] = int(line[1]) 44 | except: 45 | line[1] = float(line[1]) 46 | try: 47 | line[2] = int(line[2]) 48 | except: 49 | line[2] = float(line[2]) 50 | try: 51 | line[3] = int(line[3]) 52 | except: 53 | line[3] = float(line[3]) 54 | bbox.append(line) 55 | 56 | if len(bbox) != len(li): 57 | print(jj) 58 | f.close() 59 | c.append( 60 | { 61 | "attr": [], 62 | "gt_rect": bbox, 63 | "img_names": li, 64 | "init_rect": bbox[0], 65 | "video_dir": name_list[jj], 66 | } 67 | ) 68 | 69 | d = dict(zip(b, c)) 70 | return d 71 | 72 | 73 | class UAVVideo(Video): 74 | """ 75 | Args: 76 | name: video name 77 | root: dataset root 78 | video_dir: video directory 79 | init_rect: init rectangle 80 | img_names: image names 81 | gt_rect: groundtruth rectangle 82 | attr: attribute of video 83 | """ 84 | 85 | def __init__( 86 | self, name, root, video_dir, init_rect, 
img_names, gt_rect, attr, load_img=False 87 | ): 88 | super(UAVVideo, self).__init__( 89 | name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img 90 | ) 91 | 92 | 93 | class UAV112LDataset(Dataset): 94 | def __init__(self, name, dataset_root, load_img=False): 95 | super(UAV112LDataset, self).__init__(name, dataset_root) 96 | meta_data = ca() 97 | 98 | # load videos 99 | pbar = tqdm(meta_data.keys(), desc="loading " + name, ncols=100) 100 | self.videos = {} 101 | for video in pbar: 102 | pbar.set_postfix_str(video) 103 | self.videos[video] = UAVVideo( 104 | video, 105 | dataset_root + "/data_seq", # 图片对应的路径 106 | meta_data[video]["video_dir"], 107 | meta_data[video]["init_rect"], 108 | meta_data[video]["img_names"], 109 | meta_data[video]["gt_rect"], 110 | meta_data[video]["attr"], 111 | ) 112 | 113 | # set attr 114 | attr = [] 115 | for x in self.videos.values(): 116 | attr += x.attr 117 | attr = set(attr) 118 | self.attr = {} 119 | self.attr["ALL"] = list(self.videos.keys()) 120 | for x in attr: 121 | self.attr[x] = [] 122 | for k, v in self.videos.items(): 123 | for attr_ in v.attr: 124 | self.attr[attr_].append(k) 125 | -------------------------------------------------------------------------------- /toolkit/datasets/uav20l.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | 5 | from PIL import Image 6 | from tqdm import tqdm 7 | from glob import glob 8 | 9 | from .dataset import Dataset 10 | from .video import Video 11 | 12 | 13 | def ca(): 14 | 15 | path = "./test_dataset/UAV123_20L" 16 | 17 | name_list = os.listdir(path + "/data_seq") 18 | name_list.sort() 19 | 20 | b = [] 21 | for i in range(len(name_list)): 22 | b.append(name_list[i]) 23 | c = [] 24 | 25 | for jj in range(len(name_list)): 26 | imgs = path + "/data_seq/" + str(name_list[jj]) 27 | txt = path + "/anno/" + str(name_list[jj]) + ".txt" 28 | bbox = [] 29 | f = open(txt) # 返回一个文件对象 30 | file = f.readlines() 31 | li = os.listdir(imgs) 32 | li.sort() 33 | for ii in range(len(file)): 34 | li[ii] = name_list[jj] + "/" + li[ii] 35 | 36 | line = file[ii].strip("\n").split(",") 37 | 38 | try: 39 | line[0] = int(line[0]) 40 | except: 41 | line[0] = float(line[0]) 42 | try: 43 | line[1] = int(line[1]) 44 | except: 45 | line[1] = float(line[1]) 46 | try: 47 | line[2] = int(line[2]) 48 | except: 49 | line[2] = float(line[2]) 50 | try: 51 | line[3] = int(line[3]) 52 | except: 53 | line[3] = float(line[3]) 54 | bbox.append(line) 55 | 56 | if len(bbox) != len(li): 57 | print(jj) 58 | f.close() 59 | c.append( 60 | { 61 | "attr": [], 62 | "gt_rect": bbox, 63 | "img_names": li, 64 | "init_rect": bbox[0], 65 | "video_dir": name_list[jj], 66 | } 67 | ) 68 | 69 | d = dict(zip(b, c)) 70 | 71 | return d 72 | 73 | 74 | class UAVVideo(Video): 75 | """ 76 | Args: 77 | name: video name 78 | root: dataset root 79 | video_dir: video directory 80 | init_rect: init rectangle 81 | img_names: image names 82 | gt_rect: groundtruth rectangle 83 | attr: attribute of video 84 | """ 85 | 86 | def __init__( 87 | self, name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img=False 88 | ): 89 | super(UAVVideo, self).__init__( 90 | name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img 91 | ) 92 | 93 | 94 | class UAV20Dataset(Dataset): 95 | """ 96 | Args: 97 | name: dataset name, should be 'UAV123', 'UAV20L' 98 | dataset_root: dataset root 99 | load_img: wether to load all imgs 100 | """ 101 | 102 | def __init__(self, name, 
dataset_root, load_img=False): 103 | super(UAV20Dataset, self).__init__(name, dataset_root) 104 | meta_data = ca() 105 | 106 | # load videos 107 | pbar = tqdm(meta_data.keys(), desc="loading " + name, ncols=100) 108 | self.videos = {} 109 | for video in pbar: 110 | pbar.set_postfix_str(video) 111 | self.videos[video] = UAVVideo( 112 | video, 113 | dataset_root + "/data_seq", 114 | meta_data[video]["video_dir"], 115 | meta_data[video]["init_rect"], 116 | meta_data[video]["img_names"], 117 | meta_data[video]["gt_rect"], 118 | meta_data[video]["attr"], 119 | ) 120 | 121 | # set attr 122 | attr = [] 123 | for x in self.videos.values(): 124 | attr += x.attr 125 | attr = set(attr) 126 | self.attr = {} 127 | self.attr["ALL"] = list(self.videos.keys()) 128 | for x in attr: 129 | self.attr[x] = [] 130 | for k, v in self.videos.items(): 131 | for attr_ in v.attr: 132 | self.attr[attr_].append(k) 133 | -------------------------------------------------------------------------------- /toolkit/datasets/uavdt.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from tqdm import tqdm 4 | from glob import glob 5 | 6 | from .dataset import Dataset 7 | from .video import Video 8 | 9 | 10 | def loaddata(dataset_root): 11 | 12 | name_list = os.listdir(dataset_root + "/data_seq") 13 | name_list.sort() 14 | 15 | b = [] 16 | for i in range(len(name_list)): 17 | b.append(name_list[i]) 18 | c = [] 19 | 20 | for jj in range(len(name_list)): 21 | imgs = dataset_root + "/data_seq/" + str(name_list[jj]) 22 | txt = dataset_root + "/anno/" + str(name_list[jj]) + "_gt.txt" 23 | bbox = [] 24 | f = open(txt) 25 | file = f.readlines() 26 | li = os.listdir(imgs) 27 | li.sort() 28 | for ii in range(len(file)): 29 | li[ii] = name_list[jj] + "/" + li[ii] 30 | 31 | line = file[ii].strip("\n").split(",") 32 | 33 | try: 34 | line[0] = int(line[0]) 35 | except: 36 | line[0] = float(line[0]) 37 | try: 38 | line[1] = int(line[1]) 39 | except: 40 | line[1] = float(line[1]) 41 | try: 42 | line[2] = int(line[2]) 43 | except: 44 | line[2] = float(line[2]) 45 | try: 46 | line[3] = int(line[3]) 47 | except: 48 | line[3] = float(line[3]) 49 | bbox.append(line) 50 | 51 | if len(bbox) != len(li): 52 | print(jj) 53 | f.close() 54 | c.append( 55 | { 56 | "attr": [], 57 | "gt_rect": bbox, 58 | "img_names": li, 59 | "init_rect": bbox[0], 60 | "video_dir": name_list[jj], 61 | } 62 | ) 63 | 64 | d = dict(zip(b, c)) 65 | 66 | return d 67 | 68 | 69 | class UAVVideo(Video): 70 | """ 71 | Args: 72 | name: video name 73 | root: dataset root 74 | video_dir: video directory 75 | init_rect: init rectangle 76 | img_names: image names 77 | gt_rect: groundtruth rectangle 78 | attr: attribute of video 79 | """ 80 | 81 | def __init__( 82 | self, name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img=False 83 | ): 84 | super(UAVVideo, self).__init__( 85 | name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img 86 | ) 87 | 88 | 89 | class UAVDTDataset(Dataset): 90 | """ 91 | Args: 92 | name: dataset name, should be 'UAV123', 'UAV20L' 93 | dataset_root: dataset root 94 | load_img: wether to load all imgs 95 | """ 96 | 97 | def __init__(self, name, dataset_root, load_img=False): 98 | super(UAVDTDataset, self).__init__(name, dataset_root) 99 | meta_data = loaddata(dataset_root) 100 | 101 | # load videos 102 | pbar = tqdm(meta_data.keys(), desc="loading " + name, ncols=100) 103 | self.videos = {} 104 | for video in pbar: 105 | pbar.set_postfix_str(video) 106 | self.videos[video] = 
UAVVideo( 107 | video, 108 | dataset_root + "/data_seq", 109 | meta_data[video]["video_dir"], 110 | meta_data[video]["init_rect"], 111 | meta_data[video]["img_names"], 112 | meta_data[video]["gt_rect"], 113 | meta_data[video]["attr"], 114 | ) 115 | 116 | # set attr 117 | attr = [] 118 | for x in self.videos.values(): 119 | attr += x.attr 120 | attr = set(attr) 121 | self.attr = {} 122 | self.attr["ALL"] = list(self.videos.keys()) 123 | for x in attr: 124 | self.attr[x] = [] 125 | for k, v in self.videos.items(): 126 | for attr_ in v.attr: 127 | self.attr[attr_].append(k) 128 | -------------------------------------------------------------------------------- /toolkit/datasets/video.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import re 4 | import numpy as np 5 | import json 6 | 7 | from glob import glob 8 | 9 | 10 | class Video(object): 11 | def __init__( 12 | self, name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img=False 13 | ): 14 | self.name = name 15 | self.video_dir = video_dir 16 | self.init_rect = init_rect 17 | self.gt_traj = gt_rect 18 | self.attr = attr 19 | self.pred_trajs = {} 20 | self.img_names = [os.path.join(root, x) for x in img_names] 21 | self.imgs = None 22 | 23 | if load_img: 24 | self.imgs = [cv2.imread(x) for x in self.img_names] 25 | self.width = self.imgs[0].shape[1] 26 | self.height = self.imgs[0].shape[0] 27 | else: 28 | img = cv2.imread(self.img_names[0]) 29 | assert img is not None, self.img_names[0] 30 | self.width = img.shape[1] 31 | self.height = img.shape[0] 32 | 33 | def load_tracker(self, path, tracker_names=None, store=True): 34 | """ 35 | Args: 36 | path(str): path to result 37 | tracker_name(list): name of tracker 38 | """ 39 | if not tracker_names: 40 | tracker_names = [x.split("/")[-1] for x in glob(path) if os.path.isdir(x)] 41 | if isinstance(tracker_names, str): 42 | tracker_names = [tracker_names] 43 | for name in tracker_names: 44 | traj_file = os.path.join(path, name, self.name + ".txt") 45 | if os.path.exists(traj_file): 46 | with open(traj_file, "r") as f: 47 | pred_traj = [ 48 | list(map(float, x.strip().split(","))) for x in f.readlines() 49 | ] 50 | if len(pred_traj) != len(self.gt_traj): 51 | print(name, len(pred_traj), len(self.gt_traj), self.name) 52 | if store: 53 | self.pred_trajs[name] = pred_traj 54 | else: 55 | return pred_traj 56 | else: 57 | print(traj_file) 58 | self.tracker_names = list(self.pred_trajs.keys()) 59 | 60 | def load_img(self): 61 | if self.imgs is None: 62 | self.imgs = [cv2.imread(x) for x in self.img_names] 63 | self.width = self.imgs[0].shape[1] 64 | self.height = self.imgs[0].shape[0] 65 | 66 | def free_img(self): 67 | self.imgs = None 68 | 69 | def __len__(self): 70 | return len(self.img_names) 71 | 72 | def __getitem__(self, idx): 73 | if self.imgs is None: 74 | return cv2.imread(self.img_names[idx]), self.gt_traj[idx] 75 | else: 76 | return self.imgs[idx], self.gt_traj[idx] 77 | 78 | def __iter__(self): 79 | for i in range(len(self.img_names)): 80 | if self.imgs is not None: 81 | yield self.imgs[i], self.gt_traj[i] 82 | else: 83 | yield cv2.imread(self.img_names[i]), self.gt_traj[i] 84 | 85 | def draw_box(self, roi, img, linewidth, color, name=None): 86 | """ 87 | roi: rectangle or polygon 88 | img: numpy array img 89 | linewith: line width of the bbox 90 | """ 91 | if len(roi) > 6 and len(roi) % 2 == 0: 92 | pts = np.array(roi, np.int32).reshape(-1, 1, 2) 93 | color = tuple(map(int, color)) 94 | img = cv2.polylines(img, 
[pts], True, color, linewidth) 95 | pt = (pts[0, 0, 0], pts[0, 0, 1] - 5) 96 | if name: 97 | img = cv2.putText( 98 | img, name, pt, cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, color, 1 99 | ) 100 | elif len(roi) == 4: 101 | if not np.isnan(roi[0]): 102 | roi = list(map(int, roi)) 103 | color = tuple(map(int, color)) 104 | img = cv2.rectangle( 105 | img, 106 | (roi[0], roi[1]), 107 | (roi[0] + roi[2], roi[1] + roi[3]), 108 | color, 109 | linewidth, 110 | ) 111 | if name: 112 | img = cv2.putText( 113 | img, 114 | name, 115 | (roi[0], roi[1] - 5), 116 | cv2.FONT_HERSHEY_COMPLEX_SMALL, 117 | 1, 118 | color, 119 | 1, 120 | ) 121 | return img 122 | 123 | def show(self, pred_trajs={}, linewidth=2, show_name=False): 124 | """ 125 | pred_trajs: dict of pred_traj, {'tracker_name': list of traj} 126 | pred_traj should contain polygon or rectangle(x, y, width, height) 127 | linewith: line width of the bbox 128 | """ 129 | assert self.imgs is not None 130 | video = [] 131 | cv2.namedWindow(self.name, cv2.WINDOW_NORMAL) 132 | colors = {} 133 | if len(pred_trajs) == 0 and len(self.pred_trajs) > 0: 134 | pred_trajs = self.pred_trajs 135 | for i, (roi, img) in enumerate( 136 | zip(self.gt_traj, self.imgs[self.start_frame : self.end_frame + 1]) 137 | ): 138 | img = img.copy() 139 | if len(img.shape) == 2: 140 | img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) 141 | else: 142 | img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) 143 | img = self.draw_box( 144 | roi, img, linewidth, (0, 255, 0), "gt" if show_name else None 145 | ) 146 | for name, trajs in pred_trajs.items(): 147 | if name not in colors: 148 | color = tuple(np.random.randint(0, 256, 3)) 149 | colors[name] = color 150 | else: 151 | color = colors[name] 152 | img = self.draw_box( 153 | trajs[0][i], img, linewidth, color, name if show_name else None 154 | ) 155 | cv2.putText( 156 | img, 157 | str(i + self.start_frame), 158 | (5, 20), 159 | cv2.FONT_HERSHEY_COMPLEX_SMALL, 160 | 1, 161 | (255, 255, 0), 162 | 2, 163 | ) 164 | cv2.imshow(self.name, img) 165 | cv2.waitKey(40) 166 | video.append(img.copy()) 167 | return video 168 | -------------------------------------------------------------------------------- /toolkit/datasets/visdrone.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from tqdm import tqdm 4 | from glob import glob 5 | 6 | from .dataset import Dataset 7 | from .video import Video 8 | 9 | 10 | def loaddata(dataset_root): 11 | 12 | name_list = os.listdir(dataset_root + "/sequences") 13 | name_list.sort() 14 | 15 | b = [] 16 | for i in range(len(name_list)): 17 | b.append(name_list[i]) 18 | c = [] 19 | 20 | for jj in range(len(name_list)): 21 | imgs = dataset_root + "/sequences/" + str(name_list[jj]) 22 | txt = dataset_root + "/annotations/" + str(name_list[jj]) + ".txt" 23 | bbox = [] 24 | f = open(txt) 25 | file = f.readlines() 26 | li = os.listdir(imgs) 27 | li.sort() 28 | for ii in range(len(file)): 29 | li[ii] = name_list[jj] + "/" + li[ii] 30 | 31 | line = file[ii].strip("\n").split(",") 32 | 33 | try: 34 | line[0] = int(line[0]) 35 | except: 36 | line[0] = float(line[0]) 37 | try: 38 | line[1] = int(line[1]) 39 | except: 40 | line[1] = float(line[1]) 41 | try: 42 | line[2] = int(line[2]) 43 | except: 44 | line[2] = float(line[2]) 45 | try: 46 | line[3] = int(line[3]) 47 | except: 48 | line[3] = float(line[3]) 49 | bbox.append(line) 50 | 51 | if len(bbox) != len(li): 52 | print(jj) 53 | f.close() 54 | c.append( 55 | { 56 | "attr": [], 57 | "gt_rect": bbox, 58 | "img_names": li, 59 | "init_rect": 
bbox[0], 60 | "video_dir": name_list[jj], 61 | } 62 | ) 63 | 64 | d = dict(zip(b, c)) 65 | 66 | return d 67 | 68 | 69 | class UVADTVideo(Video): 70 | """ 71 | Args: 72 | name: video name 73 | root: dataset root 74 | video_dir: video directory 75 | init_rect: init rectangle 76 | img_names: image names 77 | gt_rect: groundtruth rectangle 78 | attr: attribute of video 79 | """ 80 | 81 | def __init__( 82 | self, name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img=False 83 | ): 84 | super(UVADTVideo, self).__init__( 85 | name, root, video_dir, init_rect, img_names, gt_rect, attr, load_img 86 | ) 87 | 88 | 89 | class VISDRONEDDataset(Dataset): 90 | """ 91 | Args: 92 | name: dataset name, should be 'OTB100', 'CVPR13', 'OTB50' 93 | dataset_root: dataset root 94 | load_img: wether to load all imgs 95 | """ 96 | 97 | def __init__(self, name, dataset_root, load_img=False): 98 | super(VISDRONEDDataset, self).__init__(name, dataset_root) 99 | 100 | meta_data = loaddata(dataset_root) 101 | # load videos 102 | pbar = tqdm(meta_data.keys(), desc="loading " + name, ncols=100) 103 | self.videos = {} 104 | for video in pbar: 105 | pbar.set_postfix_str(video) 106 | self.videos[video] = UVADTVideo( 107 | video, 108 | dataset_root + "/sequences", 109 | meta_data[video]["video_dir"], 110 | meta_data[video]["init_rect"], 111 | meta_data[video]["img_names"], 112 | meta_data[video]["gt_rect"], 113 | meta_data[video]["attr"], 114 | load_img, 115 | ) 116 | 117 | # set attr 118 | attr = [] 119 | for x in self.videos.values(): 120 | attr += x.attr 121 | attr = set(attr) 122 | self.attr = {} 123 | self.attr["ALL"] = list(self.videos.keys()) 124 | for x in attr: 125 | self.attr[x] = [] 126 | for k, v in self.videos.items(): 127 | for attr_ in v.attr: 128 | self.attr[attr_].append(k) 129 | -------------------------------------------------------------------------------- /toolkit/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .ope_benchmark import OPEBenchmark 2 | -------------------------------------------------------------------------------- /toolkit/evaluation/ope_benchmark.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from colorama import Style, Fore 4 | 5 | from ..utils.statistics import success_overlap, success_error 6 | 7 | 8 | class OPEBenchmark: 9 | """ 10 | Args: 11 | result_path: result path of your tracker 12 | should the same format like VOT 13 | """ 14 | 15 | def __init__(self, dataset): 16 | self.dataset = dataset 17 | 18 | def convert_bb_to_center(self, bboxes): 19 | return np.array( 20 | [ 21 | (bboxes[:, 0] + (bboxes[:, 2] - 1) / 2), 22 | (bboxes[:, 1] + (bboxes[:, 3] - 1) / 2), 23 | ] 24 | ).T 25 | 26 | def convert_bb_to_norm_center(self, bboxes, gt_wh): 27 | return self.convert_bb_to_center(bboxes) / (gt_wh + 1e-16) 28 | 29 | def eval_success(self, eval_trackers=None): 30 | """ 31 | Args: 32 | eval_trackers: list of tracker name or single tracker name 33 | Return: 34 | res: dict of results 35 | """ 36 | if eval_trackers is None: 37 | eval_trackers = self.dataset.tracker_names 38 | if isinstance(eval_trackers, str): 39 | eval_trackers = [eval_trackers] 40 | 41 | success_ret = {} 42 | for tracker_name in eval_trackers: 43 | success_ret_ = {} 44 | for video in self.dataset: 45 | gt_traj = np.array(video.gt_traj) 46 | if tracker_name not in video.pred_trajs: 47 | tracker_traj = video.load_tracker( 48 | self.dataset.tracker_path, tracker_name, False 49 | ) 
50 | tracker_traj = np.array(tracker_traj) 51 | else: 52 | tracker_traj = np.array(video.pred_trajs[tracker_name]) 53 | n_frame = len(gt_traj) 54 | if hasattr(video, "absent"): 55 | gt_traj = gt_traj[video.absent == 1] 56 | tracker_traj = tracker_traj[video.absent == 1] 57 | success_ret_[video.name] = success_overlap( 58 | gt_traj, tracker_traj, n_frame 59 | ) 60 | success_ret[tracker_name] = success_ret_ 61 | return success_ret 62 | 63 | def eval_precision(self, eval_trackers=None): 64 | """ 65 | Args: 66 | eval_trackers: list of tracker name or single tracker name 67 | Return: 68 | res: dict of results 69 | """ 70 | if eval_trackers is None: 71 | eval_trackers = self.dataset.tracker_names 72 | if isinstance(eval_trackers, str): 73 | eval_trackers = [eval_trackers] 74 | 75 | precision_ret = {} 76 | for tracker_name in eval_trackers: 77 | precision_ret_ = {} 78 | for video in self.dataset: 79 | gt_traj = np.array(video.gt_traj) 80 | if tracker_name not in video.pred_trajs: 81 | tracker_traj = video.load_tracker( 82 | self.dataset.tracker_path, tracker_name, False 83 | ) 84 | tracker_traj = np.array(tracker_traj) 85 | else: 86 | tracker_traj = np.array(video.pred_trajs[tracker_name]) 87 | n_frame = len(gt_traj) 88 | if hasattr(video, "absent"): 89 | gt_traj = gt_traj[video.absent == 1] 90 | tracker_traj = tracker_traj[video.absent == 1] 91 | gt_center = self.convert_bb_to_center(gt_traj) 92 | tracker_center = self.convert_bb_to_center(tracker_traj) 93 | thresholds = np.arange(0, 51, 1) 94 | precision_ret_[video.name] = success_error( 95 | gt_center, tracker_center, thresholds, n_frame 96 | ) 97 | precision_ret[tracker_name] = precision_ret_ 98 | return precision_ret 99 | 100 | def eval_norm_precision(self, eval_trackers=None): 101 | """ 102 | Args: 103 | eval_trackers: list of tracker name or single tracker name 104 | Return: 105 | res: dict of results 106 | """ 107 | if eval_trackers is None: 108 | eval_trackers = self.dataset.tracker_names 109 | if isinstance(eval_trackers, str): 110 | eval_trackers = [eval_trackers] 111 | 112 | norm_precision_ret = {} 113 | for tracker_name in eval_trackers: 114 | norm_precision_ret_ = {} 115 | for video in self.dataset: 116 | gt_traj = np.array(video.gt_traj) 117 | if tracker_name not in video.pred_trajs: 118 | tracker_traj = video.load_tracker( 119 | self.dataset.tracker_path, tracker_name, False 120 | ) 121 | tracker_traj = np.array(tracker_traj) 122 | else: 123 | tracker_traj = np.array(video.pred_trajs[tracker_name]) 124 | n_frame = len(gt_traj) 125 | if hasattr(video, "absent"): 126 | gt_traj = gt_traj[video.absent == 1] 127 | tracker_traj = tracker_traj[video.absent == 1] 128 | gt_center_norm = self.convert_bb_to_norm_center( 129 | gt_traj, gt_traj[:, 2:4] 130 | ) 131 | tracker_center_norm = self.convert_bb_to_norm_center( 132 | tracker_traj, gt_traj[:, 2:4] 133 | ) 134 | thresholds = np.arange(0, 51, 1) / 100 135 | norm_precision_ret_[video.name] = success_error( 136 | gt_center_norm, tracker_center_norm, thresholds, n_frame 137 | ) 138 | norm_precision_ret[tracker_name] = norm_precision_ret_ 139 | return norm_precision_ret 140 | 141 | def show_result( 142 | self, 143 | success_ret, 144 | precision_ret=None, 145 | norm_precision_ret=None, 146 | show_video_level=False, 147 | helight_threshold=0.6, 148 | ): 149 | """pretty print result 150 | Args: 151 | result: returned dict from function eval 152 | """ 153 | # sort tracker 154 | tracker_auc = {} 155 | for tracker_name in success_ret.keys(): 156 | auc = 
np.mean(list(success_ret[tracker_name].values())) 157 | tracker_auc[tracker_name] = auc 158 | tracker_auc_ = sorted(tracker_auc.items(), key=lambda x: x[1], reverse=True)[ 159 | :20 160 | ] 161 | tracker_names = [x[0] for x in tracker_auc_] 162 | 163 | tracker_name_len = max((max([len(x) for x in success_ret.keys()]) + 2), 12) 164 | header = ("|{:^" + str(tracker_name_len) + "}|{:^9}|{:^16}|{:^11}|").format( 165 | "Tracker name", "Success", "Norm Precision", "Precision" 166 | ) 167 | formatter = "|{:^" + str(tracker_name_len) + "}|{:^9.3f}|{:^16.3f}|{:^11.3f}|" 168 | print("-" * len(header)) 169 | print(header) 170 | print("-" * len(header)) 171 | for tracker_name in tracker_names: 172 | # success = np.mean(list(success_ret[tracker_name].values())) 173 | success = tracker_auc[tracker_name] 174 | if precision_ret is not None: 175 | precision = np.mean(list(precision_ret[tracker_name].values()), axis=0)[ 176 | 20 177 | ] 178 | else: 179 | precision = 0 180 | if norm_precision_ret is not None: 181 | norm_precision = np.mean( 182 | list(norm_precision_ret[tracker_name].values()), axis=0 183 | )[20] 184 | else: 185 | norm_precision = 0 186 | print(formatter.format(tracker_name, success, norm_precision, precision)) 187 | print("-" * len(header)) 188 | 189 | if ( 190 | show_video_level 191 | and len(success_ret) < 10 192 | and precision_ret is not None 193 | and len(precision_ret) < 10 194 | ): 195 | print("\n\n") 196 | header1 = "|{:^21}|".format("Tracker name") 197 | header2 = "|{:^21}|".format("Video name") 198 | for tracker_name in success_ret.keys(): 199 | # col_len = max(20, len(tracker_name)) 200 | header1 += ("{:^21}|").format(tracker_name) 201 | header2 += "{:^9}|{:^11}|".format("success", "precision") 202 | print("-" * len(header1)) 203 | print(header1) 204 | print("-" * len(header1)) 205 | print(header2) 206 | print("-" * len(header1)) 207 | videos = list(success_ret[tracker_name].keys()) 208 | for video in videos: 209 | row = "|{:^21}|".format(video) 210 | for tracker_name in success_ret.keys(): 211 | success = np.mean(success_ret[tracker_name][video]) 212 | precision = np.mean(precision_ret[tracker_name][video]) 213 | success_str = "{:^9.3f}".format(success) 214 | if success < helight_threshold: 215 | row += Fore.RED + success_str + Style.RESET_ALL + "|" 216 | else: 217 | row += success_str + "|" 218 | precision_str = "{:^11.3f}".format(precision) 219 | if precision < helight_threshold: 220 | row += Fore.RED + precision_str + Style.RESET_ALL + "|" 221 | else: 222 | row += precision_str + "|" 223 | print(row) 224 | print("-" * len(header1)) 225 | -------------------------------------------------------------------------------- /toolkit/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import region 2 | from .statistics import * 3 | -------------------------------------------------------------------------------- /toolkit/utils/c_region.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "src/region.h": 2 | ctypedef enum region_type "RegionType": 3 | EMTPY 4 | SPECIAL 5 | RECTANGEL 6 | POLYGON 7 | MASK 8 | 9 | ctypedef struct region_bounds: 10 | float top 11 | float bottom 12 | float left 13 | float right 14 | 15 | ctypedef struct region_rectangle: 16 | float x 17 | float y 18 | float width 19 | float height 20 | 21 | # ctypedef struct region_mask: 22 | # int x 23 | # int y 24 | # int width 25 | # int height 26 | # char *data 27 | 28 | ctypedef struct region_polygon: 29 | int count 30 | float *x 31 | float *y 32 | 33 | ctypedef union region_container_data: 34 | region_rectangle rectangle 35 | region_polygon polygon 36 | # region_mask mask 37 | int special 38 | 39 | ctypedef struct region_container: 40 | region_type type 41 | region_container_data data 42 | 43 | # ctypedef struct region_overlap: 44 | # float overlap 45 | # float only1 46 | # float only2 47 | 48 | # region_overlap region_compute_overlap(const region_container* ra, const region_container* rb, region_bounds bounds) 49 | 50 | float compute_polygon_overlap(const region_polygon* p1, const region_polygon* p2, float *only1, float *only2, region_bounds bounds) 51 | -------------------------------------------------------------------------------- /toolkit/utils/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author fangyi.zhang@vipl.ict.ac.cn 3 | """ 4 | import numpy as np 5 | 6 | def determine_thresholds(confidence, resolution=100): 7 | """choose threshold according to confidence 8 | 9 | Args: 10 | confidence: list or numpy array or numpy array 11 | reolution: number of threshold to choose 12 | 13 | Restures: 14 | threshold: numpy array 15 | """ 16 | if isinstance(confidence, list): 17 | confidence = np.array(confidence) 18 | confidence = confidence.flatten() 19 | confidence = confidence[~np.isnan(confidence)] 20 | confidence.sort() 21 | 22 | assert len(confidence) > resolution and resolution > 2 23 | 24 | thresholds = np.ones((resolution)) 25 | thresholds[0] = - np.inf 26 | thresholds[-1] = np.inf 27 | delta = np.floor(len(confidence) / (resolution - 2)) 28 | idxs = np.linspace(delta, len(confidence)-delta, resolution-2, dtype=np.int32) 29 | thresholds[1:-1] = confidence[idxs] 30 | return thresholds 31 | -------------------------------------------------------------------------------- /toolkit/utils/region.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vision4robotics/PRL-Track/b89beed235b38f24c93f875276b5221c99f6fb29/toolkit/utils/region.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /toolkit/utils/region.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | @author fangyi.zhang@vipl.ict.ac.cn 3 | """ 4 | # distutils: sources = src/region.c 5 | # distutils: include_dirs = src/ 6 | 7 | from libc.stdlib cimport malloc, free 8 | from libc.stdio cimport sprintf 9 | from libc.string cimport strlen 10 | 11 | cimport c_region 12 | 13 | cpdef enum RegionType: 14 | EMTPY 15 | SPECIAL 16 | RECTANGEL 17 | POLYGON 18 | MASK 19 | 20 | cdef class RegionBounds: 21 | cdef c_region.region_bounds* _c_region_bounds 22 | 
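`determine_thresholds` in `toolkit/utils/misc.py` above picks `resolution` evenly spaced thresholds from the sorted confidence values, pinning the first and last entries to minus and plus infinity. A small usage sketch follows; the confidence scores are synthetic, and the import assumes the `toolkit` package is importable (e.g. after `python setup.py build_ext --inplace`).

```python
# Usage sketch for determine_thresholds(); the scores below are made up.
import numpy as np
from toolkit.utils.misc import determine_thresholds

confidence = np.random.default_rng(0).random(1000)  # fake per-frame scores
thresholds = determine_thresholds(confidence, resolution=100)

print(thresholds.shape)                  # (100,)
print(thresholds[0], thresholds[-1])     # -inf inf
print(np.all(np.diff(thresholds) >= 0))  # thresholds are non-decreasing
```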
23 | def __cinit__(self): 24 | self._c_region_bounds = malloc( 25 | sizeof(c_region.region_bounds)) 26 | if not self._c_region_bounds: 27 | self._c_region_bounds = NULL 28 | raise MemoryError() 29 | 30 | def __init__(self, top, bottom, left, right): 31 | self.set(top, bottom, left, right) 32 | 33 | def __dealloc__(self): 34 | if self._c_region_bounds is not NULL: 35 | free(self._c_region_bounds) 36 | self._c_region_bounds = NULL 37 | 38 | def __str__(self): 39 | return "top: {:.3f} bottom: {:.3f} left: {:.3f} reight: {:.3f}".format( 40 | self._c_region_bounds.top, 41 | self._c_region_bounds.bottom, 42 | self._c_region_bounds.left, 43 | self._c_region_bounds.right) 44 | 45 | def get(self): 46 | return (self._c_region_bounds.top, 47 | self._c_region_bounds.bottom, 48 | self._c_region_bounds.left, 49 | self._c_region_bounds.right) 50 | 51 | def set(self, top, bottom, left, right): 52 | self._c_region_bounds.top = top 53 | self._c_region_bounds.bottom = bottom 54 | self._c_region_bounds.left = left 55 | self._c_region_bounds.right = right 56 | 57 | cdef class Rectangle: 58 | cdef c_region.region_rectangle* _c_region_rectangle 59 | 60 | def __cinit__(self): 61 | self._c_region_rectangle = malloc( 62 | sizeof(c_region.region_rectangle)) 63 | if not self._c_region_rectangle: 64 | self._c_region_rectangle = NULL 65 | raise MemoryError() 66 | 67 | def __init__(self, x, y, width, height): 68 | self.set(x, y, width, height) 69 | 70 | def __dealloc__(self): 71 | if self._c_region_rectangle is not NULL: 72 | free(self._c_region_rectangle) 73 | self._c_region_rectangle = NULL 74 | 75 | def __str__(self): 76 | return "x: {:.3f} y: {:.3f} width: {:.3f} height: {:.3f}".format( 77 | self._c_region_rectangle.x, 78 | self._c_region_rectangle.y, 79 | self._c_region_rectangle.width, 80 | self._c_region_rectangle.height) 81 | 82 | def set(self, x, y, width, height): 83 | self._c_region_rectangle.x = x 84 | self._c_region_rectangle.y = y 85 | self._c_region_rectangle.width = width 86 | self._c_region_rectangle.height = height 87 | 88 | def get(self): 89 | """ 90 | return: 91 | (x, y, width, height) 92 | """ 93 | return (self._c_region_rectangle.x, 94 | self._c_region_rectangle.y, 95 | self._c_region_rectangle.width, 96 | self._c_region_rectangle.height) 97 | 98 | cdef class Polygon: 99 | cdef c_region.region_polygon* _c_region_polygon 100 | 101 | def __cinit__(self, points): 102 | """ 103 | args: 104 | points: tuple of point 105 | points = ((1, 1), (10, 10)) 106 | """ 107 | num = len(points) // 2 108 | self._c_region_polygon = malloc( 109 | sizeof(c_region.region_polygon)) 110 | if not self._c_region_polygon: 111 | self._c_region_polygon = NULL 112 | raise MemoryError() 113 | self._c_region_polygon.count = num 114 | self._c_region_polygon.x = malloc(sizeof(float) * num) 115 | if not self._c_region_polygon.x: 116 | raise MemoryError() 117 | self._c_region_polygon.y = malloc(sizeof(float) * num) 118 | if not self._c_region_polygon.y: 119 | raise MemoryError() 120 | 121 | for i in range(num): 122 | self._c_region_polygon.x[i] = points[i*2] 123 | self._c_region_polygon.y[i] = points[i*2+1] 124 | 125 | def __dealloc__(self): 126 | if self._c_region_polygon is not NULL: 127 | if self._c_region_polygon.x is not NULL: 128 | free(self._c_region_polygon.x) 129 | self._c_region_polygon.x = NULL 130 | if self._c_region_polygon.y is not NULL: 131 | free(self._c_region_polygon.y) 132 | self._c_region_polygon.y = NULL 133 | free(self._c_region_polygon) 134 | self._c_region_polygon = NULL 135 | 136 | def __str__(self): 137 | ret 
= "" 138 | for i in range(self._c_region_polygon.count-1): 139 | ret += "({:.3f} {:.3f}) ".format(self._c_region_polygon.x[i], 140 | self._c_region_polygon.y[i]) 141 | ret += "({:.3f} {:.3f})".format(self._c_region_polygon.x[i], 142 | self._c_region_polygon.y[i]) 143 | return ret 144 | 145 | def vot_overlap(polygon1, polygon2, bounds=None): 146 | """ computing overlap between two polygon 147 | Args: 148 | polygon1: polygon tuple of points 149 | polygon2: polygon tuple of points 150 | bounds: tuple of (left, top, right, bottom) or tuple of (width height) 151 | Return: 152 | overlap: overlap between two polygons 153 | """ 154 | if len(polygon1) == 1 or len(polygon2) == 1: 155 | return float("nan") 156 | 157 | if len(polygon1) == 4: 158 | polygon1_ = Polygon([polygon1[0], polygon1[1], 159 | polygon1[0]+polygon1[2], polygon1[1], 160 | polygon1[0]+polygon1[2], polygon1[1]+polygon1[3], 161 | polygon1[0], polygon1[1]+polygon1[3]]) 162 | else: 163 | polygon1_ = Polygon(polygon1) 164 | 165 | if len(polygon2) == 4: 166 | polygon2_ = Polygon([polygon2[0], polygon2[1], 167 | polygon2[0]+polygon2[2], polygon2[1], 168 | polygon2[0]+polygon2[2], polygon2[1]+polygon2[3], 169 | polygon2[0], polygon2[1]+polygon2[3]]) 170 | else: 171 | polygon2_ = Polygon(polygon2) 172 | 173 | if bounds is not None and len(bounds) == 4: 174 | pno_bounds = RegionBounds(bounds[0], bounds[1], bounds[2], bounds[3]) 175 | elif bounds is not None and len(bounds) == 2: 176 | pno_bounds = RegionBounds(0, bounds[1], 0, bounds[0]) 177 | else: 178 | pno_bounds = RegionBounds(-float("inf"), float("inf"), 179 | -float("inf"), float("inf")) 180 | cdef float only1 = 0 181 | cdef float only2 = 0 182 | cdef c_region.region_polygon* c_polygon1 = polygon1_._c_region_polygon 183 | cdef c_region.region_polygon* c_polygon2 = polygon2_._c_region_polygon 184 | cdef c_region.region_bounds no_bounds = pno_bounds._c_region_bounds[0] # deference 185 | return c_region.compute_polygon_overlap(c_polygon1, 186 | c_polygon2, 187 | &only1, 188 | &only2, 189 | no_bounds) 190 | 191 | def vot_overlap_traj(polygons1, polygons2, bounds=None): 192 | """ computing overlap between two trajectory 193 | Args: 194 | polygons1: list of polygon 195 | polygons2: list of polygon 196 | bounds: tuple of (left, top, right, bottom) or tuple of (width height) 197 | Return: 198 | overlaps: overlaps between all pair of polygons 199 | """ 200 | assert len(polygons1) == len(polygons2) 201 | overlaps = [] 202 | for i in range(len(polygons1)): 203 | overlap = vot_overlap(polygons1[i], polygons2[i], bounds=bounds) 204 | overlaps.append(overlap) 205 | return overlaps 206 | 207 | 208 | def vot_float2str(template, float value): 209 | """ 210 | Args: 211 | tempate: like "%.3f" in C syntax 212 | value: float value 213 | """ 214 | cdef bytes ptemplate = template.encode() 215 | cdef const char* ctemplate = ptemplate 216 | cdef char* output = malloc(sizeof(char) * 100) 217 | if not output: 218 | raise MemoryError() 219 | sprintf(output, ctemplate, value) 220 | try: 221 | ret = output[:strlen(output)].decode() 222 | finally: 223 | free(output) 224 | return ret 225 | -------------------------------------------------------------------------------- /toolkit/utils/src/buffer.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef __STRING_BUFFER_H 3 | #define __STRING_BUFFER_H 4 | 5 | // Enable MinGW secure API for _snprintf_s 6 | #define MINGW_HAS_SECURE_API 1 7 | 8 | #ifdef _MSC_VER 9 | #define __INLINE __inline 10 | #else 11 | #define __INLINE inline 
12 | #endif 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | typedef struct string_buffer { 19 | char* buffer; 20 | int position; 21 | int size; 22 | } string_buffer; 23 | 24 | typedef struct string_list { 25 | char** buffer; 26 | int position; 27 | int size; 28 | } string_list; 29 | 30 | #define BUFFER_INCREMENT_STEP 4096 31 | 32 | static __INLINE string_buffer* buffer_create(int L) { 33 | string_buffer* B = (string_buffer*) malloc(sizeof(string_buffer)); 34 | B->size = L; 35 | B->buffer = (char*) malloc(sizeof(char) * B->size); 36 | B->position = 0; 37 | return B; 38 | } 39 | 40 | static __INLINE void buffer_reset(string_buffer* B) { 41 | B->position = 0; 42 | } 43 | 44 | static __INLINE void buffer_destroy(string_buffer** B) { 45 | if (!(*B)) return; 46 | if ((*B)->buffer) { 47 | free((*B)->buffer); 48 | (*B)->buffer = NULL; 49 | } 50 | free((*B)); 51 | (*B) = NULL; 52 | } 53 | 54 | static __INLINE char* buffer_extract(const string_buffer* B) { 55 | char *S = (char*) malloc(sizeof(char) * (B->position + 1)); 56 | memcpy(S, B->buffer, B->position); 57 | S[B->position] = '\0'; 58 | return S; 59 | } 60 | 61 | static __INLINE int buffer_size(const string_buffer* B) { 62 | return B->position; 63 | } 64 | 65 | static __INLINE void buffer_push(string_buffer* B, char C) { 66 | int required = 1; 67 | if (required > B->size - B->position) { 68 | B->size = B->position + BUFFER_INCREMENT_STEP; 69 | B->buffer = (char*) realloc(B->buffer, sizeof(char) * B->size); 70 | } 71 | B->buffer[B->position] = C; 72 | B->position += required; 73 | } 74 | 75 | static __INLINE void buffer_append(string_buffer* B, const char *format, ...) { 76 | 77 | int required; 78 | va_list args; 79 | 80 | #if defined(__OS2__) || defined(__WINDOWS__) || defined(WIN32) || defined(_MSC_VER) 81 | 82 | va_start(args, format); 83 | required = _vscprintf(format, args) + 1; 84 | va_end(args); 85 | if (required >= B->size - B->position) { 86 | B->size = B->position + required + 1; 87 | B->buffer = (char*) realloc(B->buffer, sizeof(char) * B->size); 88 | } 89 | va_start(args, format); 90 | required = _vsnprintf_s(&(B->buffer[B->position]), B->size - B->position, _TRUNCATE, format, args); 91 | va_end(args); 92 | B->position += required; 93 | 94 | #else 95 | va_start(args, format); 96 | required = vsnprintf(&(B->buffer[B->position]), B->size - B->position, format, args); 97 | va_end(args); 98 | if (required >= B->size - B->position) { 99 | B->size = B->position + required + 1; 100 | B->buffer = (char*) realloc(B->buffer, sizeof(char) * B->size); 101 | va_start(args, format); 102 | required = vsnprintf(&(B->buffer[B->position]), B->size - B->position, format, args); 103 | va_end(args); 104 | } 105 | B->position += required; 106 | #endif 107 | 108 | } 109 | 110 | static __INLINE string_list* list_create(int L) { 111 | string_list* B = (string_list*) malloc(sizeof(string_list)); 112 | B->size = L; 113 | B->buffer = (char**) malloc(sizeof(char*) * B->size); 114 | memset(B->buffer, 0, sizeof(char*) * B->size); 115 | B->position = 0; 116 | return B; 117 | } 118 | 119 | static __INLINE void list_reset(string_list* B) { 120 | int i; 121 | for (i = 0; i < B->position; i++) { 122 | if (B->buffer[i]) free(B->buffer[i]); 123 | B->buffer[i] = NULL; 124 | } 125 | B->position = 0; 126 | } 127 | 128 | static __INLINE void list_destroy(string_list **B) { 129 | int i; 130 | 131 | if (!(*B)) return; 132 | 133 | for (i = 0; i < (*B)->position; i++) { 134 | if ((*B)->buffer[i]) free((*B)->buffer[i]); (*B)->buffer[i] = NULL; 135 | } 136 | 137 | if 
((*B)->buffer) { 138 | free((*B)->buffer); (*B)->buffer = NULL; 139 | } 140 | 141 | free((*B)); 142 | (*B) = NULL; 143 | } 144 | 145 | static __INLINE char* list_get(const string_list *B, int I) { 146 | if (I < 0 || I >= B->position) { 147 | return NULL; 148 | } else { 149 | if (!B->buffer[I]) { 150 | return NULL; 151 | } else { 152 | char *S; 153 | int length = strlen(B->buffer[I]); 154 | S = (char*) malloc(sizeof(char) * (length + 1)); 155 | memcpy(S, B->buffer[I], length + 1); 156 | return S; 157 | } 158 | } 159 | } 160 | 161 | static __INLINE int list_size(const string_list *B) { 162 | return B->position; 163 | } 164 | 165 | static __INLINE void list_append(string_list *B, char* S) { 166 | int required = 1; 167 | int length = strlen(S); 168 | if (required > B->size - B->position) { 169 | B->size = B->position + 16; 170 | B->buffer = (char**) realloc(B->buffer, sizeof(char*) * B->size); 171 | } 172 | B->buffer[B->position] = (char*) malloc(sizeof(char) * (length + 1)); 173 | memcpy(B->buffer[B->position], S, length + 1); 174 | B->position += required; 175 | } 176 | 177 | // This version of the append does not copy the string but simply takes the control of its allocation 178 | static __INLINE void list_append_direct(string_list *B, char* S) { 179 | int required = 1; 180 | // int length = strlen(S); 181 | if (required > B->size - B->position) { 182 | B->size = B->position + 16; 183 | B->buffer = (char**) realloc(B->buffer, sizeof(char*) * B->size); 184 | } 185 | B->buffer[B->position] = S; 186 | B->position += required; 187 | } 188 | 189 | 190 | #endif 191 | -------------------------------------------------------------------------------- /toolkit/utils/src/region.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 4; tab-width: 4 -*- */ 2 | 3 | #ifndef _REGION_H_ 4 | #define _REGION_H_ 5 | 6 | #ifdef TRAX_STATIC_DEFINE 7 | # define __TRAX_EXPORT 8 | #else 9 | # ifndef __TRAX_EXPORT 10 | # if defined(_MSC_VER) 11 | # ifdef trax_EXPORTS 12 | /* We are building this library */ 13 | # define __TRAX_EXPORT __declspec(dllexport) 14 | # else 15 | /* We are using this library */ 16 | # define __TRAX_EXPORT __declspec(dllimport) 17 | # endif 18 | # elif defined(__GNUC__) 19 | # ifdef trax_EXPORTS 20 | /* We are building this library */ 21 | # define __TRAX_EXPORT __attribute__((visibility("default"))) 22 | # else 23 | /* We are using this library */ 24 | # define __TRAX_EXPORT __attribute__((visibility("default"))) 25 | # endif 26 | # endif 27 | # endif 28 | #endif 29 | 30 | #ifndef MAX 31 | #define MAX(a,b) (((a) > (b)) ? (a) : (b)) 32 | #endif 33 | 34 | #ifndef MIN 35 | #define MIN(a,b) (((a) < (b)) ? 
(a) : (b)) 36 | #endif 37 | 38 | #define TRAX_DEFAULT_CODE 0 39 | 40 | #define REGION_LEGACY_RASTERIZATION 1 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | typedef enum region_type {EMPTY, SPECIAL, RECTANGLE, POLYGON, MASK} region_type; 47 | 48 | typedef struct region_bounds { 49 | 50 | float top; 51 | float bottom; 52 | float left; 53 | float right; 54 | 55 | } region_bounds; 56 | 57 | typedef struct region_polygon { 58 | 59 | int count; 60 | 61 | float* x; 62 | float* y; 63 | 64 | } region_polygon; 65 | 66 | typedef struct region_mask { 67 | 68 | int x; 69 | int y; 70 | 71 | int width; 72 | int height; 73 | 74 | char* data; 75 | 76 | } region_mask; 77 | 78 | typedef struct region_rectangle { 79 | 80 | float x; 81 | float y; 82 | float width; 83 | float height; 84 | 85 | } region_rectangle; 86 | 87 | typedef struct region_container { 88 | enum region_type type; 89 | union { 90 | region_rectangle rectangle; 91 | region_polygon polygon; 92 | region_mask mask; 93 | int special; 94 | } data; 95 | } region_container; 96 | 97 | typedef struct region_overlap { 98 | 99 | float overlap; 100 | float only1; 101 | float only2; 102 | 103 | } region_overlap; 104 | 105 | extern const region_bounds region_no_bounds; 106 | 107 | __TRAX_EXPORT int region_set_flags(int mask); 108 | 109 | __TRAX_EXPORT int region_clear_flags(int mask); 110 | 111 | __TRAX_EXPORT region_overlap region_compute_overlap(const region_container* ra, const region_container* rb, region_bounds bounds); 112 | 113 | __TRAX_EXPORT float compute_polygon_overlap(const region_polygon* p1, const region_polygon* p2, float *only1, float *only2, region_bounds bounds); 114 | 115 | __TRAX_EXPORT region_bounds region_create_bounds(float left, float top, float right, float bottom); 116 | 117 | __TRAX_EXPORT region_bounds region_compute_bounds(const region_container* region); 118 | 119 | __TRAX_EXPORT int region_parse(const char* buffer, region_container** region); 120 | 121 | __TRAX_EXPORT char* region_string(region_container* region); 122 | 123 | __TRAX_EXPORT void region_print(FILE* out, region_container* region); 124 | 125 | __TRAX_EXPORT region_container* region_convert(const region_container* region, region_type type); 126 | 127 | __TRAX_EXPORT void region_release(region_container** region); 128 | 129 | __TRAX_EXPORT region_container* region_create_special(int code); 130 | 131 | __TRAX_EXPORT region_container* region_create_rectangle(float x, float y, float width, float height); 132 | 133 | __TRAX_EXPORT region_container* region_create_polygon(int count); 134 | 135 | __TRAX_EXPORT int region_contains_point(region_container* r, float x, float y); 136 | 137 | __TRAX_EXPORT void region_get_mask(region_container* r, char* mask, int width, int height); 138 | 139 | __TRAX_EXPORT void region_get_mask_offset(region_container* r, char* mask, int x, int y, int width, int height); 140 | 141 | #ifdef __cplusplus 142 | } 143 | #endif 144 | 145 | #endif 146 | -------------------------------------------------------------------------------- /toolkit/utils/statistics.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author fangyi.zhang@vipl.ict.ac.cn 3 | """ 4 | import numpy as np 5 | from . 
import region 6 | 7 | def calculate_failures(trajectory): 8 | """ Calculate number of failures 9 | Args: 10 | trajectory: list of bbox 11 | Returns: 12 | num_failures: number of failures 13 | failures: failures point in trajectory, start with 0 14 | """ 15 | failures = [i for i, x in zip(range(len(trajectory)), trajectory) 16 | if len(x) == 1 and x[0] == 2] 17 | num_failures = len(failures) 18 | return num_failures, failures 19 | 20 | def calculate_accuracy(pred_trajectory, gt_trajectory, 21 | burnin=0, ignore_unknown=True, bound=None): 22 | """Caculate accuracy socre as average overlap over the entire sequence 23 | Args: 24 | trajectory: list of bbox 25 | gt_trajectory: list of bbox 26 | burnin: number of frames that have to be ignored after the failure 27 | ignore_unknown: ignore frames where the overlap is unknown 28 | bound: bounding region 29 | Return: 30 | acc: average overlap 31 | overlaps: per frame overlaps 32 | """ 33 | pred_trajectory_ = pred_trajectory 34 | if not ignore_unknown: 35 | unkown = [len(x)==1 and x[0] == 0 for x in pred_trajectory] 36 | 37 | if burnin > 0: 38 | pred_trajectory_ = pred_trajectory[:] 39 | mask = [len(x)==1 and x[0] == 1 for x in pred_trajectory] 40 | for i in range(len(mask)): 41 | if mask[i]: 42 | for j in range(burnin): 43 | if i + j < len(mask): 44 | pred_trajectory_[i+j] = [0] 45 | min_len = min(len(pred_trajectory_), len(gt_trajectory)) 46 | overlaps = region.vot_overlap_traj(pred_trajectory_[:min_len], 47 | gt_trajectory[:min_len], bound) 48 | 49 | if not ignore_unknown: 50 | overlaps = [u if u else 0 for u in unkown] 51 | 52 | acc = 0 53 | if len(overlaps) > 0: 54 | acc = np.nanmean(overlaps) 55 | return acc, overlaps 56 | 57 | # def caculate_expected_overlap(pred_trajectorys, gt_trajectorys, skip_init, traj_length=None, 58 | # weights=None, tags=['all']): 59 | # """ Caculate expected overlap 60 | # Args: 61 | # pred_trajectory: list of bbox 62 | # gt_trajectory: list of bbox 63 | # traj_length: a list of sequence length for which the overlap should be evaluated 64 | # weights: a list of per-sequence weights that indicate how much does each sequence 65 | # contribute to the estimate 66 | # tags: set list of tags for which to perform calculation 67 | # """ 68 | # overlaps = [calculate_accuracy(pred, gt)[1] 69 | # for pred, gt in zip(pred_trajectorys, gt_trajectorys)] 70 | # failures = [calculate_accuracy(pred, gt)[1] 71 | # for pred, gt in zip(pred_trajectorys, gt_trajectorys)] 72 | # 73 | # if traj_length is None: 74 | # traj_length = range(1, max([len(x) for x in gt_trajectorys])+1) 75 | # traj_length = list(set(traj_length)) 76 | 77 | def overlap_ratio(rect1, rect2): 78 | '''Compute overlap ratio between two rects 79 | Args 80 | rect:2d array of N x [x,y,w,h] 81 | Return: 82 | iou 83 | ''' 84 | # if rect1.ndim==1: 85 | # rect1 = rect1[np.newaxis, :] 86 | # if rect2.ndim==1: 87 | # rect2 = rect2[np.newaxis, :] 88 | left = np.maximum(rect1[:,0], rect2[:,0]) 89 | right = np.minimum(rect1[:,0]+rect1[:,2], rect2[:,0]+rect2[:,2]) 90 | top = np.maximum(rect1[:,1], rect2[:,1]) 91 | bottom = np.minimum(rect1[:,1]+rect1[:,3], rect2[:,1]+rect2[:,3]) 92 | 93 | intersect = np.maximum(0,right - left) * np.maximum(0,bottom - top) 94 | union = rect1[:,2]*rect1[:,3] + rect2[:,2]*rect2[:,3] - intersect 95 | iou = intersect / union 96 | iou = np.maximum(np.minimum(1, iou), 0) 97 | return iou 98 | 99 | def success_overlap(gt_bb, result_bb, n_frame): 100 | thresholds_overlap = np.arange(0, 1.05, 0.05) 101 | success = np.zeros(len(thresholds_overlap)) 102 | iou = 
np.ones(len(gt_bb)) * (-1) 103 | # mask = np.sum(gt_bb > 0, axis=1) == 4 #TODO check all dataset 104 | mask = np.sum(gt_bb[:, 2:] > 0, axis=1) == 2 105 | iou[mask] = overlap_ratio(gt_bb[mask], result_bb[mask]) 106 | for i in range(len(thresholds_overlap)): 107 | success[i] = np.sum(iou > thresholds_overlap[i]) / float(n_frame) 108 | return success 109 | 110 | def success_error(gt_center, result_center, thresholds, n_frame): 111 | # n_frame = len(gt_center) 112 | success = np.zeros(len(thresholds)) 113 | dist = np.ones(len(gt_center)) * (-1) 114 | mask = np.sum(gt_center > 0, axis=1) == 2 115 | dist[mask] = np.sqrt(np.sum( 116 | np.power(gt_center[mask] - result_center[mask], 2), axis=1)) 117 | for i in range(len(thresholds)): 118 | success[i] = np.sum(dist <= thresholds[i]) / float(n_frame) 119 | return success 120 | 121 | def determine_thresholds(scores, resolution=100): 122 | """ 123 | Args: 124 | scores: 1d array of score 125 | """ 126 | scores = np.sort(scores[np.logical_not(np.isnan(scores))]) 127 | delta = np.floor(len(scores) / (resolution - 2)) 128 | idxs = np.floor(np.linspace(delta-1, len(scores)-delta, resolution-2)+0.5).astype(np.int32) 129 | thresholds = np.zeros((resolution)) 130 | thresholds[0] = - np.inf 131 | thresholds[-1] = np.inf 132 | thresholds[1:-1] = scores[idxs] 133 | return thresholds 134 | 135 | def calculate_f1(overlaps, score, bound, thresholds, N): 136 | overlaps = np.array(overlaps) 137 | overlaps[np.isnan(overlaps)] = 0 138 | score = np.array(score) 139 | score[np.isnan(score)] = 0 140 | precision = np.zeros(len(thresholds)) 141 | recall = np.zeros(len(thresholds)) 142 | for i, th in enumerate(thresholds): 143 | if th == - np.inf: 144 | idx = score > 0 145 | else: 146 | idx = score >= th 147 | if np.sum(idx) == 0: 148 | precision[i] = 1 149 | recall[i] = 0 150 | else: 151 | precision[i] = np.mean(overlaps[idx]) 152 | recall[i] = np.sum(overlaps[idx]) / N 153 | f1 = 2 * precision * recall / (precision + recall) 154 | return f1, precision, recall 155 | 156 | def calculate_expected_overlap(fragments, fweights): 157 | max_len = fragments.shape[1] 158 | expected_overlaps = np.zeros((max_len), np.float32) 159 | expected_overlaps[0] = 1 160 | 161 | # TODO Speed Up 162 | for i in range(1, max_len): 163 | mask = np.logical_not(np.isnan(fragments[:, i])) 164 | if np.any(mask): 165 | fragment = fragments[mask, 1:i+1] 166 | seq_mean = np.sum(fragment, 1) / fragment.shape[1] 167 | expected_overlaps[i] = np.sum(seq_mean * 168 | fweights[mask]) / np.sum(fweights[mask]) 169 | return expected_overlaps 170 | -------------------------------------------------------------------------------- /toolkit/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | from .draw_success_precision import draw_success_precision 2 | -------------------------------------------------------------------------------- /toolkit/visualization/draw_success_precision.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | from .draw_utils import COLOR, LINE_STYLE 5 | 6 | 7 | def draw_success_precision( 8 | success_ret, 9 | name, 10 | videos, 11 | attr, 12 | precision_ret=None, 13 | norm_precision_ret=None, 14 | bold_name=None, 15 | axis=[0, 1], 16 | ): 17 | # success plot 18 | fig, ax = plt.subplots() 19 | ax.grid() 20 | ax.set_aspect(1) 21 | plt.xlabel("Overlap threshold") 22 | plt.ylabel("Success rate") 23 | if attr == "ALL": 24 | 
plt.title(r"\textbf{Success plots of OPE on %s}" % (name)) 25 | else: 26 | plt.title(r"\textbf{Success plots of OPE - %s}" % (attr)) 27 | plt.axis([0, 1] + axis) 28 | success = {} 29 | thresholds = np.arange(0, 1.05, 0.05) 30 | for tracker_name in success_ret.keys(): 31 | value = [v for k, v in success_ret[tracker_name].items() if k in videos] 32 | success[tracker_name] = np.mean(value) 33 | for idx, (tracker_name, auc) in enumerate( 34 | sorted(success.items(), key=lambda x: x[1], reverse=True) 35 | ): 36 | if tracker_name == bold_name: 37 | label = r"\textbf{[%.3f] %s}" % (auc, tracker_name) 38 | else: 39 | label = "[%.3f] " % (auc) + tracker_name 40 | value = [v for k, v in success_ret[tracker_name].items() if k in videos] 41 | plt.plot( 42 | thresholds, 43 | np.mean(value, axis=0), 44 | color=COLOR[idx], 45 | linestyle=LINE_STYLE[idx], 46 | label=label, 47 | linewidth=2, 48 | ) 49 | ax.legend(loc="lower left", labelspacing=0.2) 50 | ax.autoscale(enable=True, axis="both", tight=True) 51 | xmin, xmax, ymin, ymax = plt.axis() 52 | ax.autoscale(enable=False) 53 | ymax += 0.03 54 | plt.axis([xmin, xmax, ymin, ymax]) 55 | plt.xticks(np.arange(xmin, xmax + 0.01, 0.1)) 56 | plt.yticks(np.arange(ymin, ymax, 0.1)) 57 | ax.set_aspect((xmax - xmin) / (ymax - ymin)) 58 | # plt.show() 59 | plt.savefig( 60 | f"/mnt/sdc/V4R/LX/2024/HiFT/result_{attr}.png", 61 | format="png", 62 | bbox_inches="tight", 63 | ) 64 | -------------------------------------------------------------------------------- /toolkit/visualization/draw_utils.py: -------------------------------------------------------------------------------- 1 | COLOR = ( 2 | (1, 0, 0), 3 | (0, 1, 0), 4 | (1, 0, 1), 5 | (1, 1, 0), 6 | (0, 162 / 255, 232 / 255), 7 | (0.5, 0.5, 0.5), 8 | (0, 0, 1), 9 | (0, 1, 1), 10 | (136 / 255, 0, 21 / 255), 11 | (255 / 255, 127 / 255, 39 / 255), 12 | (0, 0, 0), 13 | ) 14 | 15 | LINE_STYLE = ["-", "--", ":", "-", "--", ":", "-", "--", ":", "-"] 16 | 17 | MARKER_STYLE = ["o", "v", "<", "*", "D", "x", ".", "x", "<", "."] 18 | -------------------------------------------------------------------------------- /tools/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | 5 | sys.path.append("./") 6 | 7 | from tqdm import tqdm 8 | from multiprocessing import Pool 9 | 10 | from toolkit.visualization import draw_success_precision 11 | from toolkit.evaluation import OPEBenchmark 12 | from toolkit.datasets import * 13 | 14 | 15 | parser = argparse.ArgumentParser(description="Single Object Tracking Evaluation") 16 | parser.add_argument( 17 | "--datasetpath", 18 | default="./test_dataset/", 19 | type=str, 20 | help="dataset root directory", 21 | ) 22 | parser.add_argument("--dataset", default="UAV123", type=str, help="dataset name") 23 | parser.add_argument( 24 | "--tracker_result_dir", 25 | default="./results", 26 | type=str, 27 | help="tracker result root", 28 | ) 29 | parser.add_argument("--tracker_path", default="./results", type=str) 30 | parser.add_argument("--vis", default="", dest="vis", action="store_true") 31 | parser.add_argument( 32 | "--show_video_level", default=True, dest="show_video_level", action="store_true" 33 | ) 34 | parser.add_argument("--num", default=1, type=int, help="number of processes to eval") 35 | args = parser.parse_args() 36 | 37 | 38 | def main(): 39 | tracker_dir = os.path.join(args.tracker_path, args.dataset) 40 | trackers = os.listdir(tracker_dir) 41 | 42 | root = args.datasetpath + args.dataset 43 | 44 | 
assert len(trackers) > 0 45 | args.num = min(args.num, len(trackers)) 46 | 47 | if "UAV123_10fps" in args.dataset: 48 | dataset = UAV10Dataset(args.dataset, root) 49 | elif "UAV123_20L" in args.dataset: 50 | dataset = UAV20Dataset(args.dataset, root) 51 | elif "UAV123" in args.dataset: 52 | dataset = UAVDataset(args.dataset, root) 53 | elif "DTB70" in args.dataset: 54 | dataset = DTBDataset(args.dataset, root) 55 | elif "UAVDT" in args.dataset: 56 | dataset = UAVDTDataset(args.dataset, root) 57 | elif "VISDRONED" in args.dataset: 58 | dataset = VISDRONEDDataset(args.dataset, root) 59 | elif "UAVTrack112_L" in args.dataset: 60 | dataset = UAV112LDataset(args.dataset, root) 61 | elif "UAVTrack112" in args.dataset: 62 | dataset = UAV112Dataset(args.dataset, root) 63 | 64 | dataset.set_tracker(tracker_dir, trackers) 65 | benchmark = OPEBenchmark(dataset) 66 | success_ret = {} 67 | with Pool(processes=args.num) as pool: 68 | for ret in tqdm( 69 | pool.imap_unordered(benchmark.eval_success, trackers), 70 | desc="eval success", 71 | total=len(trackers), 72 | ncols=18, 73 | ): 74 | success_ret.update(ret) 75 | 76 | norm_precision_ret = {} 77 | with Pool(processes=args.num) as pool: 78 | for ret in tqdm( 79 | pool.imap_unordered(benchmark.eval_norm_precision, trackers), 80 | desc="eval norm precision", 81 | total=len(trackers), 82 | ncols=25, 83 | ): 84 | norm_precision_ret.update(ret) 85 | 86 | precision_ret = {} 87 | with Pool(processes=args.num) as pool: 88 | for ret in tqdm( 89 | pool.imap_unordered(benchmark.eval_precision, trackers), 90 | desc="eval precision", 91 | total=len(trackers), 92 | ncols=20, 93 | ): 94 | precision_ret.update(ret) 95 | 96 | benchmark.show_result( 97 | success_ret, 98 | precision_ret, 99 | norm_precision_ret, 100 | show_video_level=args.show_video_level, 101 | ) 102 | 103 | if args.vis: 104 | for attr, videos in dataset.attr.items(): 105 | draw_success_precision( 106 | success_ret, 107 | name=dataset.name, 108 | videos=videos, 109 | attr=attr, 110 | precision_ret=precision_ret, 111 | ) 112 | 113 | 114 | if __name__ == "__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /tools/snapshot/README.md: -------------------------------------------------------------------------------- 1 | You need to download the model from [Google Drive](https://drive.google.com/drive/folders/1WYQf_zAMy9Xf1tLH1MRmQELe5ywwsB5d?usp=drive_link) and put it in the `tools/snapshot` directory. 
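> Editor's note: for a quick local check, the OPE evaluation flow implemented in `tools/eval.py` above can be driven without the multiprocessing pool. The sketch below is an outline under stated assumptions, not part of the repository: it assumes the default `results/<dataset>/<tracker>/<video>.txt` layout produced by `tools/test.py`, uses UAVTrack112 as a placeholder dataset name, and reuses only calls that already appear in `tools/eval.py` (`set_tracker`, `OPEBenchmark.eval_success` / `eval_precision` / `eval_norm_precision`, `show_result`).

```python
# Minimal single-process sketch of the OPE evaluation flow from tools/eval.py.
# Paths and the dataset name below are placeholders; adjust to your setup.
import os

from toolkit.datasets import UAV112Dataset
from toolkit.evaluation import OPEBenchmark

dataset_name = "UAVTrack112"                        # placeholder dataset name
root = os.path.join("./test_dataset", dataset_name)
tracker_dir = os.path.join("./results", dataset_name)
trackers = os.listdir(tracker_dir)

dataset = UAV112Dataset(dataset_name, root)
dataset.set_tracker(tracker_dir, trackers)
benchmark = OPEBenchmark(dataset)

# Run the three OPE metrics sequentially instead of in a Pool.
success_ret, precision_ret, norm_precision_ret = {}, {}, {}
for tracker in trackers:
    success_ret.update(benchmark.eval_success(tracker))
    precision_ret.update(benchmark.eval_precision(tracker))
    norm_precision_ret.update(benchmark.eval_norm_precision(tracker))

benchmark.show_result(success_ret, precision_ret, norm_precision_ret,
                      show_video_level=False)
```

For full benchmark tables and attribute plots, `tools/eval.py` above remains the reference entry point; the sequential loop is only convenient when debugging a single tracker's result folder.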
2 | -------------------------------------------------------------------------------- /tools/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | 5 | sys.path.append("./") 6 | 7 | import cv2 8 | import torch 9 | import numpy as np 10 | 11 | from pysot.core.config import cfg 12 | from pysot.models.model_builder import ModelBuilder 13 | from pysot.tracker.prl_tracker import PRLTrack 14 | from pysot.utils.bbox import get_axis_aligned_bbox 15 | from pysot.utils.model_load import load_pretrain 16 | from toolkit.datasets import DatasetFactory 17 | 18 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser(description="PRL-Track tracking") 23 | parser.add_argument("--dataset", default="UAVTrack112", type=str, help="datasets"), 24 | parser.add_argument( 25 | "--dataset-root", default="./test_dataset", type=str, help="dataset root" 26 | ) 27 | parser.add_argument( 28 | "--config", default="./experiments/config.yaml", type=str, help="config file" 29 | ) 30 | parser.add_argument( 31 | "--snapshot", 32 | default="./tools/snapshot/best.pth", 33 | type=str, 34 | help="snapshot of models to eval", 35 | ) 36 | parser.add_argument("--video", default="", type=str, help="eval one special video") 37 | parser.add_argument( 38 | "--vis", default="", action="store_true", help="whether visualzie result" 39 | ) 40 | args = parser.parse_args() 41 | 42 | torch.set_num_threads(1) 43 | 44 | cfg.merge_from_file(args.config) 45 | 46 | dataset_root = os.path.join(args.dataset_root, args.dataset) 47 | model = ModelBuilder() 48 | model = load_pretrain(model, args.snapshot).cuda().eval() 49 | 50 | tracker = PRLTrack(model) 51 | 52 | dataset = DatasetFactory.create_dataset( 53 | name=args.dataset, dataset_root=dataset_root, load_img=False 54 | ) 55 | 56 | model_name = args.snapshot.split("/")[-1].split(".")[0] 57 | 58 | for v_idx, video in enumerate(dataset): 59 | if args.video != "": 60 | if video.name != args.video: 61 | continue 62 | 63 | toc = 0 64 | pred_bboxes = [] 65 | scores = [] 66 | track_times = [] 67 | for idx, (img, gt_bbox) in enumerate(video): 68 | tic = cv2.getTickCount() 69 | if idx == 0: 70 | cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) 71 | gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] 72 | tracker.init(img, gt_bbox_) 73 | pred_bbox = gt_bbox_ 74 | scores.append(None) 75 | if "VOT2018-LT" == args.dataset: 76 | pred_bboxes.append([1]) 77 | else: 78 | pred_bboxes.append(pred_bbox) 79 | else: 80 | outputs = tracker.track(img) 81 | pred_bbox = outputs["bbox"] 82 | pred_bboxes.append(pred_bbox) 83 | scores.append(outputs["best_score"]) 84 | toc += cv2.getTickCount() - tic 85 | track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency()) 86 | if idx == 0: 87 | cv2.destroyAllWindows() 88 | if args.vis: 89 | try: 90 | gt_bbox = list(map(int, gt_bbox)) 91 | except: 92 | gt_bbox = [0, 0, 0, 0] 93 | pred_bbox = list(map(int, pred_bbox)) 94 | cv2.rectangle( 95 | img, 96 | (gt_bbox[0], gt_bbox[1]), 97 | (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]), 98 | (0, 255, 0), 99 | 3, 100 | ) 101 | cv2.rectangle( 102 | img, 103 | (pred_bbox[0], pred_bbox[1]), 104 | (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]), 105 | (0, 255, 255), 106 | 3, 107 | ) 108 | cv2.putText( 109 | img, 110 | str(idx), 111 | (40, 40), 112 | cv2.FONT_HERSHEY_SIMPLEX, 113 | 1, 114 | (0, 255, 255), 115 | 2, 116 | ) 117 | cv2.imshow(video.name, img) 118 | cv2.waitKey(1) 
119 | toc /= cv2.getTickFrequency() 120 | 121 | model_path = os.path.join("results", args.dataset, model_name) 122 | if not os.path.isdir(model_path): 123 | os.makedirs(model_path) 124 | result_path = os.path.join(model_path, "{}.txt".format(video.name)) 125 | with open(result_path, "w") as f: 126 | for x in pred_bboxes: 127 | f.write(",".join([str(i) for i in x]) + "\n") 128 | print( 129 | "({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps".format( 130 | v_idx + 1, video.name, toc, idx / toc 131 | ) 132 | ) 133 | 134 | 135 | if __name__ == "__main__": 136 | main() 137 | --------------------------------------------------------------------------------
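> Editor's note: the result files written by `tools/test.py` above contain one comma-separated `x,y,w,h` box per frame. As a rough offline sanity check, a single sequence can be scored directly with the helpers from `toolkit/utils/statistics.py`. The sketch below assumes the ground-truth annotation file uses the same comma-separated `x,y,w,h` format (the usual convention for the UAV-style benchmarks used here, but worth verifying per dataset); the file paths and the 0–50 px precision thresholds are assumptions, not repository defaults.

```python
# Rough offline scoring sketch for one sequence, assuming both files contain
# one comma-separated "x,y,w,h" box per frame (verify the format per dataset).
import numpy as np

from toolkit.utils.statistics import success_overlap, success_error


def load_boxes(path):
    # Parse "x,y,w,h" per line into an (N, 4) float array.
    with open(path) as f:
        return np.array([list(map(float, line.strip().split(",")))
                         for line in f if line.strip()])


def box_centers(b):
    # Convert [x, y, w, h] boxes to center points, matching the (w - 1) / 2
    # convention used in tools/test.py.
    return b[:, :2] + (b[:, 2:] - 1) / 2


gt = load_boxes("test_dataset/UAVTrack112/anno/some_video.txt")   # placeholder path
pred = load_boxes("results/UAVTrack112/best/some_video.txt")      # placeholder path
n_frame = min(len(gt), len(pred))
gt, pred = gt[:n_frame], pred[:n_frame]

# Success: overlap-threshold curve over 21 thresholds; its mean is the AUC score.
success = success_overlap(gt, pred, n_frame)
print("AUC:", success.mean())

# Precision: center-location-error curve; 20 px is the conventional report point.
thresholds = np.arange(0, 51, 1)
precision = success_error(box_centers(gt), box_centers(pred), thresholds, n_frame)
print("Precision@20px:", precision[20])
```

This is only a spot check for one video; aggregate numbers reported in the paper come from the full `OPEBenchmark` pipeline invoked by `tools/eval.py`.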