├── .gitattributes ├── .gitignore ├── LICENSE ├── Metrics.py ├── README.md ├── Visualization ├── Vis_Input.py ├── Vis_Res.py └── __init__.py ├── __init__.py ├── assets └── demo.gif ├── configs ├── TubeTK_resnet_50_FPN_8frame_1stride.yaml ├── __init__.py ├── default.py ├── get_MOT17_tube.yaml └── get_jta_tube.yaml ├── dataset ├── Parsers │ ├── JTA.py │ ├── MOT17.py │ ├── __init__.py │ └── structures.py ├── __init__.py ├── augmentation.py ├── dataLoader.py ├── jta.py ├── mot17.py └── mot17jta.py ├── demo.py ├── evaluate.py ├── fetch_models.sh ├── install └── compile_local.sh ├── launch.py ├── main.py ├── network ├── __init__.py ├── focal_loss.py ├── fpn.py ├── resnet.py ├── track_head.py ├── tubetk.py └── utils.py ├── optim ├── __init__.py ├── lr_scheduler.py └── solver.py ├── post_processing ├── __init__.py ├── nms │ ├── __init__.py │ ├── setup.py │ └── src │ │ ├── nms_cpu.cpp │ │ ├── nms_cuda.cpp │ │ ├── nms_kernel.cu │ │ └── soft_nms_cpu.pyx ├── tube_iou_matching.py ├── tube_iou_matching_old.py ├── tube_iou_matching_super_old.py └── tube_nms.py ├── pre_processing ├── __init__.py ├── get_tubes_MOT17.py └── get_tubes_jta.py ├── requirements.txt ├── seqmaps ├── AVG-TownCentre.txt ├── JTA_train_turning ├── MOT15_test.txt ├── MOT17-01-FRCNN.txt ├── MOT17-02-FRCNN.txt ├── MOT17-04-FRCNN.txt ├── MOT17-13-FRCNN.txt ├── MOT17-14-FRCNN.txt ├── MOT17_test.txt ├── MOT17_train.txt └── PETS09-S2L2.txt └── utils ├── __init__.py ├── mem_track.py └── util.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Example user template template 3 | ### Example user template 4 | 5 | # IntelliJ project files 6 | .idea 7 | *.iml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Bo Pang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TubeTK
2 | 
3 | TubeTK is a one-step, end-to-end multi-object tracking method. It is the **first end-to-end** open-source system that achieves **60+ MOTA** on the MOT-16 (64 MOTA) and MOT-17 (63 MOTA) datasets.
4 | Our paper "[TubeTK: Adopting Tubes to Track Multi-Object in a One-Step Training Model](https://bopang1996.github.io/posts/2020/04/tubeTKpaper/)" was accepted as an oral paper at CVPR 2020.
5 | 
6 | 
7 | 
8 | # Contents
9 | 
10 | - [TubeTK](#TubeTK)
11 | - [Contents](#Contents)
12 | - [Results](#Results)
13 | - [MOT-16](#MOT-16)
14 | - [MOT-17](#MOT-17)
15 | - [Installation](#Installation)
16 | - [Quick Start](#Quick-Start)
17 | - [Demo](#Demo)
18 | - [Evaluation](#Evaluation-on-MOT-17-(16))
19 | - [Train](#Train-on-MOT-17-(16))
20 | 
21 | # Results
22 | 
23 | ![Demo Video](assets/demo.gif)
24 | 
25 | 
26 | ## MOT-16
27 | 
28 | Results on the MOT-16 dataset:
29 | 
30 | | Video | MOTA | IDF1 | MT | ML | FP | FN | IDS |
31 | | ------------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
32 | | MOT16-01 | 48.9 | 45.5 | 8 | 9 | 175 | 3052 | 40 |
33 | | MOT16-03 | 76.3 | 69.5 | 86 | 12 | 3741 | 20828 | 177 |
34 | | MOT16-06 | 51.2 | 55.7 | 87 | 39 | 1863 | 3542 | 231 |
35 | | MOT16-07 | 55.0 | 43.5 | 21 | 3 | 2225 | 4938 | 190 |
36 | | MOT16-08 | 46.9 | 37.3 | 18 | 3 | 1694 | 6952 | 234 |
37 | | MOT16-12 | 52.4 | 50.8 | 27 | 20 | 533 | 3366 | 51 |
38 | | MOT16-14 | 35.8 | 39.8 | 7 | 61 | 731 | 10948 | 194 |
39 | | TubeTK (Mean) | 64.0 | 59.4 | 33.5 | 19.4 | 10962 | 53626 | 1117 |
40 | | RAN | 63.0 | 63.8 | 39.9 | 22.1 | 13663 | 53248 | 482 |
41 | | Tracktor | 54.5 | 52.5 | 19.0 | 36.9 | 3280 | 79149 | 682 |
42 | 
43 | 
44 | 
45 | ## MOT-17
46 | Results on the MOT-17 dataset:
47 | 
48 | | Video | MOTA | IDF1 | MT | ML | FP | FN | IDS |
49 | | ------------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
50 | | MOT17-01 | 47.9 | 44.9 | 6 | 10 | 167 | 3154 | 41 |
51 | | MOT17-03 | 76.4 | 69.6 | 81 | 12 | 3181 | 21287 | 186 |
52 | | MOT17-06 | 52.4 | 54.8 | 85 | 36 | 1609 | 3699 | 307 |
53 | | MOT17-07 | 55.4 | 43.3 | 21 | 2 | 1944 | 5371 | 222 |
54 | | MOT17-08 | 42.3 | 34.1 | 18 | 12 | 970 | 10889 | 319 |
55 | | MOT17-12 | 50.3 | 49.4 | 28 | 23 | 494 | 3749 | 63 |
56 | | MOT17-14 | 35.6 | 39.5 | 6 | 61 | 655 | 11012 | 241 |
57 | | TubeTK (Mean) | 63.0 | 58.6 | 31.2 | 19.9 | 27060 | 177483 | 4137 |
58 | | SCNet | 60.0 | 54.4 | 34.4 | 16.2 | 72230 | 145851 | 7611 |
59 | | Tracktor | 53.5 | 52.3 | 19.5 | 36.3 | 12201 | 248047 | 2072 |
60 | 
61 | 
62 | 
63 | # Installation
64 | 
65 | 1. Get the code and build the related modules:
66 | 
67 | ```Shell
68 | git clone https://github.com/BoPang1996/TubeTK.git
69 | cd TubeTK/install
70 | ./compile_local.sh
71 | # if something goes wrong, try:
72 | # sudo ldconfig /lib64
73 | cd ..
74 | ```
75 | 
76 | 2. Install [PyTorch 1.10](https://pytorch.org/) and the other dependencies:
77 | 
78 | ```Shell
79 | pip install -r requirements.txt
80 | ```
81 | 
82 | 
83 | 3. If your GPU has less than 16 GB of memory, you need [NVIDIA APEX](https://github.com/nvidia/apex) for mixed-precision training.
84 | 
85 | 1. Install Apex:
86 | 
87 | ```Shell
88 | git clone https://github.com/NVIDIA/apex
89 | cd apex
90 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
91 | # if the above pip install fails, try:
92 | # pip install -v --no-cache-dir ./
93 | ```
94 | 
95 | 2. 
We provide the `--apex` option to train with APEX; see [Quick Start](#quick-start) for details.
96 | 
97 | 4. Run `fetch_models.sh` to download our pre-trained models, or download them manually and put them in `./models`:
98 | 
99 | 
100 | 1. 3DResNet50_original ([Baidu pan](https://pan.baidu.com/s/13GHBQlpugHmhMDG9pQ0_Sw) | [Google drive](https://drive.google.com/open?id=1jLgyNmiZ_c-m8Cw3NcZTEPTf6VESfIzK))
101 | 
102 | 
103 | 
104 | 
105 | # Quick Start
106 | 
107 | ## Demo
108 | 
109 | Run TubeTK on a video and visualize the results with:
110 | 
111 | ```Shell
112 | python launch.py --nproc_per --training_script demo.py --batch_size=3 --config configs/TubeTK_resnet_50_FPN_8frame_1stride.yaml --video_url --output_dir ./vis_video
113 | ```
114 | 
115 | 
116 | 
117 | ## Evaluation on MOT-17 (16)
118 | 
119 | 1. Download the data from [MOT Challenge](https://motchallenge.net/data/MOT17/) and put or link it to `./data`.
120 | 
121 | 2. Get the tracking results with (a short loading sketch for the result files follows this README):
122 | 
123 | ```Shell
124 | python launch.py --nproc_per --training_script evaluate.py --batch_size 3 --config configs/TubeTK_resnet_50_FPN_8frame_1stride.yaml --trainOrTest test
125 | ```
126 | 
127 | 3. Get the visualizations with:
128 | 
129 | ```Shell
130 | python Visualization/Vis_Res.py --mode test
131 | ```
132 | 
133 | The visualization videos are stored in `./vis_video`.
134 | 
135 | 
136 | 
137 | ## Train on MOT-17 (16)
138 | 
139 | 1. Download the data from [MOT Challenge](https://motchallenge.net/data/MOT17/) and put or link it to `./data`.
140 | 
141 | 2. Get the ground-truth Btubes with:
142 | 
143 | ```Shell
144 | python ./pre_processing/get_tubes_MOT17.py
145 | ```
146 | 
147 | 3. Train the model with:
148 | 
149 | ```Shell
150 | python launch.py --nproc_per --training_script main.py --batch_size 1 --config ./configs/TubeTK_resnet_50_FPN_8frame_1stride.yaml
151 | ```
152 | 
153 | If you run out of memory, try:
154 | 
155 | ```Shell
156 | python launch.py --nproc_per --training_script main.py --batch_size 1 --config ./configs/TubeTK_resnet_50_FPN_8frame_1stride.yaml --apex
157 | ```
158 | 
159 | If you still run out of memory, reduce `tube_limit` in the configuration file `TubeTK_resnet_50_FPN_8frame_1stride.yaml`:
160 | 
161 | ```
162 | tube_limit: 500 # or 300
163 | ```
164 | 
165 | ## Citation
166 | 
167 | ```
168 | @inproceedings{pang2020tubetk,
169 |   title={TubeTK: Adopting Tubes to Track Multi-Object in a One-Step Training Model},
170 |   author={Pang, Bo and Li, Yizhuo and Zhang, Yifan and Li, Muchen and Lu, Cewu},
171 |   booktitle={CVPR},
172 |   year={2020}
173 | }
174 | ```
175 | 
176 | ## License
177 | 
178 | TubeTK is freely available for non-commercial use and may be redistributed under these conditions.
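
The evaluation and demo steps above write one MOT-style `.txt` result file per sequence (under `<output_dir>/res/` in `evaluate.py`), which `Visualization/Vis_Res.py` later turns into videos. The snippet below is a minimal loading sketch, not part of the repository; the column layout (frame, id, x, y, w, h, then score columns) and the example path are assumptions inferred from how `Vis_Res.py` parses these files.

```Python
import numpy as np
import pandas as pd

# Hypothetical result file produced by evaluate.py; the real name depends on the sequence.
res = np.loadtxt('link_res/res/MOT17-01-FRCNN.txt', delimiter=',')
res[:, 4:6] += res[:, 2:4]                           # (x, y, w, h) -> (x1, y1, x2, y2)
res = pd.DataFrame(res).replace([np.inf, -np.inf], np.nan).dropna()
for frame, boxes in res.groupby(0):                  # column 0 is the frame index
    print(int(frame), boxes[[1, 2, 3, 4, 5]].values) # id, x1, y1, x2, y2 per detection
```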
179 | -------------------------------------------------------------------------------- /Visualization/Vis_Input.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dataset.augmentation as argument 3 | import random 4 | import cv2 5 | import torch 6 | 7 | 8 | class VisArgumentation(object): 9 | def __init__(self, size=896, mean=(104, 117, 123)): 10 | self.mean = mean 11 | self.size = size 12 | self.augment = argument.Compose([ 13 | argument.AddMeans(self.mean), 14 | argument.ToCV2(), 15 | argument.ToAbsCoords() 16 | ]) 17 | 18 | def __call__(self, images, img_meta, tubes, labels, start_frame): 19 | return self.augment(images, img_meta, tubes, labels, start_frame) 20 | 21 | 22 | def get_color(): 23 | colors = [[i for i in range(0, 250, 20)], 24 | [i for i in range(0, 250, 20)], 25 | [i for i in range(0, 250, 20)]] 26 | for i in range(3): 27 | random.shuffle(colors[i]) 28 | colors = list(zip(*colors)) 29 | return colors 30 | 31 | 32 | def get_inter_box(start_box, end_box, inter_id, end_id): 33 | if end_id == 0: 34 | return start_box 35 | return start_box * (end_id - inter_id) / end_id + end_box * inter_id / end_id 36 | 37 | 38 | def tubes2bbox(tubes, colors, tube_len, stride): 39 | bboxes = [[] for _ in range(tube_len)] 40 | for i, tube in enumerate(tubes): 41 | color = colors[i % len(colors)] 42 | mid_frame = tube[4] / stride 43 | back_frame = (tube[4] - tube[10]) / stride 44 | front_frame = (tube[4] + tube[5]) / stride 45 | 46 | mid_bbox = tube[0:4] 47 | back_bbox = tube[11: 15] + mid_bbox 48 | front_bbox = tube[6: 10] + mid_bbox 49 | 50 | for f in range(int(back_frame), int(mid_frame)): 51 | bboxes[f].append([get_inter_box(back_bbox, mid_bbox, f - back_frame, mid_frame - back_frame), color]) 52 | for f in range(int(mid_frame), int(front_frame + 1)): 53 | bboxes[f].append([get_inter_box(mid_bbox, front_bbox, f - mid_frame, front_frame - mid_frame), color]) 54 | # bboxes[int(front_frame)].append([front_bbox, color]) 55 | return bboxes 56 | 57 | 58 | def vis_input(imgs, img_metas, gt_bboxes, gt_labels, start_frame, stride, out_folder): 59 | imgs_v, img_metas_v, gt_bboxes_v, gt_labels_v, start_frame_v = \ 60 | VisArgumentation()(imgs[0], img_metas[0], gt_bboxes[0], gt_labels[0], start_frame[0]) 61 | 62 | for f in range(len(imgs_v)): 63 | imgs_c = imgs_v.copy() 64 | f_tubes = [] 65 | for tube in gt_bboxes_v: 66 | if round(tube[4]) == f * stride: 67 | f_tubes.append(tube) 68 | bboxes = tubes2bbox(f_tubes, get_color(), len(imgs_v), stride=stride) 69 | for i in range(len(imgs_c)): 70 | f_bboxes = bboxes[i] 71 | for bbox in f_bboxes: 72 | cv2.rectangle(imgs_c[i], tuple(bbox[0][0:2]), tuple(bbox[0][2:4]), bbox[1], 1) 73 | cv2.imwrite(os.path.join(out_folder, str(i) + '.jpg'), imgs_c[i]) 74 | 75 | 76 | def tubes2bbox_out(tubes, colors, tube_len, stride): 77 | 78 | bboxes = [[] for _ in range(tube_len)] 79 | for i, tube in enumerate(tubes): 80 | color = colors[i % len(colors)] 81 | mid_frame = tube[0] / stride 82 | back_frame = tube[10] / stride 83 | front_frame = tube[5] / stride 84 | 85 | mid_bbox = tube[1:5] 86 | back_bbox = tube[11: 15] 87 | front_bbox = tube[6: 10] 88 | 89 | for f in range(int(back_frame), int(mid_frame)): 90 | bboxes[f].append([get_inter_box(back_bbox, mid_bbox, f - back_frame, mid_frame - back_frame), color]) 91 | for f in range(int(mid_frame), int(front_frame + 1)): 92 | bboxes[f].append([get_inter_box(mid_bbox, front_bbox, f - mid_frame, front_frame - mid_frame), color]) 93 | return bboxes 94 | 95 | 96 | def 
vis_output(imgs, img_metas, gt_bboxes, stride, out_folder): 97 | no_use = torch.tensor(gt_bboxes) 98 | imgs, img_metas, _, gt_labels, start_frame = \ 99 | VisArgumentation()(imgs, img_metas, no_use, torch.tensor(1), torch.tensor(1)) 100 | 101 | gt_bboxes[:, [1, 3, 6, 8, 11, 13]] /= img_metas['img_shape'][2] / img_metas['pad_percent'][0] / img_metas['value_range'] / 1024 102 | gt_bboxes[:, [2, 4, 7, 9, 12, 14]] /= img_metas['img_shape'][1] / img_metas['pad_percent'][1] / img_metas['value_range'] / 768 103 | gt_bboxes = gt_bboxes.data.numpy() 104 | for f in range(len(imgs)): 105 | imgs_c = imgs.copy() 106 | f_tubes = [] 107 | for tube in gt_bboxes: 108 | if round(tube[0]) == f * stride: 109 | f_tubes.append(tube) 110 | bboxes = tubes2bbox_out(f_tubes, get_color(), len(imgs), stride=stride) 111 | write_folder = os.path.join(out_folder, img_metas['video_name'], str(img_metas['start_frame']), str(f)) 112 | os.makedirs(write_folder, exist_ok=True) 113 | for i in range(len(imgs_c)): 114 | f_bboxes = bboxes[i] 115 | for bbox in f_bboxes: 116 | cv2.rectangle(imgs_c[i], tuple(bbox[0][0:2]), tuple(bbox[0][2:4]), bbox[1], 1) 117 | cv2.imwrite(os.path.join(write_folder, str(i) + '.jpg'), imgs_c[i]) 118 | -------------------------------------------------------------------------------- /Visualization/Vis_Res.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os 3 | import argparse 4 | import configparser 5 | import numpy as np 6 | import pandas as pd 7 | from tqdm import tqdm 8 | from natsort import natsorted 9 | try: 10 | import xml.etree.cElementTree as ET 11 | except ImportError: 12 | import xml.etree.ElementTree as ET 13 | 14 | 15 | def get_seq_info_from_file_mot(seqName, dataDir): 16 | seqFolder = os.path.join(dataDir, seqName) 17 | seqInfoFile = os.path.join(dataDir, seqName, 'seqinfo.ini') 18 | config = configparser.ConfigParser() 19 | config.read(seqInfoFile) 20 | 21 | imgFolder = config.get('Sequence', 'imDir') 22 | frameRate = config.getfloat('Sequence', 'frameRate') 23 | F = config.getint('Sequence', 'seqLength') 24 | imWidth = config.getint('Sequence', 'imWidth') 25 | imHeight = config.getint('Sequence', 'imHeight') 26 | imgExt = config.get('Sequence', 'imExt') 27 | 28 | return imgFolder, frameRate, imWidth, imHeight 29 | 30 | 31 | def vis_one_video(res_file, frame_rate, img_width, img_height, img_dir, output_name): 32 | 33 | try: 34 | res = np.loadtxt(res_file, delimiter=',') 35 | except: 36 | res = np.loadtxt(res_file, delimiter=' ') 37 | res[:, 4:6] += res[:, 2:4] 38 | res = pd.DataFrame(res) 39 | res = res.replace([np.inf, -np.inf], np.nan) 40 | res = res.dropna() 41 | 42 | res_group = res.groupby(0) 43 | 44 | vid_writer = cv2.VideoWriter(output_name, 45 | cv2.VideoWriter_fourcc(*"MJPG"), frame_rate, (img_width, img_height)) 46 | 47 | img_names = natsorted(os.listdir(img_dir)) 48 | 49 | color_dict = {} 50 | for i, img_name in tqdm(enumerate(img_names), ncols=20): 51 | img = cv2.imread(os.path.join(img_dir, img_name)) 52 | frame = int(os.path.splitext(img_name)[0]) 53 | if frame not in res_group.groups.keys(): 54 | vid_writer.write(img) 55 | continue 56 | bboxes = res_group.get_group(frame).values 57 | for bbox in bboxes: 58 | if bbox[1] in color_dict: 59 | color = color_dict[bbox[1]] 60 | else: 61 | color = np.round(np.random.rand(3) * 255) 62 | color_dict[bbox[1]] = color 63 | cv2.rectangle(img, tuple(bbox[4:6].astype(int)), tuple(bbox[2:4].astype(int)), color=color, thickness=3) 64 | cv2.putText(img, str(bbox[6]) + ' ' + 
str(bbox[7])[0:5], 65 | tuple(bbox[2:4].astype(int)), cv2.FONT_HERSHEY_COMPLEX, 1, color, 2) 66 | vid_writer.write(img) 67 | vid_writer.release() 68 | 69 | 70 | def vis_res(args): 71 | try: 72 | os.makedirs(args.output_dir) 73 | except: 74 | pass 75 | 76 | if args.video_list is not None: 77 | video_list = open(args.video_list).readlines() 78 | video_list = [x.strip() for x in video_list] 79 | else: 80 | video_list = os.listdir(args.res_dir) 81 | video_list = [x for x in video_list if x.endswith('txt')] 82 | video_list = [os.path.splitext(x)[0] for x in video_list] 83 | video_list = [x for x in video_list if os.path.exists(os.path.join(args.data_dir, args.mode, x))] 84 | 85 | for vid in video_list: 86 | print('Processing {}'.format(vid)) 87 | res_file = os.path.join(args.res_dir, vid + '.txt') 88 | if not os.path.exists(res_file): 89 | res_file = os.path.join(args.res_dir, vid, 'gt.txt') 90 | 91 | if 'JTA' not in args.output_dir: 92 | img_folder, frame_rate, img_width, img_height = get_seq_info_from_file_mot(vid, os.path.join(args.data_dir, 93 | args.mode)) 94 | else: 95 | img_folder = '' 96 | frame_rate = 30 97 | img_width = 1920 98 | img_height = 1080 99 | 100 | img_dir = os.path.join(args.data_dir, args.mode, vid, img_folder) 101 | vis_one_video(res_file, frame_rate, img_width, img_height, img_dir, os.path.join(args.output_dir, vid + '.avi')) 102 | 103 | 104 | if __name__ == '__main__': 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument('--res_dir', default='./link_res', type=str, help="path of the predicted tracks (saved as .txt)") 107 | parser.add_argument('--output_dir', default='./vis_video', type=str, help='where to save the output video') 108 | parser.add_argument('--data_dir', default='../data', type=str, help="input dataset path") 109 | parser.add_argument('--mode', default='test', type=str, help='vis the train or test set') 110 | parser.add_argument('--video_list', default='./seqmaps/MOT17_test.txt', type=str, 111 | help='List for videos to visualize, None for all in res_dir') 112 | args = parser.parse_args() 113 | vis_res(args) 114 | -------------------------------------------------------------------------------- /Visualization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoPang1996/TubeTK/bcca334c5348f9ae33e04595e1af93cf8351e50e/Visualization/__init__.py -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoPang1996/TubeTK/bcca334c5348f9ae33e04595e1af93cf8351e50e/__init__.py -------------------------------------------------------------------------------- /assets/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoPang1996/TubeTK/bcca334c5348f9ae33e04595e1af93cf8351e50e/assets/demo.gif -------------------------------------------------------------------------------- /configs/TubeTK_resnet_50_FPN_8frame_1stride.yaml: -------------------------------------------------------------------------------- 1 | min_visibility: -0.1 2 | forward_frames: 4 3 | frame_stride: 1 4 | # pretrain_model_path: '/home/pb/models/3DTracking/3DFCOS_JTA_iou_withoutRegLoss' 5 | pretrain_model_path: '' 6 | backbone: 'res50' 7 | img_size: [896, 1152] 8 | tube_limit: 700 # if out of memory, you can reduce it, but the performance may be effected 
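
The YAML above overrides only a subset of the defaults defined in `configs/default.py`; every key must already exist there, otherwise `cfg_from_file` raises a `KeyError`. A minimal sketch (not part of the repository) of how the merge behaves:

```Python
# Load the defaults and merge this experiment config on top of them.
# cfg_from_file mutates the shared cfg object in place.
from configs.default import cfg, cfg_from_file

cfg_from_file('configs/TubeTK_resnet_50_FPN_8frame_1stride.yaml')
print(cfg.tube_limit)   # 700 here, overriding the default of 1000
print(cfg.img_size)     # [896, 1152]
print(cfg.reg_loss)     # 'giou' (unchanged default; the YAML does not set it)
```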
-------------------------------------------------------------------------------- /configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoPang1996/TubeTK/bcca334c5348f9ae33e04595e1af93cf8351e50e/configs/__init__.py -------------------------------------------------------------------------------- /configs/default.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import numpy as np 4 | from easydict import EasyDict as edict 5 | INF = 1e8 6 | 7 | __C = edict() 8 | cfg = __C 9 | 10 | # for generating the tubes 11 | __C.min_visibility = -0.1 12 | __C.tube_thre = 0.8 13 | __C.forward_frames = 4 14 | __C.frame_stride = 1 15 | __C.value_range = 1 16 | __C.img_size = [896, 1152] 17 | 18 | # pretrain 19 | __C.pretrain_model_path = '' 20 | 21 | # for ResNet 22 | __C.freeze_stages = -1 23 | __C.backbone = 'res50' 24 | 25 | # for FPN 26 | __C.fpn_features_n = 256 27 | __C.fpn_outs_n = 5 28 | 29 | # for FCOS head 30 | __C.tube_points = 14 31 | __C.heads_features_n = 256 32 | __C.heads_layers_n = 4 33 | __C.withoutThickCenterness = False 34 | __C.model_stride = [[2, 8], 35 | [4, 16], 36 | [8, 32], 37 | [8, 64], 38 | [8, 128]] 39 | __C.regress_range = ([(-1, 0.25), (-1, 0.0714)], 40 | [(0.25, 0.5), (0.0714, 0.1428)], 41 | [(0.5, 0.75), (0.1428, 0.2857)], 42 | [(0.75, INF), (0.2857, 0.5714)], 43 | [(0.75, INF), (0.5714, INF)]) 44 | 45 | 46 | # for loss 47 | __C.reg_loss = 'giou' 48 | __C.tube_limit = 1000 49 | __C.test_nms_pre = 1000 50 | __C.test_nms_max_per_img = 500 51 | __C.test_nms_score_thre = 0.5 52 | __C.test_nms_iou_thre = 0.5 53 | __C.linking_min_iou = 0.4 54 | __C.cos_weight = 0.2 55 | 56 | 57 | def _merge_a_into_b(a, b): 58 | """Merge config dictionary a into config dictionary b, clobbering the 59 | options in b whenever they are also specified in a. 60 | """ 61 | if type(a) is not edict: 62 | return 63 | 64 | for k, v in a.items(): 65 | # a must specify keys that are in b 66 | if k not in b: 67 | raise KeyError('{} is not a valid config key'.format(k)) 68 | 69 | # the types must match, too 70 | old_type = type(b[k]) 71 | if old_type is not type(v): 72 | if isinstance(b[k], np.ndarray): 73 | v = np.array(v, dtype=b[k].dtype) 74 | else: 75 | raise ValueError(('Type mismatch ({} vs. 
{}) ' 76 | 'for config key: {}').format(type(b[k]), 77 | type(v), k)) 78 | 79 | # recursively merge dicts 80 | if type(v) is edict: 81 | try: 82 | _merge_a_into_b(a[k], b[k]) 83 | except: 84 | print('Error under config key: {}'.format(k)) 85 | raise 86 | else: 87 | b[k] = v 88 | 89 | 90 | def cfg_from_file(filename): 91 | """Load a config file and merge it into the default options.""" 92 | import yaml 93 | with open(filename, 'r') as f: 94 | yaml_cfg = edict(yaml.load(f)) 95 | 96 | _merge_a_into_b(yaml_cfg, __C) 97 | -------------------------------------------------------------------------------- /configs/get_MOT17_tube.yaml: -------------------------------------------------------------------------------- 1 | min_visibility: -0.1 2 | tube_thre: 0.8 3 | forward_frames: 4 4 | frame_stride: 1 5 | value_range: 1 -------------------------------------------------------------------------------- /configs/get_jta_tube.yaml: -------------------------------------------------------------------------------- 1 | min_visibility: 0.9 2 | tube_thre: 0.8 3 | forward_frames: 4 4 | frame_stride: 1 5 | value_range: 1 -------------------------------------------------------------------------------- /dataset/Parsers/JTA.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import cv2 4 | import numpy as np 5 | import random 6 | import pickle 7 | from PIL import Image, ImageFile 8 | 9 | ImageFile.LOAD_TRUNCATED_IMAGES = True 10 | 11 | 12 | class GTSingleParser_JTA: 13 | def __init__(self, folder, 14 | forward_frames=8, 15 | frame_stride=1, 16 | min_vis=-0.1, 17 | value_range=1, 18 | type='train'): 19 | self.type = type 20 | self.frame_stride = frame_stride 21 | self.value_range = value_range 22 | self.img_folder = folder 23 | 24 | self.forward_frames = forward_frames 25 | self.max_frame_index = len(os.listdir(os.path.join(self.img_folder))) - ( 26 | self.forward_frames * 2 - 1) * self.frame_stride 27 | 28 | split_path = folder.split('/') 29 | if folder[0] == '/': 30 | jta_root = '/' + os.path.join(*split_path[:-3]) 31 | else: 32 | jta_root = os.path.join(*split_path[:-3]) 33 | type = split_path[-2] 34 | video_name = split_path[-1] 35 | 36 | self.min_vis = min_vis 37 | self.jta_root = jta_root 38 | self.video_name = video_name 39 | if frame_stride != -1: 40 | tube_path = os.path.join(jta_root, 41 | 'tubes_' + str(forward_frames) + '_' + str(frame_stride) + '_' + str(min_vis), 42 | type, video_name) 43 | self.tube_folder = tube_path 44 | if 's3:' in self.tube_folder: 45 | self.tube_folder = self.tube_folder[:3] + '/' + self.tube_folder[3:] 46 | 47 | if type == 'train': 48 | assert os.path.exists(os.path.join(self.tube_folder)), 'Tube folder does not exist: ' + str(os.path.join(self.tube_folder)) 49 | 50 | def _getimage(self, frame_index): 51 | image_file = os.path.join(self.img_folder, '{}.jpg'.format(frame_index + 1)) 52 | # return cv2.imread(image_file) 53 | for i in range(10): 54 | try: 55 | assert os.path.exists(image_file), 'Image does not exist: {}'.format(image_file) 56 | img = cv2.cvtColor(np.asarray(Image.open(image_file).convert("RGB")), cv2.COLOR_RGB2BGR) 57 | break 58 | except: 59 | print('READ IMAGE ERROR: ' + str(image_file)) 60 | print("IMAGE EXIST: " + str(os.path.exists(image_file))) 61 | return img 62 | 63 | def get_item(self, frame_index): 64 | if self.frame_stride == -1: 65 | strides = [1, 2, 4] 66 | frame_stride = strides[random.randint(0, 2)] 67 | tube_path = os.path.join(self.jta_root, 68 | 'tubes_' + str(self.forward_frames) 
+ '_' + str(frame_stride) + '_' + str(self.min_vis), 69 | self.type, self.video_name) 70 | self.tube_folder = tube_path 71 | if 's3:' in self.tube_folder: 72 | self.tube_folder = self.tube_folder[:3] + '/' + self.tube_folder[3:] 73 | if self.type == 'train': 74 | assert os.path.exists(os.path.join(self.tube_folder)), 'Tube folder does not exist: ' + str(os.path.join(self.tube_folder)) 75 | else: 76 | frame_stride = self.frame_stride 77 | 78 | start_frame = frame_index 79 | max_len = self.forward_frames * 2 * frame_stride 80 | tube_file = os.path.join(self.tube_folder, str(start_frame)) 81 | if self.type == 'train': 82 | if not os.path.exists(tube_file): 83 | print(tube_file) 84 | return None, None, None, None, None 85 | 86 | # get image meta 87 | img_meta = {} 88 | image = self._getimage(frame_index) 89 | if image is None: 90 | print(os.path.join(self.img_folder, 'img1/{}.jpg'.format(frame_index + 1))) 91 | img_meta['img_shape'] = [max_len, image.shape[0], image.shape[1]] 92 | img_meta['value_range'] = self.value_range 93 | img_meta['pad_percent'] = [1, 1] # prepared for padding 94 | img_meta['video_name'] = os.path.basename(self.img_folder) 95 | img_meta['start_frame'] = start_frame 96 | 97 | # get image 98 | imgs = [] 99 | for i in range(self.forward_frames * 2): 100 | frame_index = start_frame + i * frame_stride 101 | image = self._getimage(frame_index) # h, w, c 102 | imgs.append(image) 103 | 104 | # get_tube 105 | tubes = np.zeros((1, 15)) 106 | if self.type == 'train': 107 | tubes = pickle.load(open(tube_file, 'rb')) 108 | 109 | num_dets = len(tubes) 110 | labels = np.ones((num_dets, 1)) # only human class 111 | 112 | tubes = np.array(tubes) 113 | imgs = np.array(imgs) 114 | 115 | return imgs, img_meta, tubes, labels, start_frame 116 | 117 | def __len__(self): 118 | return self.max_frame_index 119 | 120 | 121 | class GTParser_JTA: 122 | def __init__(self, jta_root, 123 | type='train', 124 | forward_frames=4, 125 | frame_stride=1, 126 | min_vis=-0.1, 127 | value_range=1): 128 | # analsis all the folder in mot_root 129 | # 1. get all the folders 130 | jta_root = os.path.join(jta_root, 'imgs', type) 131 | all_folders = sorted( 132 | [os.path.join(jta_root, i) for i in os.listdir(jta_root) 133 | if os.path.isdir(os.path.join(jta_root, i)) 134 | ] 135 | ) 136 | # 2. create single parser 137 | self.parsers = [GTSingleParser_JTA(folder, forward_frames=forward_frames, frame_stride=frame_stride, 138 | min_vis=min_vis, value_range=value_range, type=type) for folder in all_folders] 139 | 140 | # 3. get some basic information 141 | self.lens = [len(p) for p in self.parsers] 142 | self.len = sum(self.lens) 143 | 144 | def __len__(self): 145 | # get the length of all the matching frame 146 | return self.len 147 | 148 | def __getitem__(self, item): 149 | # 1. find the parser 150 | total_len = 0 151 | index = 0 152 | current_item = item 153 | for l in self.lens: 154 | total_len += l 155 | if item < total_len: 156 | break 157 | else: 158 | index += 1 159 | current_item -= l 160 | 161 | # 2. 
get items 162 | if index >= len(self.parsers): 163 | return None, None, None, None, None 164 | return self.parsers[index].get_item(current_item) 165 | 166 | -------------------------------------------------------------------------------- /dataset/Parsers/MOT17.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import cv2 4 | import numpy as np 5 | import random 6 | import pickle 7 | from PIL import Image, ImageFile 8 | 9 | ImageFile.LOAD_TRUNCATED_IMAGES = True 10 | 11 | 12 | class GTSingleParser_MOT_17: 13 | def __init__(self, folder, 14 | forward_frames=8, 15 | frame_stride=1, 16 | min_vis=-0.1, 17 | value_range=1, 18 | type='train'): 19 | self.type = type 20 | self.frame_stride = frame_stride 21 | self.value_range = value_range 22 | self.folder = folder 23 | 24 | self.min_vis = min_vis 25 | self.forward_frames = forward_frames 26 | self.max_frame_index = len(os.listdir(os.path.join(self.folder, 'img1'))) - ( 27 | self.forward_frames * 2 - 1) * self.frame_stride 28 | 29 | if frame_stride != -1: 30 | self.tube_folder = 'tubes_' + str(forward_frames) + '_' + str(frame_stride) + '_' + str(min_vis) 31 | 32 | if type == 'train': 33 | assert os.path.exists(os.path.join(self.folder, self.tube_folder)), 'Tube folder does not exist: ' + str(os.path.join(self.folder, self.tube_folder)) 34 | 35 | def _getimage(self, frame_index): 36 | image_file = os.path.join(self.folder, 'img1/{0:06}.jpg'.format(frame_index + 1)) 37 | # return cv2.imread(image_file) 38 | for i in range(10): 39 | try: 40 | assert os.path.exists(image_file), 'Image does not exist: {}'.format(image_file) 41 | img = cv2.cvtColor(np.asarray(Image.open(image_file).convert("RGB")), cv2.COLOR_RGB2BGR) 42 | break 43 | except: 44 | print('READ IMAGE ERROR: ' + str(image_file)) 45 | print("IMAGE EXIST: " + str(os.path.exists(image_file))) 46 | return img 47 | 48 | def get_item(self, frame_index): 49 | if self.frame_stride == -1: 50 | strides = [1,2,4] 51 | frame_stride = strides[random.randint(0, 2)] 52 | self.tube_folder = 'tubes_' + str(self.forward_frames) + '_' + str(frame_stride) + '_' + str(self.min_vis) 53 | if type == 'train': 54 | assert os.path.exists( 55 | os.path.join(self.folder, self.tube_folder)), 'Tube folder does not exist: ' + str( 56 | os.path.join(self.folder, self.tube_folder)) 57 | else: 58 | frame_stride = self.frame_stride 59 | 60 | start_frame = frame_index 61 | max_len = self.forward_frames * 2 * frame_stride 62 | tube_file = os.path.join(self.folder, self.tube_folder, str(start_frame)) 63 | if self.type == 'train': 64 | if not os.path.exists(tube_file): 65 | print(tube_file) 66 | return None, None, None, None, None 67 | 68 | # get image meta 69 | img_meta = {} 70 | image = self._getimage(frame_index) 71 | if image is None: 72 | print(os.path.join(self.folder, 'img1/{0:06}.jpg'.format(frame_index + 1))) 73 | img_meta['img_shape'] = [max_len, image.shape[0], image.shape[1]] 74 | img_meta['value_range'] = self.value_range 75 | img_meta['pad_percent'] = [1, 1] # prepared for padding 76 | img_meta['video_name'] = os.path.basename(self.folder) 77 | img_meta['start_frame'] = start_frame 78 | 79 | # get image 80 | imgs = [] 81 | for i in range(self.forward_frames * 2): 82 | frame_index = start_frame + i * frame_stride 83 | image = self._getimage(frame_index) # h, w, c 84 | imgs.append(image) 85 | 86 | # get_tube 87 | tubes = np.zeros((1, 15)) 88 | if self.type == 'train': 89 | tubes = pickle.load(open(tube_file, 'rb')) 90 | 91 | num_dets = len(tubes) 
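        # NOTE: the tube layout below is inferred from how Visualization/Vis_Input.py
        # (tubes2bbox) decodes these ground-truth tubes; it is not documented in this file.
        # Each tube is a 15-value row, roughly:
        #   tube[0:4]   box (x1, y1, x2, y2) on the middle frame of the clip
        #   tube[4]     middle-frame index
        #   tube[5]     forward extent in frames (middle -> front face of the tube)
        #   tube[6:10]  front-face box, stored as offsets from the middle box
        #   tube[10]    backward extent in frames (middle -> back face)
        #   tube[11:15] back-face box, stored as offsets from the middle box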
92 | labels = np.ones((num_dets, 1)) # only human class 93 | 94 | tubes = np.array(tubes) 95 | imgs = np.array(imgs) 96 | 97 | return imgs, img_meta, tubes, labels, start_frame 98 | 99 | def __len__(self): 100 | return self.max_frame_index 101 | 102 | 103 | class GTParser_MOT_17: 104 | def __init__(self, mot_root, 105 | type='train', 106 | test_seq=None, 107 | forward_frames=4, 108 | frame_stride=1, 109 | min_vis=-0.1, 110 | value_range=1): 111 | # analsis all the folder in mot_root 112 | # 1. get all the folders 113 | mot_root = os.path.join(mot_root, type) 114 | if test_seq is None: 115 | all_folders = sorted( 116 | [os.path.join(mot_root, i) for i in os.listdir(mot_root) 117 | if os.path.isdir(os.path.join(mot_root, i)) 118 | and i.find('FRCNN') != -1] 119 | ) 120 | else: 121 | all_folders = sorted( 122 | [os.path.join(mot_root, i) for i in os.listdir(mot_root) 123 | if os.path.isdir(os.path.join(mot_root, i)) 124 | and i.find('FRCNN') != -1 125 | and i in test_seq] 126 | ) 127 | # 2. create single parser 128 | self.parsers = [GTSingleParser_MOT_17(folder, forward_frames=forward_frames, frame_stride=frame_stride, 129 | min_vis=min_vis, value_range=value_range, type=type) for folder in all_folders] 130 | 131 | # 3. get some basic information 132 | self.lens = [len(p) for p in self.parsers] 133 | self.len = sum(self.lens) 134 | 135 | def __len__(self): 136 | # get the length of all the matching frame 137 | return self.len 138 | 139 | def __getitem__(self, item): 140 | # 1. find the parser 141 | total_len = 0 142 | index = 0 143 | current_item = item 144 | for l in self.lens: 145 | total_len += l 146 | if item < total_len: 147 | break 148 | else: 149 | index += 1 150 | current_item -= l 151 | 152 | # 2. get items 153 | if index >= len(self.parsers): 154 | return None, None, None, None, None 155 | return self.parsers[index].get_item(current_item) 156 | -------------------------------------------------------------------------------- /dataset/Parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoPang1996/TubeTK/bcca334c5348f9ae33e04595e1af93cf8351e50e/dataset/Parsers/__init__.py -------------------------------------------------------------------------------- /dataset/Parsers/structures.py: -------------------------------------------------------------------------------- 1 | class Node: 2 | def __init__(self, box, frame_id, next_fram_id=-1): 3 | self.box = box 4 | self.frame_id = frame_id 5 | self.next_frame_id = next_fram_id 6 | 7 | 8 | class Track: 9 | def __init__(self, id): 10 | self.nodes = list() 11 | self.id = id 12 | 13 | def add_node(self, n): 14 | if len(self.nodes) > 0: 15 | self.nodes[-1].next_frame_id = n.frame_id 16 | self.nodes.append(n) 17 | 18 | def get_node_by_index(self, index): 19 | return self.nodes[index] 20 | 21 | 22 | class Tracks: 23 | def __init__(self): 24 | self.tracks = list() 25 | 26 | def add_node(self, node, id): 27 | node_added = False 28 | track_index = 0 29 | node_index = 0 30 | for t in self.tracks: 31 | if t.id == id: 32 | t.add_node(node) 33 | node_added = True 34 | track_index = self.tracks.index(t) 35 | node_index = t.nodes.index(node) 36 | break 37 | if not node_added: 38 | t = Track(id) 39 | t.add_node(node) 40 | self.tracks.append(t) 41 | track_index = self.tracks.index(t) 42 | node_index = t.nodes.index(node) 43 | 44 | return track_index, node_index 45 | 46 | def get_track_by_index(self, index): 47 | return self.tracks[index] 48 | 
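
A minimal usage sketch for the containers above (illustrative only, not part of the repository): `Tracks.add_node` appends a `Node` to the `Track` whose `id` matches, creating the track if needed, and returns the track and node indices.

```Python
from dataset.Parsers.structures import Node, Tracks

tracks = Tracks()
# Two detections of the same identity on consecutive frames.
t_idx, _ = tracks.add_node(Node(box=[10, 20, 50, 80], frame_id=0), id=7)      # creates track 7
t_idx, n_idx = tracks.add_node(Node(box=[12, 22, 52, 82], frame_id=1), id=7)  # appends to it
track = tracks.get_track_by_index(t_idx)
print(len(track.nodes), track.nodes[0].next_frame_id)  # -> 2 1
```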
-------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoPang1996/TubeTK/bcca334c5348f9ae33e04595e1af93cf8351e50e/dataset/__init__.py -------------------------------------------------------------------------------- /dataset/dataLoader.py: -------------------------------------------------------------------------------- 1 | from dataset.mot17 import MOT17TrainDataset, MOT17TestDataset 2 | from dataset.jta import JTATrainDataset 3 | from dataset.mot17jta import MOT17JTATrainDataset 4 | import torch 5 | try: 6 | import moxing.pytorch as mox 7 | except: 8 | pass 9 | import os 10 | 11 | 12 | class Data_Loader_MOT(): 13 | def __init__(self, 14 | batch_size, 15 | num_workers, 16 | input_path, 17 | model_arg, 18 | train_epoch, 19 | test_epoch, 20 | dataset, 21 | test_type='test', 22 | test_seq=None): 23 | 24 | self.BATCH_SIZE = batch_size 25 | self.num_workers = num_workers 26 | 27 | def my_collate(batch): 28 | imgs = torch.stack([torch.tensor(item[0]) for item in batch], 0) 29 | img_meta = [item[1] for item in batch] 30 | tubes = [item[2] for item in batch] 31 | labels = [item[3] for item in batch] 32 | start_frame = [item[4] for item in batch] 33 | return imgs, img_meta, tubes, labels, start_frame 34 | 35 | if dataset == 'MOT17': 36 | print('MOT17 data') 37 | self.training_set = MOT17TrainDataset(mot_root=input_path, epoch=train_epoch, arg=model_arg) 38 | self.validation_set = MOT17TestDataset(mot_root=input_path, type=test_type, test_seq=test_seq, 39 | epoch=test_epoch, arg=model_arg) 40 | elif dataset == 'JTA': 41 | print('JTA data') 42 | self.training_set = JTATrainDataset(jta_root=input_path, epoch=train_epoch, arg=model_arg) 43 | self.validation_set = None 44 | elif dataset == 'MOT17JTA': 45 | print('MOT17JTA data') 46 | self.training_set = MOT17JTATrainDataset(mot17_root=input_path[0], mot15_root=input_path[1], 47 | jta_root=input_path[2], epoch=train_epoch, arg=model_arg) 48 | self.validation_set = None 49 | else: 50 | raise NotImplementedError 51 | 52 | # train loader 53 | if int(os.environ["RANK"]) == 0: 54 | print('==> Training data :', len(self.training_set)) 55 | train_sampler = torch.utils.data.distributed.DistributedSampler(self.training_set) 56 | self.train_loader = torch.utils.data.DataLoader( 57 | dataset=self.training_set, 58 | batch_size=self.BATCH_SIZE, 59 | collate_fn=my_collate, 60 | num_workers=self.num_workers, 61 | pin_memory=True, sampler=train_sampler) 62 | 63 | # val loader 64 | if self.validation_set is not None: 65 | if int(os.environ["RANK"]) == 0: 66 | print('==> Validation data :', len(self.validation_set)) 67 | val_sampler = torch.utils.data.distributed.DistributedSampler(self.validation_set) 68 | self.test_loader = torch.utils.data.DataLoader( 69 | dataset=self.validation_set, 70 | batch_size=self.BATCH_SIZE, 71 | collate_fn=my_collate, 72 | num_workers=self.num_workers, 73 | pin_memory=True, sampler=val_sampler) 74 | 75 | -------------------------------------------------------------------------------- /dataset/jta.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | from PIL import Image, ImageFile 3 | 4 | from dataset.augmentation import SSJAugmentation 5 | from dataset.Parsers.JTA import GTParser_JTA 6 | ImageFile.LOAD_TRUNCATED_IMAGES = True 7 | 8 | 9 | class JTATrainDataset(data.Dataset): 10 | ''' 11 | The class is the dataset 
for train, which read gt.txt file and rearrange them as the tracks set. 12 | it can be selected from the specified frame 13 | ''' 14 | def __init__(self, 15 | jta_root, 16 | epoch, 17 | arg, 18 | transform=SSJAugmentation, 19 | ): 20 | # 1. init all the variables 21 | self.jta_root = jta_root 22 | self.transform = transform(size=arg.img_size, type='train') 23 | self.epoch = epoch 24 | 25 | # 2. init GTParser 26 | self.parser = GTParser_JTA(self.jta_root, 'train', forward_frames=arg.forward_frames, 27 | frame_stride=arg.frame_stride, min_vis=arg.min_visibility, 28 | value_range=arg.value_range) 29 | 30 | def __getitem__(self, item): 31 | item = item % len(self.parser) 32 | image, img_meta, tubes, labels, start_frame = self.parser[item] 33 | 34 | while image is None: 35 | item += 100 36 | image, img_meta, tubes, labels, start_frame = self.parser[item % len(self.parser)] 37 | 38 | print('None processing.') 39 | 40 | if self.transform is None: 41 | return image, img_meta, tubes, labels, start_frame 42 | else: 43 | image, img_meta, tubes, labels, start_frame = self.transform(image, img_meta, tubes, labels, start_frame) 44 | return image, img_meta, tubes, labels, start_frame 45 | 46 | def __len__(self): 47 | return len(self.parser) * self.epoch 48 | 49 | -------------------------------------------------------------------------------- /dataset/mot17.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | import random 3 | from PIL import ImageFile 4 | from dataset.augmentation import SSJAugmentation 5 | from dataset.Parsers.MOT17 import GTParser_MOT_17 6 | 7 | ImageFile.LOAD_TRUNCATED_IMAGES = True 8 | 9 | 10 | class MOT17TrainDataset(data.Dataset): 11 | ''' 12 | The class is the dataset for train, which read gt.txt file and rearrange them as the tracks set. 13 | it can be selected from the specified frame 14 | ''' 15 | def __init__(self, 16 | mot_root, 17 | epoch, 18 | arg, 19 | transform=SSJAugmentation, 20 | ): 21 | # 1. init all the variables 22 | self.mot_root = mot_root 23 | self.transform = transform(size=arg.img_size, type='train') 24 | self.epoch = epoch 25 | 26 | # 2. init GTParser 27 | self.parser = GTParser_MOT_17(self.mot_root, 'train', forward_frames=arg.forward_frames, 28 | frame_stride=arg.frame_stride, min_vis=arg.min_visibility, 29 | value_range=arg.value_range) 30 | 31 | def __getitem__(self, item): 32 | item = item % len(self.parser) 33 | image, img_meta, tubes, labels, start_frame = self.parser[item] 34 | 35 | while image is None: 36 | item = item + 50 37 | image, img_meta, tubes, labels, start_frame = self.parser[item % len(self.parser)] 38 | print('None processing.') 39 | 40 | if self.transform is None: 41 | return image, img_meta, tubes, labels, start_frame 42 | else: 43 | image, img_meta, tubes, labels, start_frame = self.transform(image, img_meta, tubes, labels, start_frame) 44 | return image, img_meta, tubes, labels, start_frame 45 | 46 | def __len__(self): 47 | return len(self.parser) * self.epoch 48 | 49 | 50 | class MOT17TestDataset(data.Dataset): 51 | ''' 52 | The class is the dataset for train, which read gt.txt file and rearrange them as the tracks set. 53 | it can be selected from the specified frame 54 | ''' 55 | def __init__(self, 56 | mot_root, 57 | epoch, 58 | type, 59 | test_seq, 60 | arg, 61 | transform=SSJAugmentation, 62 | ): 63 | # 1. 
init all the variables 64 | self.mot_root = mot_root 65 | self.transform = transform(size=arg.img_size, type='test') 66 | self.epoch = epoch 67 | 68 | # 2. init GTParser 69 | self.parser = GTParser_MOT_17(self.mot_root, type, test_seq=test_seq, forward_frames=arg.forward_frames, 70 | frame_stride=arg.frame_stride, min_vis=arg.min_visibility, 71 | value_range=arg.value_range) 72 | 73 | def __getitem__(self, item): 74 | item = item % len(self.parser) 75 | image, img_meta, tubes, labels, start_frame = self.parser[item] 76 | 77 | while image is None: 78 | image, img_meta, tubes, labels, start_frame = self.parser[(item+random.randint(-10, 10)) % len(self.parser)] 79 | print('None processing.') 80 | 81 | if self.transform is None: 82 | return image, img_meta, tubes, labels, start_frame 83 | else: 84 | image, img_meta, tubes, labels, start_frame = self.transform(image, img_meta, tubes, labels, start_frame) 85 | return image, img_meta, tubes, labels, start_frame 86 | 87 | def __len__(self): 88 | return len(self.parser) * self.epoch 89 | 90 | -------------------------------------------------------------------------------- /dataset/mot17jta.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | from PIL import ImageFile 3 | from dataset.Parsers.MOT17 import GTParser_MOT_17 4 | from dataset.Parsers.JTA import GTParser_JTA 5 | from dataset.augmentation import SSJAugmentation 6 | 7 | ImageFile.LOAD_TRUNCATED_IMAGES = True 8 | 9 | 10 | class MOT17JTATrainDataset(data.Dataset): 11 | ''' 12 | The class is the dataset for train, which read gt.txt file and rearrange them as the tracks set. 13 | it can be selected from the specified frame 14 | ''' 15 | def __init__(self, 16 | mot17_root, 17 | mot15_root, 18 | jta_root, 19 | epoch, 20 | arg, 21 | transform=SSJAugmentation, 22 | ): 23 | # 1. init all the variables 24 | self.mot17_root = mot17_root 25 | self.mot15_root = mot15_root 26 | self.jta_root = jta_root 27 | self.transform = transform(size=arg.img_size, type='train') 28 | self.epoch = epoch 29 | 30 | self.parsers = {} 31 | # 2. 
init GTParser 32 | self.parser_MOT17 = GTParser_MOT_17(self.mot17_root, 'train', forward_frames=arg.forward_frames, 33 | frame_stride=arg.frame_stride, min_vis=arg.min_visibility, 34 | value_range=arg.value_range) 35 | self.parsers['MOT17'] = self.parser_MOT17 36 | 37 | self.parser_JTA = GTParser_JTA(self.jta_root, 'train', forward_frames=arg.forward_frames, 38 | frame_stride=arg.frame_stride, min_vis=0.3, 39 | value_range=arg.value_range) 40 | self.parsers['JTA'] = self.parser_JTA 41 | 42 | def __getitem__(self, item): 43 | 44 | mot17 = True if item < len(self.parser_MOT17) * self.epoch else False 45 | if mot17: 46 | parser = self.parsers['MOT17'] 47 | item = item % len(self.parser_MOT17) 48 | 49 | if not mot17: 50 | parser = self.parsers['JTA'] 51 | item = (item - len(self.parser_MOT17) * self.epoch) % len(self.parser_JTA) 52 | 53 | image, img_meta, tubes, labels, start_frame = parser[item] 54 | while image is None: 55 | print('None processing.') 56 | item += 100 57 | image, img_meta, tubes, labels, start_frame = parser[item % len(parser)] 58 | 59 | if self.transform is None: 60 | return image, img_meta, tubes, labels, start_frame 61 | else: 62 | image, img_meta, tubes, labels, start_frame = self.transform(image, img_meta, tubes, labels, start_frame) 63 | return image, img_meta, tubes, labels, start_frame 64 | 65 | def __len__(self): 66 | return len(self.parser_MOT17) * self.epoch * 2 67 | 68 | 69 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import torch 5 | import warnings 6 | from tqdm import tqdm 7 | from network.tubetk import TubeTK 8 | from apex import amp 9 | import argparse 10 | import multiprocessing 11 | from configs.default import __C, cfg_from_file 12 | from post_processing.tube_iou_matching import matching 13 | warnings.filterwarnings('ignore') 14 | import shutil 15 | from Visualization.Vis_Res import vis_one_video 16 | import cv2 17 | import torch.utils.data as data 18 | import random 19 | from dataset.augmentation import SSJAugmentation 20 | 21 | 22 | class GTSingleParser: 23 | def __init__(self, video, 24 | forward_frames=4, 25 | frame_stride=1, 26 | min_vis=-0.1, 27 | value_range=1): 28 | self.frame_stride = frame_stride 29 | self.value_range = value_range 30 | self.video_name = video 31 | self.min_vis = min_vis 32 | self.forward_frames = forward_frames 33 | 34 | self.cap = cv2.VideoCapture(video) 35 | fps = int(round(self.cap.get(cv2.CAP_PROP_FPS))) 36 | width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 37 | height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 38 | frame_counter = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) 39 | self.video_frames = np.zeros((frame_counter, height, width, 3), dtype='float16') 40 | cnt = 0 41 | if int(os.environ["RANK"]) == 0: 42 | print('reading video...') 43 | pbar = tqdm(total=frame_counter) 44 | 45 | os.makedirs(video + '_imgs', exist_ok=True) 46 | while self.cap.isOpened(): 47 | _, frame = self.cap.read() 48 | if cnt >= frame_counter: 49 | break 50 | if frame is not None: 51 | frame_ok = frame # .astype('float16') 52 | else: 53 | if int(os.environ['RANK']) == 0: 54 | print('cannot read frame') 55 | self.video_frames[cnt] = frame_ok 56 | cv2.imwrite(filename=os.path.join(video + '_imgs', str(cnt + 1) + '.jpg'), img=frame_ok) #.astype('int8')) 57 | cnt += 1 58 | if int(os.environ["RANK"]) == 0: 59 | pbar.update(1) 60 | if int(os.environ["RANK"]) == 
0: 61 | print('finish_reading') 62 | pbar.close() 63 | 64 | self.max_frame_index = frame_counter - ( 65 | self.forward_frames * 2 - 1) * self.frame_stride 66 | 67 | def _getimage(self, frame_index): 68 | img = self.video_frames[frame_index] 69 | return img 70 | 71 | def get_item(self, frame_index): 72 | frame_stride = self.frame_stride 73 | 74 | start_frame = frame_index 75 | max_len = self.forward_frames * 2 * frame_stride 76 | 77 | # get image meta 78 | img_meta = {} 79 | image = self._getimage(frame_index) 80 | img_meta['img_shape'] = [max_len, image.shape[0], image.shape[1]] 81 | img_meta['value_range'] = self.value_range 82 | img_meta['pad_percent'] = [1, 1] # prepared for padding 83 | img_meta['video_name'] = os.path.basename(self.video_name) 84 | img_meta['start_frame'] = start_frame 85 | 86 | # get image 87 | imgs = [] 88 | for i in range(self.forward_frames * 2): 89 | frame_index = start_frame + i * frame_stride 90 | image = self._getimage(frame_index) # h, w, c 91 | imgs.append(image) 92 | 93 | # get_tube 94 | tubes = np.zeros((1, 15)) 95 | num_dets = len(tubes) 96 | labels = np.ones((num_dets, 1)) # only human class 97 | 98 | tubes = np.array(tubes) 99 | imgs = np.array(imgs) 100 | 101 | return imgs, img_meta, tubes, labels, start_frame 102 | 103 | def __len__(self): 104 | return self.max_frame_index 105 | 106 | 107 | class GTParser: 108 | def __init__(self, data_root, 109 | forward_frames=4, 110 | frame_stride=1, 111 | min_vis=-0.1, 112 | value_range=1): 113 | # analsis all the folder in mot_root 114 | # 1. get all the folders 115 | all_videos = sorted([os.path.join(data_root, i) for i in os.listdir(data_root) 116 | if '_imgs' not in i]) 117 | # 2. create single parser 118 | self.parsers = [GTSingleParser(video, forward_frames=forward_frames, frame_stride=frame_stride, 119 | min_vis=min_vis, value_range=value_range) for video in all_videos] 120 | 121 | # 3. get some basic information 122 | self.lens = [len(p) for p in self.parsers] 123 | self.len = sum(self.lens) 124 | 125 | def __len__(self): 126 | # get the length of all the matching frame 127 | return self.len 128 | 129 | def __getitem__(self, item): 130 | # 1. find the parser 131 | total_len = 0 132 | index = 0 133 | current_item = item 134 | for l in self.lens: 135 | total_len += l 136 | if item < total_len: 137 | break 138 | else: 139 | index += 1 140 | current_item -= l 141 | 142 | # 2. get items 143 | if index >= len(self.parsers): 144 | return None, None, None, None, None 145 | return self.parsers[index].get_item(current_item) 146 | 147 | 148 | class DemoDataset(data.Dataset): 149 | ''' 150 | The class is the dataset for train, which read gt.txt file and rearrange them as the tracks set. 151 | it can be selected from the specified frame 152 | ''' 153 | def __init__(self, 154 | data_root, 155 | arg, 156 | transform=SSJAugmentation, 157 | ): 158 | # 1. init all the variables 159 | self.data_root = data_root 160 | self.transform = transform(size=arg.img_size, type='test') 161 | 162 | # 2. 
init GTParser 163 | self.parser = GTParser(self.data_root, forward_frames=arg.forward_frames, 164 | frame_stride=arg.frame_stride, min_vis=arg.min_visibility, 165 | value_range=arg.value_range) 166 | 167 | def __getitem__(self, item): 168 | item = item % len(self.parser) 169 | image, img_meta, tubes, labels, start_frame = self.parser[item] 170 | 171 | while image is None: 172 | image, img_meta, tubes, labels, start_frame = self.parser[(item+random.randint(-10, 10)) % len(self.parser)] 173 | print('None processing.') 174 | 175 | if self.transform is None: 176 | return image, img_meta, tubes, labels, start_frame 177 | else: 178 | image, img_meta, tubes, labels, start_frame = self.transform(image, img_meta, tubes, labels, start_frame) 179 | return image, img_meta, tubes, labels, start_frame 180 | 181 | def __len__(self): 182 | return len(self.parser) 183 | 184 | 185 | class Data_Loader(): 186 | def __init__(self, 187 | batch_size, 188 | num_workers, 189 | input_path, 190 | model_arg): 191 | self.num_workers = num_workers 192 | self.BATCH_SIZE = batch_size 193 | 194 | def my_collate(batch): 195 | imgs = torch.stack([torch.tensor(item[0]) for item in batch], 0) 196 | img_meta = [item[1] for item in batch] 197 | tubes = [item[2] for item in batch] 198 | labels = [item[3] for item in batch] 199 | start_frame = [item[4] for item in batch] 200 | return imgs, img_meta, tubes, labels, start_frame 201 | 202 | self.demo_set = DemoDataset(data_root=input_path, arg=model_arg) 203 | 204 | if int(os.environ["RANK"]) == 0: 205 | print('==> Validation data :', len(self.demo_set)) 206 | val_sampler = torch.utils.data.distributed.DistributedSampler(self.demo_set) 207 | self.loader = torch.utils.data.DataLoader( 208 | dataset=self.demo_set, 209 | batch_size=self.BATCH_SIZE, 210 | collate_fn=my_collate, 211 | num_workers=self.num_workers, 212 | pin_memory=True, sampler=val_sampler) 213 | 214 | 215 | def synchronize(): 216 | """ 217 | Helper function to synchronize (barrier) among all processes when 218 | using distributed training 219 | """ 220 | if not torch.distributed.is_available(): 221 | return 222 | if not torch.distributed.is_initialized(): 223 | return 224 | world_size = torch.distributed.get_world_size() 225 | if world_size == 1: 226 | return 227 | torch.distributed.barrier() 228 | 229 | 230 | def match_video(video_name, tmp_dir, output_dir, model_arg): 231 | tubes_path = os.path.join(tmp_dir, video_name) 232 | tubes = [] 233 | frames = sorted([int(x) for x in os.listdir(tubes_path)]) 234 | for f in frames: 235 | tube = pickle.load(open(os.path.join(tubes_path, str(f)), 'rb')) 236 | tubes.append(tube) 237 | 238 | tubes = np.concatenate(tubes) 239 | matching(tubes, save_path=os.path.join(output_dir, video_name + '.txt'), verbose=True, arg=model_arg) 240 | 241 | 242 | def evaluate(model, loader, test_arg, model_arg, output_dir='output'): 243 | if not os.path.exists(output_dir): 244 | os.makedirs(output_dir) 245 | 246 | tmp_dir = os.path.join(output_dir, 'tmp') 247 | try: 248 | shutil.rmtree(tmp_dir) 249 | except: 250 | pass 251 | os.makedirs(tmp_dir, exist_ok=True) 252 | 253 | if test_arg.rank == 0: 254 | loader = tqdm(loader, ncols=20) 255 | 256 | for i, data in enumerate(loader): 257 | imgs, img_metas = data[:2] 258 | imgs = imgs.cuda() 259 | with torch.no_grad(): 260 | tubes, _, _ = zip(*model(imgs, img_metas, return_loss=False)) 261 | 262 | for img, tube, img_meta in zip(imgs, tubes, img_metas): 263 | tube[:, [0, 5, 10]] += img_meta['start_frame'] 264 | 265 | os.makedirs(os.path.join(tmp_dir, 
img_meta['video_name']), exist_ok=True) 266 | 267 | tube = tube.cpu().data.numpy() 268 | pickle.dump(tube, open(os.path.join(tmp_dir, img_meta['video_name'], str(img_meta['start_frame'])), 'wb')) 269 | 270 | synchronize() 271 | if test_arg.rank == 0: 272 | print('Finish prediction, Start matching') 273 | video_names = os.listdir(tmp_dir) 274 | pool = multiprocessing.Pool(processes=20) 275 | pool_list = [] 276 | for vid in video_names: 277 | pool_list.append(pool.apply_async(match_video, (vid, tmp_dir, os.path.join(output_dir, 'res'), model_arg,))) 278 | for p in tqdm(pool_list, ncols=20): 279 | p.get() 280 | pool.close() 281 | pool.join() 282 | shutil.rmtree(tmp_dir) 283 | 284 | print('Finish matching, Start writing to video') 285 | for vid in os.listdir(os.path.join(output_dir, 'res')): 286 | cap = cv2.VideoCapture(os.path.join(test_arg.video_url, vid[0: -4])) 287 | frame_rate = int(round(cap.get(cv2.CAP_PROP_FPS))) 288 | img_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 289 | img_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 290 | res_file = os.path.join(output_dir, 'res', vid) 291 | img_dir = os.path.join(test_arg.video_url, vid[0: -4] + '_imgs') 292 | output_name = os.path.join(test_arg.output_dir, vid + '.avi') 293 | vis_one_video(res_file, frame_rate, img_width, img_height, img_dir, output_name) 294 | try: 295 | shutil.rmtree(img_dir) 296 | except: 297 | pass 298 | 299 | 300 | def main(test_arg, model_arg): 301 | torch.distributed.init_process_group(backend="nccl", init_method='env://') 302 | 303 | local_rank = int(os.environ["LOCAL_RANK"]) 304 | print('Rank: ' + str(test_arg.rank) + " Start!") 305 | torch.cuda.set_device(local_rank) 306 | if local_rank == 0: 307 | print("Building TubeTK Model") 308 | 309 | model = TubeTK(num_classes=1, arg=model_arg, pretrained=False) 310 | 311 | data_loader = Data_Loader( 312 | batch_size=test_arg.batch_size, 313 | num_workers=8, 314 | input_path=test_arg.video_url, 315 | model_arg=model_arg, 316 | ) 317 | 318 | model = model.cuda(local_rank) 319 | 320 | model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) 321 | if test_arg.apex: 322 | model = amp.initialize(model, opt_level='O1') 323 | 324 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], 325 | output_device=local_rank, 326 | find_unused_parameters=True) 327 | 328 | if test_arg.local_rank == 0: 329 | print("Loading Model") 330 | checkpoint = torch.load(test_arg.model_path + '/' + test_arg.model_name, map_location= 331 | {'cuda:0': 'cuda:' + str(test_arg.local_rank), 332 | 'cuda:1': 'cuda:' + str(test_arg.local_rank), 333 | 'cuda:2': 'cuda:' + str(test_arg.local_rank), 334 | 'cuda:3': 'cuda:' + str(test_arg.local_rank), 335 | 'cuda:4': 'cuda:' + str(test_arg.local_rank), 336 | 'cuda:5': 'cuda:' + str(test_arg.local_rank), 337 | 'cuda:6': 'cuda:' + str(test_arg.local_rank), 338 | 'cuda:7': 'cuda:' + str(test_arg.local_rank)}) 339 | model.load_state_dict(checkpoint['state'], strict=False) 340 | if test_arg.local_rank == 0: 341 | print("Finish Loading") 342 | del checkpoint 343 | 344 | model.eval() 345 | loader = data_loader.loader 346 | 347 | evaluate(model, loader, test_arg, model_arg, output_dir=test_arg.output_dir) 348 | 349 | 350 | if __name__ == '__main__': 351 | parser = argparse.ArgumentParser() 352 | parser.add_argument('--batch_size', default=3, type=int) 353 | parser.add_argument('--model_path', default='./models', type=str, help='model path') 354 | parser.add_argument('--model_name', default='TubeTK', type=str, help='model name') 355 | 
parser.add_argument('--video_url', type=str, default='./data', help='video path') 356 | parser.add_argument('--output_dir', default='./vis_video', type=str, help='output path') 357 | parser.add_argument('--apex', action='store_true', help='whether use apex') 358 | parser.add_argument('--config', default='./configs/TubeTK_resnet_50_FPN_8frame_1stride.yaml', type=str, help='config file') 359 | 360 | parser.add_argument('--local_rank', type=int, help='gpus') 361 | 362 | test_arg, unparsed = parser.parse_known_args() 363 | 364 | model_arg = __C 365 | if test_arg.config is not None: 366 | cfg_from_file(test_arg.config) 367 | 368 | test_arg.rank = int(os.environ["RANK"]) 369 | 370 | main(test_arg, model_arg) 371 | 372 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | import torch 5 | import warnings 6 | from tqdm import tqdm 7 | from Metrics import evaluateTracking 8 | from dataset.dataLoader import Data_Loader_MOT 9 | from network.tubetk import TubeTK 10 | from post_processing.tube_nms import multiclass_nms 11 | from apex import amp 12 | import argparse 13 | import multiprocessing 14 | from configs.default import __C, cfg_from_file 15 | from post_processing.tube_iou_matching import matching 16 | warnings.filterwarnings('ignore') 17 | import shutil 18 | 19 | 20 | def synchronize(): 21 | """ 22 | Helper function to synchronize (barrier) among all processes when 23 | using distributed training 24 | """ 25 | if not torch.distributed.is_available(): 26 | return 27 | if not torch.distributed.is_initialized(): 28 | return 29 | world_size = torch.distributed.get_world_size() 30 | if world_size == 1: 31 | return 32 | torch.distributed.barrier() 33 | 34 | 35 | def match_video(video_name, tmp_dir, output_dir, model_arg): 36 | tubes_path = os.path.join(tmp_dir, video_name) 37 | tubes = [] 38 | frames = sorted([int(x) for x in os.listdir(tubes_path)]) 39 | for f in frames: 40 | tube = pickle.load(open(os.path.join(tubes_path, str(f)), 'rb')) 41 | tubes.append(tube) 42 | 43 | tubes = np.concatenate(tubes) 44 | matching(tubes, save_path=os.path.join(output_dir, video_name + '.txt'), verbose=True, arg=model_arg) 45 | 46 | 47 | def evaluate(model, loader, test_arg, model_arg, output_dir='output'): 48 | if not os.path.exists(output_dir): 49 | os.makedirs(output_dir) 50 | 51 | tmp_dir = os.path.join(output_dir, 'tmp') 52 | try: 53 | shutil.rmtree(tmp_dir) 54 | except: 55 | pass 56 | os.makedirs(tmp_dir, exist_ok=True) 57 | 58 | if test_arg.rank == 0: 59 | loader = tqdm(loader, ncols=20) 60 | 61 | for i, data in enumerate(loader): 62 | imgs, img_metas = data[:2] 63 | imgs = imgs.cuda() 64 | with torch.no_grad(): 65 | tubes, _, _ = zip(*model(imgs, img_metas, return_loss=False)) 66 | 67 | for img, tube, img_meta in zip(imgs, tubes, img_metas): 68 | # ===========================================VIS OUTPUT==================================================== 69 | # if img is not None: 70 | # vis_output(img.cpu(), img_meta, bbox.cpu(), stride=model_arg.frame_stride, out_folder='/home/pb/results/') 71 | # ========================================================================================================= 72 | tube[:, [0, 5, 10]] += img_meta['start_frame'] 73 | 74 | os.makedirs(os.path.join(tmp_dir, img_meta['video_name']), exist_ok=True) 75 | 76 | tube = tube.cpu().data.numpy() 77 | pickle.dump(tube, open(os.path.join(tmp_dir, 
img_meta['video_name'], str(img_meta['start_frame'])), 'wb')) 78 | 79 | synchronize() 80 | if test_arg.rank == 0: 81 | print('Finish prediction, Start matching') 82 | video_names = os.listdir(tmp_dir) 83 | pool = multiprocessing.Pool(processes=20) 84 | pool_list = [] 85 | for vid in video_names: 86 | pool_list.append(pool.apply_async(match_video, (vid, tmp_dir, os.path.join(output_dir, 'res'), model_arg,))) 87 | for p in tqdm(pool_list, ncols=20): 88 | p.get() 89 | pool.close() 90 | pool.join() 91 | shutil.rmtree(tmp_dir) 92 | 93 | if test_arg.trainOrTest == 'train' and test_arg.dataset == 'MOT17': 94 | print("FINISH MATCHING, START EVALUATE") 95 | seq_map = 'MOT17_train.txt' 96 | evaluateTracking(seq_map, os.path.join(output_dir, 'res'), 97 | os.path.join(test_arg.data_url, 'train'), 'MOT17') 98 | # elif test_arg.trainOrTest == 'train' and test_arg.dataset == 'MOT15': 99 | # print("FINISH MATCHING, START EVALUATE") 100 | # seq_map = 'MOT15_train.txt' 101 | # evaluateTracking(seq_map, os.path.join(output_dir, 'res'), 102 | # os.path.join(test_arg.data_url[3], 'train'), 'MOT15') 103 | 104 | 105 | def main(test_arg, model_arg): 106 | torch.distributed.init_process_group(backend="nccl", init_method='env://') 107 | 108 | local_rank = int(os.environ["LOCAL_RANK"]) 109 | print('Rank: ' + str(test_arg.rank) + " Start!") 110 | torch.cuda.set_device(local_rank) 111 | if local_rank == 0: 112 | print("Building TubeTK Model") 113 | 114 | model = TubeTK(num_classes=1, arg=model_arg, pretrained=False) 115 | 116 | data_loader = Data_Loader_MOT( 117 | batch_size=test_arg.batch_size, 118 | num_workers=8, 119 | input_path=test_arg.data_url, 120 | train_epoch=1, 121 | test_epoch=1, 122 | model_arg=model_arg, 123 | dataset=test_arg.dataset, 124 | test_seq=None, 125 | test_type=test_arg.trainOrTest, 126 | ) 127 | 128 | model = model.cuda(local_rank) 129 | 130 | model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) 131 | if test_arg.apex: 132 | model = amp.initialize(model, opt_level='O1') 133 | 134 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], 135 | output_device=local_rank, 136 | find_unused_parameters=True) 137 | 138 | if test_arg.local_rank == 0: 139 | print("Loading Model") 140 | checkpoint = torch.load(test_arg.model_path + '/' + test_arg.model_name, map_location= 141 | {'cuda:0': 'cuda:' + str(test_arg.local_rank), 142 | 'cuda:1': 'cuda:' + str(test_arg.local_rank), 143 | 'cuda:2': 'cuda:' + str(test_arg.local_rank), 144 | 'cuda:3': 'cuda:' + str(test_arg.local_rank), 145 | 'cuda:4': 'cuda:' + str(test_arg.local_rank), 146 | 'cuda:5': 'cuda:' + str(test_arg.local_rank), 147 | 'cuda:6': 'cuda:' + str(test_arg.local_rank), 148 | 'cuda:7': 'cuda:' + str(test_arg.local_rank)}) 149 | model.load_state_dict(checkpoint['state'], strict=False) 150 | if test_arg.local_rank == 0: 151 | print("Finish Loading") 152 | del checkpoint 153 | 154 | model.eval() 155 | loader = data_loader.test_loader 156 | 157 | evaluate(model, loader, test_arg, model_arg, output_dir=test_arg.output_dir) 158 | 159 | 160 | if __name__ == '__main__': 161 | parser = argparse.ArgumentParser() 162 | parser.add_argument('--batch_size', default=1, type=int) 163 | parser.add_argument('--model_path', default='./models', type=str, help='model path') 164 | parser.add_argument('--model_name', default='TubeTK', type=str, help='model name') 165 | parser.add_argument('--data_url', default='./data/', type=str, help='model path') 166 | parser.add_argument('--output_dir', default='./link_res', type=str, 
help='output path') 167 | parser.add_argument('--apex', action='store_true', help='whether use apex') 168 | parser.add_argument('--config', default='./configs/TubeTK_resnet_50_FPN_8frame_1stride.yaml', type=str, help='config file') 169 | parser.add_argument('--dataset', default='MOT17', type=str, help='test which dataset: MOT17, MOT15') 170 | parser.add_argument('--trainOrTest', default='test', type=str, help='evaluate train or test set') 171 | 172 | parser.add_argument('--local_rank', type=int, help='gpus') 173 | 174 | test_arg, unparsed = parser.parse_known_args() 175 | 176 | model_arg = __C 177 | if test_arg.config is not None: 178 | cfg_from_file(test_arg.config) 179 | 180 | test_arg.rank = int(os.environ["RANK"]) 181 | 182 | main(test_arg, model_arg) 183 | 184 | -------------------------------------------------------------------------------- /fetch_models.sh: -------------------------------------------------------------------------------- 1 | DIR='./models' 2 | URL='https://drive.google.com/uc?id=1jLgyNmiZ_c-m8Cw3NcZTEPTf6VESfIzK&export=download' 3 | 4 | mkdir -p $DIR 5 | 6 | echo "Downloading pre-trained TubeTK..." 7 | FILE="$(curl -sc /tmp/gcokie "${URL}" | grep -o '="uc-name.*' | sed 's/.*">//;s/<.a> .*//')" 8 | curl -Lb /tmp/gcokie "${URL}&confirm=$(awk '/_warning_/ {print $NF}' /tmp/gcokie)" -o "$DIR/${FILE}" 9 | 10 | echo "Download success." 11 | -------------------------------------------------------------------------------- /install/compile_local.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | PYTHON=${PYTHON:-"python"} 3 | echo "Building nms op..." 4 | cd ../post_processing/nms 5 | if [ -d "build" ]; then 6 | rm -r build 7 | fi 8 | $PYTHON setup.py build_ext --inplace 9 | 10 | -------------------------------------------------------------------------------- /launch.py: -------------------------------------------------------------------------------- 1 | import os, sys, stat, subprocess 2 | from argparse import ArgumentParser 3 | 4 | os.environ['NCCL_LL_THRESHOLD'] = '0' 5 | 6 | 7 | def parse_args(): 8 | parser = ArgumentParser(description="PyTorch distributed training launch " 9 | "helper utilty that will spawn up " 10 | "multiple distributed processes") 11 | 12 | # Optional arguments for the launch helper 13 | parser.add_argument("--world_size", type=int, default=1, 14 | help="The number of nodes to use for distributed " 15 | "training") 16 | parser.add_argument("--rank", type=int, default=0, 17 | help="The rank of the node for multi-node distributed " 18 | "training") 19 | parser.add_argument("--nproc_per_node", type=int, default=1, 20 | help="The number of processes to launch on each node, " 21 | "for GPU training, this is recommended to be set " 22 | "to the number of GPUs in your system so that " 23 | "each process can be bound to a single GPU.") 24 | parser.add_argument("--init_method", default="tcp://127.0.0.1:29000", type=str, 25 | help="Init method of distributed system.") 26 | parser.add_argument("--use_env", default=False, action="store_true", 27 | help="Use environment variable to pass " 28 | "'local rank'. For legacy reasons, the default value is False. 
" 29 | "If set to True, the script will not pass " 30 | "--local_rank as argument, and will instead set LOCAL_RANK.") 31 | 32 | # positional 33 | parser.add_argument("--training_script", type=str, 34 | help="The full path to the single GPU training " 35 | "program/script to be launched in parallel, " 36 | "followed by all the arguments for the " 37 | "training script") 38 | 39 | return parser.parse_known_args() 40 | 41 | 42 | def main(): 43 | args, script_args = parse_args() 44 | 45 | # world size in terms of number of processes 46 | dist_world_size = args.nproc_per_node * args.world_size 47 | 48 | # set PyTorch distributed related environmental variables 49 | current_env = os.environ.copy() 50 | assert args.init_method.startswith("tcp://"), "init_method should start with \"tcp://\"." 51 | master_addr, master_port = args.init_method[6:].split(":") 52 | current_env["MASTER_ADDR"] = master_addr 53 | current_env["MASTER_PORT"] = str(master_port) 54 | current_env["WORLD_SIZE"] = str(dist_world_size) 55 | 56 | processes = [] 57 | 58 | for local_rank in range(0, args.nproc_per_node): 59 | # each process's rank 60 | dist_rank = args.nproc_per_node * args.rank + local_rank 61 | current_env["RANK"] = str(dist_rank) 62 | current_env["LOCAL_RANK"] = str(local_rank) 63 | 64 | # For some store true args. 65 | new_script_args = [] 66 | for script_arg in script_args: 67 | if script_arg.endswith("=True"): 68 | new_script_args.append(script_arg[:-5]) 69 | elif script_arg.endswith("=False"): 70 | pass 71 | else: 72 | new_script_args.append(script_arg) 73 | script_args = new_script_args 74 | 75 | # spawn the processes 76 | if args.use_env: 77 | cmd = [sys.executable, "-u", 78 | args.training_script] + script_args 79 | else: 80 | cmd = [sys.executable, 81 | "-u", 82 | args.training_script, 83 | "--local_rank={}".format(local_rank)] + script_args 84 | process = subprocess.Popen(cmd, env=current_env) 85 | processes.append(process) 86 | 87 | for process in processes: 88 | process.wait() 89 | if process.returncode != 0: 90 | raise subprocess.CalledProcessError(returncode=process.returncode, 91 | cmd=process.args) 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | from tensorboardX import SummaryWriter 5 | from network.tubetk import TubeTK 6 | from dataset.dataLoader import Data_Loader_MOT 7 | from optim.solver import make_optimizer as makeOpt 8 | from configs.default import __C, cfg_from_file 9 | from utils.util import AverageMeter 10 | from tqdm import tqdm 11 | from optim.lr_scheduler import WarmupMultiStepLR 12 | import warnings 13 | import numpy as np 14 | try: 15 | from apex import amp 16 | import apex 17 | except: 18 | pass 19 | warnings.filterwarnings('ignore') 20 | 21 | 22 | def fix_bn(m): 23 | classname = m.__class__.__name__ 24 | if classname.find('BatchNorm') != -1: 25 | m.half() 26 | 27 | 28 | def synchronize(): 29 | """ 30 | Helper function to synchronize (barrier) among all processes when 31 | using distributed training 32 | """ 33 | if not torch.distributed.is_available(): 34 | return 35 | if not torch.distributed.is_initialized(): 36 | return 37 | world_size = torch.distributed.get_world_size() 38 | if world_size == 1: 39 | return 40 | torch.distributed.barrier() 41 | 42 | 43 | def print_dict(string, rank): 44 | if rank == 0: 45 | print(string) 46 | 47 | 48 | def 
run_one_iter(model, optimizer, data, scheduler, test): 49 | imgs, img_metas, gt_tubes, gt_labels, start_frame = data 50 | 51 | # =================================Visualization================================================ 52 | # vis_input(imgs, img_metas, gt_bboxes, gt_labels, start_frame, stride=model_arg.frame_stride, out_folder='/home/pb/results/') 53 | # ============================================================================================== 54 | 55 | # Get Input 56 | imgs = imgs.cuda() 57 | for i in range(len(gt_tubes)): 58 | gt_tubes[i] = gt_tubes[i].cuda() 59 | gt_labels[i] = gt_labels[i].cuda() 60 | 61 | if not test: 62 | scheduler.step() 63 | 64 | # Forward 65 | if not test: 66 | losses = model(imgs, img_metas, return_loss=True, gt_tubes=gt_tubes, gt_labels=gt_labels) 67 | res = losses 68 | else: 69 | with torch.no_grad(): 70 | bbox_list = model(imgs, img_metas, return_loss=False, gt_tubes=gt_tubes, gt_labels=gt_labels) 71 | bbox_list[:, :, 0] += start_frame 72 | res = bbox_list 73 | 74 | # Backward 75 | if not test: 76 | if losses: 77 | optimizer.zero_grad() 78 | loss = torch.zeros(1).cuda() 79 | for l in losses: 80 | if 'loss_cls' in l: 81 | loss += 1e3 * losses[l] 82 | else: 83 | loss += losses[l] 84 | if not train_arg.apex: 85 | loss.backward() 86 | else: 87 | with amp.scale_loss(loss, optimizer) as scaled_loss: 88 | scaled_loss.backward() 89 | optimizer.step() 90 | 91 | return res 92 | 93 | 94 | def train(model, optimizer, data_loader, scheduler, writer, max_acc=0, step_start=0): 95 | loss_cls_accumulate = AverageMeter() 96 | loss_reg_accumulate = AverageMeter() 97 | loss_center_accumulate = AverageMeter() 98 | max_acc = max_acc 99 | 100 | loader = data_loader.train_loader 101 | model.train() 102 | if train_arg.apex: 103 | model.apply(fix_bn) 104 | if train_arg.local_rank == 0: 105 | loader = tqdm(loader, ncols=20) 106 | 107 | loader_len = len(loader) 108 | for step, data in enumerate(loader): 109 | # Input 110 | if step > loader_len - step_start: 111 | break 112 | step += step_start 113 | losses = run_one_iter(model, optimizer, data, scheduler, False) 114 | 115 | # Loss and results 116 | if losses: 117 | if not np.isnan(losses['loss_cls'].data.cpu().numpy()): 118 | loss_cls_accumulate.update(val=losses['loss_cls'].data.cpu().numpy()) 119 | if not np.isnan(losses['loss_reg'].data.cpu().numpy()): 120 | loss_reg_accumulate.update(val=losses['loss_reg'].data.cpu().numpy()) 121 | if not np.isnan(losses['loss_centerness'].data.cpu().numpy()): 122 | loss_center_accumulate.update(val=losses['loss_centerness'].data.cpu().numpy()) 123 | 124 | if train_arg.rank == 0: 125 | writer.add_scalar('train/loss_cls', loss_cls_accumulate.avg, step) 126 | writer.add_scalar('train/loss_reg', loss_reg_accumulate.avg, step) 127 | writer.add_scalar('train/loss_center', loss_center_accumulate.avg, step) 128 | writer.add_scalar('train/lr', optimizer.param_groups[0]["lr"], step) 129 | 130 | if step % 1000 == 999: 131 | if train_arg.rank == 0: 132 | print('save model') 133 | torch.save({'state': model.state_dict(), 134 | 'max_acc': max_acc, 135 | 'step': step, 136 | 'opt': optimizer.state_dict(), 137 | 'sched': scheduler.state_dict()}, 138 | train_arg.model_path + '/' + train_arg.model_name) 139 | 140 | if step % train_arg.reset_iter == train_arg.reset_iter - 1: 141 | loss_cls_accumulate.reset() 142 | loss_reg_accumulate.reset() 143 | loss_center_accumulate.reset() 144 | 145 | if train_arg.local_rank == 0: 146 | loader.set_description('Loss_cls: ' + str(loss_cls_accumulate.avg)[0:6] + 147 | 
',\tLoss_reg: ' + str(loss_reg_accumulate.avg)[0:6] + 148 | ',\tLoss_center: ' + str(loss_center_accumulate.avg)[0:6], refresh=False) 149 | 150 | 151 | def main(train_arg, model_arg): 152 | torch.distributed.init_process_group(backend="nccl", init_method='env://') 153 | local_rank = int(os.environ["LOCAL_RANK"]) 154 | print('Rank: ' + str(train_arg.rank) + " Start!") 155 | torch.cuda.set_device(local_rank) 156 | 157 | print_dict("Building TubeTK Model", train_arg.local_rank) 158 | model = TubeTK(num_classes=1, arg=model_arg, pretrained=True) 159 | 160 | data_loader = Data_Loader_MOT( 161 | batch_size=train_arg.batch_size, 162 | num_workers=8, 163 | input_path=train_arg.data_url, 164 | train_epoch=train_arg.epochs, 165 | model_arg=model_arg, 166 | dataset=train_arg.dataset, 167 | test_epoch=1 168 | ) 169 | # =================================Visualization================================================ 170 | # loader = data_loader.train_loader 171 | # for step, data in enumerate(loader): 172 | # imgs, img_metas, gt_bboxes, gt_labels, start_frame = data 173 | # 174 | # vis_input(imgs, img_metas, gt_bboxes, gt_labels, start_frame, stride=model_arg.frame_stride, 175 | # out_folder='/home/pb/results/') 176 | # ============================================================================================== 177 | 178 | model = model.cuda(local_rank) 179 | optimizer = makeOpt(train_arg, model) 180 | 181 | model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) 182 | if train_arg.apex: 183 | model, optimizer = amp.initialize(model, optimizer, 184 | opt_level='O1', 185 | # loss_scale='dynamic', 186 | # keep_batchnorm_fp32=False 187 | ) 188 | 189 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], 190 | output_device=local_rank, 191 | find_unused_parameters=True) 192 | 193 | sched = WarmupMultiStepLR( 194 | optimizer, 195 | milestones=train_arg.mileStone, 196 | warmup_factor=0.1, 197 | warmup_iters=0, 198 | warmup_method='linear') 199 | 200 | max_acc = 0 201 | step = 0 202 | 203 | if train_arg.resume: 204 | print_dict("Loading Model", train_arg.local_rank) 205 | checkpoint = torch.load(train_arg.model_path + '/' + train_arg.model_name, map_location= 206 | {'cuda:0': 'cuda:' + str(train_arg.local_rank), 207 | 'cuda:1': 'cuda:' + str(train_arg.local_rank), 208 | 'cuda:2': 'cuda:' + str(train_arg.local_rank), 209 | 'cuda:3': 'cuda:' + str(train_arg.local_rank), 210 | 'cuda:4': 'cuda:' + str(train_arg.local_rank), 211 | 'cuda:5': 'cuda:' + str(train_arg.local_rank), 212 | 'cuda:6': 'cuda:' + str(train_arg.local_rank), 213 | 'cuda:7': 'cuda:' + str(train_arg.local_rank)}) 214 | model.load_state_dict(checkpoint['state'], strict=False) 215 | optimizer.load_state_dict(checkpoint['opt']) 216 | sched.load_state_dict(checkpoint['sched']) 217 | sched.milestones = train_arg.mileStone 218 | step = checkpoint['step'] + 1 219 | sched.last_epoch = step 220 | max_acc = checkpoint['max_acc'] 221 | print_dict("Finish Loading", train_arg.local_rank) 222 | del checkpoint 223 | 224 | if train_arg.rank == 0: 225 | tensorboard_writer = SummaryWriter(train_arg.logName, purge_step=step) 226 | else: 227 | tensorboard_writer = None 228 | 229 | print_dict("Training", train_arg.local_rank) 230 | train(model, optimizer, data_loader, sched, tensorboard_writer, max_acc=max_acc, step_start=step) 231 | 232 | 233 | if __name__ == '__main__': 234 | parser = argparse.ArgumentParser(description='PyTorch Sub-JHMDB rgb frame training') 235 | parser.add_argument('--epochs', default=120, type=int, metavar='N', 
help='number of total epochs') 236 | parser.add_argument('--batch_size', default=1, type=int, metavar='N', help='mini-batch size (default: 64)') 237 | parser.add_argument('--lr', default=0.001, type=float, metavar='LR', help='initial learning rate') 238 | parser.add_argument('--weight_decay', default=1e-5, type=float, help='weight decay') 239 | parser.add_argument('--mileStone', nargs='+', type=int, default=[7500, 15000], help='mileStone for lr Sched') 240 | parser.add_argument('--reset_iter', default=200, type=list, help='test iter') 241 | parser.add_argument('--model_path', default='./models', type=str, help='model path') 242 | parser.add_argument('--model_name', default='TubeTK', type=str, help='model name') 243 | parser.add_argument('--data_url', default='./data/', type=str, help='data path') 244 | parser.add_argument('--dataset', default='MOT17', type=str, help='MOT17, JTA, MOTJTA') 245 | 246 | parser.add_argument('--config', default=None, type=str, help='config file') 247 | 248 | parser.add_argument('--logName', type=str, 249 | default='./logs/TubeTK_log', help='log dir name') 250 | 251 | parser.add_argument('--local_rank', type=int, help='gpus') 252 | 253 | parser.add_argument('--resume', action='store_true', help='whether resume') 254 | 255 | parser.add_argument('--apex', action='store_true', help='whether use apex') 256 | 257 | train_arg, unparsed = parser.parse_known_args() 258 | 259 | model_arg = __C 260 | if train_arg.config is not None: 261 | cfg_from_file(train_arg.config) 262 | 263 | train_arg.rank = int(os.environ["RANK"]) 264 | if train_arg.rank == 0: 265 | try: 266 | os.makedirs(train_arg.model_path) 267 | except: 268 | pass 269 | 270 | main(train_arg, model_arg) 271 | -------------------------------------------------------------------------------- /network/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoPang1996/TubeTK/bcca334c5348f9ae33e04595e1af93cf8351e50e/network/__init__.py -------------------------------------------------------------------------------- /network/focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import torch.nn.functional as F 4 | 5 | 6 | def one_hot_embedding(labels, num_classes): 7 | ''' 8 | Embedding labels to one-hot form. 9 | Args: 10 | labels: (LongTensor) class labels, sized [N,]. 11 | num_classes: (int) number of classes. 12 | Returns: 13 | (tensor) encoded labels, sized [N,#classes]. 14 | ''' 15 | y = torch.eye(num_classes) 16 | return y[labels] 17 | 18 | 19 | def focal_loss(x, y): 20 | ''' 21 | Focal loss. 22 | Args: 23 | x: (tensor) sized [N,D]. 24 | y: (tensor) sized [N,]. 25 | Return: 26 | (tensor) focal loss. 
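        Formula (as implemented below):
            FL(p_t) = -alpha_t * (1 - p_t)**gamma * log(p_t)
        with alpha = 0.25 and gamma = 2; the one-hot targets drop the
        background column and the loss is evaluated as a weighted binary
        cross-entropy with logits.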
27 | ''' 28 | alpha = 0.25 29 | gamma = 2 30 | 31 | t = one_hot_embedding(y, x.shape[1] + 1) 32 | 33 | # exclude background 34 | t = t[:, 1:] 35 | 36 | t = Variable(t).cuda() 37 | p = x.sigmoid().float() 38 | 39 | # pt = p if t > 0 else 1-p 40 | pt = p * t + (1 - p) * (1 - t) 41 | 42 | # w = alpha if t > 0 else 1-alpha 43 | w = alpha * t + (1 - alpha) * (1 - t) 44 | 45 | w = w * (1 - pt).pow(gamma) 46 | return F.binary_cross_entropy_with_logits(x.float(), t, w.detach(), size_average=True) 47 | -------------------------------------------------------------------------------- /network/fpn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | 5 | class FPN(nn.Module): 6 | def __init__(self, 7 | in_channels, # [512, 1024, 2048] 8 | arg, 9 | ): 10 | super(FPN, self).__init__() 11 | assert isinstance(in_channels, list) 12 | self.in_channels = in_channels 13 | self.out_channels = arg.fpn_features_n 14 | self.num_ins = len(in_channels) 15 | self.num_outs = arg.fpn_outs_n 16 | 17 | self.lateral_convs = nn.ModuleList() 18 | self.fpn_convs = nn.ModuleList() 19 | 20 | for i in range(self.num_ins): 21 | l_conv = nn.Conv3d(in_channels[i], self.out_channels, kernel_size=1, stride=1) 22 | fpn_conv = nn.Conv3d(self.out_channels, self.out_channels, 23 | kernel_size=3, stride=1, padding=1) 24 | 25 | self.lateral_convs.append(l_conv) 26 | self.fpn_convs.append(fpn_conv) 27 | 28 | # add extra conv layers (e.g., RetinaNet) 29 | extra_levels = self.num_outs - self.num_ins 30 | if extra_levels >= 1: 31 | for i in range(extra_levels): 32 | in_channels = self.out_channels 33 | extra_fpn_conv = nn.Conv3d(in_channels, self.out_channels, kernel_size=3, stride=(1, 2, 2), padding=1) 34 | self.fpn_convs.append(extra_fpn_conv) 35 | 36 | # default init_weights for conv(msra) and norm in ConvModule 37 | for m in self.modules(): 38 | if isinstance(m, nn.Conv3d): 39 | nn.init.xavier_uniform_(m.weight, gain=1) 40 | if hasattr(m, 'bias') and m.bias is not None: 41 | nn.init.constant_(m.bias, 0) 42 | 43 | def forward(self, inputs): 44 | assert len(inputs) == len(self.in_channels) 45 | 46 | # build laterals 47 | laterals = [ 48 | lateral_conv(inputs[i]) 49 | for i, lateral_conv in enumerate(self.lateral_convs) 50 | ] 51 | 52 | # build top-down path 53 | used_backbone_levels = len(laterals) 54 | for i in range(used_backbone_levels - 1, 0, -1): 55 | laterals[i - 1] += F.interpolate( 56 | laterals[i], scale_factor=2, mode='nearest') 57 | 58 | # build outputs 59 | # part 1: from original levels 60 | outs = [ 61 | self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) 62 | ] 63 | # part 2: add extra levels 64 | if self.num_outs > len(outs): 65 | # add conv layers on top of original feature maps (RetinaNet) 66 | outs.append(self.fpn_convs[used_backbone_levels](outs[-1])) 67 | for i in range(used_backbone_levels + 1, self.num_outs): 68 | outs.append(self.fpn_convs[i](F.relu(outs[-1]))) 69 | 70 | return tuple(outs) 71 | -------------------------------------------------------------------------------- /network/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | from torch.nn.modules.batchnorm import _BatchNorm 5 | 6 | __all__ = [ 7 | 'ResNet', 'resnet50', 'resnet101', 'resnext101' 8 | ] 9 | 10 | 11 | class Bottleneck(nn.Module): 12 | expansion = 4 13 | 14 | def __init__(self, inplanes, planes, stride=1, kernel=3, 
downsample=None, groups=1, base_width=64): 15 | super(Bottleneck, self).__init__() 16 | 17 | width = int(planes * (base_width / 64.)) * groups 18 | 19 | self.conv1 = nn.Conv3d(inplanes, width, kernel_size=1, bias=False) 20 | self.bn1 = nn.BatchNorm3d(width) 21 | self.conv2 = nn.Conv3d( 22 | width, width, kernel_size=(kernel, 3, 3), stride=stride, padding=(kernel//2, 1, 1), groups=groups, bias=False) 23 | self.bn2 = nn.BatchNorm3d(width) 24 | self.conv3 = nn.Conv3d(width, planes * 4, kernel_size=1, bias=False) 25 | self.bn3 = nn.BatchNorm3d(planes * 4) 26 | self.relu = nn.ReLU(inplace=True) 27 | self.downsample = downsample 28 | self.stride = stride 29 | 30 | def forward(self, x): 31 | residual = x 32 | 33 | out = self.conv1(x) 34 | out = self.bn1(out) 35 | out = self.relu(out) 36 | 37 | out = self.conv2(out) 38 | out = self.bn2(out) 39 | out = self.relu(out) 40 | 41 | out = self.conv3(out) 42 | out = self.bn3(out) 43 | 44 | if self.downsample is not None: 45 | residual = self.downsample(x) 46 | 47 | out += residual 48 | out = self.relu(out) 49 | 50 | return out 51 | 52 | 53 | class ResNet(nn.Module): 54 | 55 | def __init__(self, 56 | block, 57 | layers, 58 | kernels, 59 | groups=1, 60 | width_per_group=64, 61 | freeze_bn=False, 62 | freeze_stages=-1, 63 | fst_l_stride=2): 64 | self.freeze_bn = freeze_bn 65 | self.freeze_stages = freeze_stages 66 | self.groups = groups 67 | self.base_width = width_per_group 68 | self.inplanes = 64 69 | 70 | super(ResNet, self).__init__() 71 | self.conv1 = nn.Conv3d( 72 | 3, 73 | 64, 74 | kernel_size=(kernels[0][0], 7, 7), 75 | stride=(1, 2, 2), 76 | padding=(kernels[0][0]//2, 3, 3), 77 | bias=False) 78 | self.bn1 = nn.BatchNorm3d(64) 79 | self.relu = nn.ReLU(inplace=True) 80 | if kernels[0][0] == 7: 81 | self.maxpool = nn.MaxPool3d(kernel_size=3, stride=(1, 2, 2), padding=1) 82 | else: 83 | self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 4), stride=(1, 2, 2), padding=(0, 1, 1)) 84 | self.layer1 = self._make_layer(block, 64, layers[0], kernels[1]) 85 | self.layer2 = self._make_layer( 86 | block, 128, layers[1], kernels[2], stride=(1, 2, 2) if fst_l_stride < 2 else 2) 87 | self.layer3 = self._make_layer( 88 | block, 256, layers[2], kernels[3], stride=2) 89 | self.layer4 = self._make_layer( 90 | block, 512, layers[3], kernels[4], stride=2) 91 | 92 | for m in self.modules(): 93 | if isinstance(m, nn.Conv3d): 94 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 95 | elif isinstance(m, nn.BatchNorm3d): 96 | m.weight.data.fill_(1) 97 | m.bias.data.zero_() 98 | 99 | self._freeze_stages() 100 | if self.freeze_bn: 101 | self._freeze_bn() 102 | 103 | def _make_layer(self, block, planes, blocks, kernel, stride=1): 104 | downsample = None 105 | if stride != 1 or self.inplanes != planes * block.expansion: 106 | downsample = nn.Sequential( 107 | nn.Conv3d( 108 | self.inplanes, 109 | planes * block.expansion, 110 | kernel_size=1, 111 | stride=stride, 112 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 113 | 114 | layers = [] 115 | layers.append(block(self.inplanes, planes, stride, kernel[0], downsample, self.groups, self.base_width)) 116 | self.inplanes = planes * block.expansion 117 | for i in range(1, blocks): 118 | layers.append(block(self.inplanes, planes, kernel=kernel[i], groups=self.groups, base_width=self.base_width)) 119 | 120 | return nn.Sequential(*layers) 121 | 122 | def forward(self, x): 123 | x = self.conv1(x) 124 | x = self.bn1(x) 125 | x = self.relu(x) 126 | x = self.maxpool(x) 127 | 128 | outs = [] 129 | x = self.layer1(x) 130 | x = 
self.layer2(x) 131 | outs.append(x) 132 | x = self.layer3(x) 133 | outs.append(x) 134 | x = self.layer4(x) 135 | outs.append(x) 136 | 137 | return tuple(outs) 138 | 139 | def _freeze_stages(self): 140 | if self.freeze_stages >= 0: 141 | print('Freeze Stage: 0') 142 | self.bn1.eval() 143 | for m in [self.conv1, self.bn1]: 144 | for param in m.parameters(): 145 | param.requires_grad = False 146 | 147 | for i in range(1, self.freeze_stages + 1): 148 | print('Freeze Stage: ' + str(i)) 149 | m = getattr(self, 'layer{}'.format(i)) 150 | m.eval() 151 | for param in m.parameters(): 152 | param.requires_grad = False 153 | 154 | def _freeze_bn(self): 155 | print('Freeze BN') 156 | for m in self.modules(): 157 | if isinstance(m, _BatchNorm): 158 | m.eval() 159 | 160 | def train(self, mode=True): 161 | super(ResNet, self).train(mode) 162 | self._freeze_stages() 163 | 164 | if mode and self.freeze_bn: 165 | for m in self.modules(): 166 | if isinstance(m, _BatchNorm): 167 | m.eval() 168 | 169 | 170 | def resnet50(**kwargs): 171 | """Constructs a ResNet-50 model. 172 | """ 173 | kernel = [[7], [3, 3, 3], [3, 3, 3, 3], [3, 3, 3, 3, 3, 3], [3, 3, 3]] 174 | # kernel = [[5], [3, 3, 3], [3, 1, 3, 1], [3, 1, 3, 1, 3, 1], [3, 1, 3]] 175 | model = ResNet(Bottleneck, [3, 4, 6, 3], 176 | kernels=kernel, 177 | groups=1, 178 | width_per_group=64, 179 | **kwargs) 180 | return model 181 | 182 | 183 | def resnet101(**kwargs): 184 | """Constructs a ResNet-101 model. 185 | """ 186 | # kernel = [[7], [3, 3, 3], [3, 1, 3, 1], [(-1 * (i % 2) + 1)*2 + 1 for i in range(23)], [1, 3, 1]] 187 | kernel = [[7], [3, 3, 3], [3, 3, 3, 3], [3 for _ in range(23)], [3, 3, 3]] 188 | model = ResNet(Bottleneck, [3, 4, 23, 3], 189 | kernels=kernel, 190 | groups=1, 191 | width_per_group=64, 192 | **kwargs) 193 | return model 194 | 195 | 196 | def resnext101(**kwargs): 197 | """Constructs a ResNet-101 model. 
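    (Grouped-convolution variant: groups=32, width_per_group=4,
    i.e. a ResNeXt-101 32x4d backbone.)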
198 | """ 199 | # kernel = [[5], [3, 3, 3], [3, 1, 3, 1], [(-1 * (i % 2) + 1)*2 + 1 for i in range(23)], [1, 3, 1]] 200 | kernel = [[7], [3, 3, 3], [3, 3, 3, 3], [3 for _ in range(23)], [3, 3, 3]] 201 | model = ResNet(Bottleneck, [3, 4, 23, 3], 202 | kernels=kernel, 203 | groups=32, 204 | width_per_group=4, 205 | **kwargs) 206 | return model 207 | -------------------------------------------------------------------------------- /network/tubetk.py: -------------------------------------------------------------------------------- 1 | import time, os 2 | import torch 3 | import torch.nn as nn 4 | from network.resnet import resnet101, resnet50, resnext101 5 | from network.fpn import FPN 6 | from network.track_head import TrackHead 7 | 8 | 9 | class TubeTK(nn.Module): 10 | 11 | def __init__(self, 12 | num_classes, 13 | arg, 14 | pretrained=True 15 | ): 16 | super(TubeTK, self).__init__() 17 | self.arg = arg 18 | if arg.backbone == 'res50': 19 | self.backbone = resnet50(freeze_stages=arg.freeze_stages, fst_l_stride=arg.model_stride[0][0]) 20 | elif arg.backbone == 'res101': 21 | self.backbone = resnet101(freeze_stages=arg.freeze_stages, fst_l_stride=arg.model_stride[0][0]) 22 | elif arg.backbone == 'resx101': 23 | self.backbone = resnext101(freeze_stages=arg.freeze_stages, fst_l_stride=arg.model_stride[0][0]) 24 | else: 25 | raise NotImplementedError 26 | self.neck = FPN(in_channels=[512, 1024, 2048], arg=arg) 27 | self.tube_head = TrackHead(arg=arg, 28 | num_classes=num_classes, 29 | in_channels=self.neck.out_channels, 30 | strides=[[arg.model_stride[i][0]/(arg.forward_frames * 2) * arg.value_range, 31 | arg.model_stride[i][1]/arg.img_size[0] * arg.value_range, 32 | arg.model_stride[i][1]/arg.img_size[1] * arg.value_range] for i in range(5)] 33 | ) 34 | 35 | if pretrained and arg.pretrain_model_path != '': 36 | self.load_pretrain(model_path=arg.pretrain_model_path) 37 | torch.cuda.empty_cache() 38 | 39 | def load_pretrain(self, model_path): 40 | if int(os.environ["RANK"]) == 0: 41 | print('loading JTA Pretrain: ' + str(model_path)) 42 | 43 | pre_model = torch.load(model_path, map_location={'cuda:0': 'cpu', 44 | 'cuda:1': 'cpu', 45 | 'cuda:2': 'cpu', 46 | 'cuda:3': 'cpu', 47 | 'cuda:4': 'cpu', 48 | 'cuda:5': 'cpu', 49 | 'cuda:6': 'cpu', 50 | 'cuda:7': 'cpu'})['state'] 51 | model_dict = self.state_dict() 52 | for key in model_dict: 53 | if model_dict[key].shape != pre_model['module.' + key].shape: 54 | p_shape = model_dict[key].shape 55 | pre_model['module.' + key] = pre_model['module.' + key].repeat(1, 1, p_shape[2], 1, 1) / p_shape[2] 56 | else: 57 | model_dict[key] = pre_model['module.' 
+ key] 58 | self.load_state_dict(model_dict) 59 | del pre_model, model_dict 60 | 61 | def extract_feat(self, x): 62 | x = self.backbone(x) 63 | x = self.neck(x) 64 | return x 65 | 66 | def forward_train(self, 67 | img, 68 | img_metas, 69 | gt_tubes, 70 | gt_labels): 71 | x = self.extract_feat(img) 72 | outs = self.tube_head(x) 73 | loss_inputs = outs + (gt_tubes, gt_labels, img_metas) 74 | losses = self.tube_head.loss(*loss_inputs) 75 | return losses 76 | 77 | def forward_test(self, img, img_meta): 78 | x = self.extract_feat(img) 79 | outs = self.tube_head(x) 80 | tube_inputs = outs + (img_meta, self.arg) 81 | tube_list = self.tube_head.get_tubes(*tube_inputs) 82 | return tube_list 83 | 84 | def forward(self, img, img_meta, return_loss=True, **kwargs): 85 | if return_loss: 86 | return self.forward_train(img, img_meta, **kwargs) 87 | else: 88 | return self.forward_test(img, img_meta) 89 | -------------------------------------------------------------------------------- /network/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def distance2bbox(points, distance, max_shape=None): 6 | """Decode distance prediction to bounding box. 7 | 8 | Args: 9 | points (Tensor): Shape (n, 3), [t, x, y]. 10 | distance (Tensor): Distance from the given point to 4 11 | boundaries (left, top, right, bottom, frDis, 4point, bkDis, 4point). 12 | max_shape (list): Shape of the image. 13 | 14 | Returns: 15 | Tensor: Decoded bboxes. 16 | """ 17 | 18 | mid_t = points[:, 0] 19 | mid_x1 = points[:, 1] - distance[:, 0] 20 | mid_y1 = points[:, 2] - distance[:, 1] 21 | mid_x2 = points[:, 1] + distance[:, 2] 22 | mid_y2 = points[:, 2] + distance[:, 3] 23 | 24 | fr_t = points[:, 0] + distance[:, 4] 25 | fr_x1 = mid_x1 + distance[:, 5] 26 | fr_y1 = mid_y1 + distance[:, 6] 27 | fr_x2 = mid_x2 + distance[:, 7] 28 | fr_y2 = mid_y2 + distance[:, 8] 29 | 30 | bk_t = points[:, 0] - distance[:, 9] 31 | bk_x1 = mid_x1 + distance[:, 10] 32 | bk_y1 = mid_y1 + distance[:, 11] 33 | bk_x2 = mid_x2 + distance[:, 12] 34 | bk_y2 = mid_y2 + distance[:, 13] 35 | 36 | if max_shape is not None: 37 | mid_x1 = mid_x1.clamp(min=0, max=max_shape[2]) 38 | mid_y1 = mid_y1.clamp(min=0, max=max_shape[1]) 39 | mid_x2 = mid_x2.clamp(min=0, max=max_shape[2]) 40 | mid_y2 = mid_y2.clamp(min=0, max=max_shape[1]) 41 | 42 | fr_t = fr_t.clamp(min=0, max=max_shape[0]) 43 | fr_x1 = fr_x1.clamp(min=0, max=max_shape[2]) 44 | fr_y1 = fr_y1.clamp(min=0, max=max_shape[1]) 45 | fr_x2 = fr_x2.clamp(min=0, max=max_shape[2]) 46 | fr_y2 = fr_y2.clamp(min=0, max=max_shape[1]) 47 | 48 | bk_t = bk_t.clamp(min=0, max=max_shape[0]) 49 | bk_x1 = bk_x1.clamp(min=0, max=max_shape[2]) 50 | bk_y1 = bk_y1.clamp(min=0, max=max_shape[1]) 51 | bk_x2 = bk_x2.clamp(min=0, max=max_shape[2]) 52 | bk_y2 = bk_y2.clamp(min=0, max=max_shape[1]) 53 | 54 | return torch.stack([mid_t, mid_x1, mid_y1, mid_x2, mid_y2, 55 | fr_t, fr_x1, fr_y1, fr_x2, fr_y2, 56 | bk_t, bk_x1, bk_y1, bk_x2, bk_y2], -1) 57 | 58 | 59 | def iou_loss(pred_tubes, target_tubes): 60 | ious = tube_iou(pred_tubes, target_tubes) 61 | loss = 1 - ious 62 | return loss 63 | 64 | 65 | def giou_loss(pred_tubes, target_tubes): 66 | gious = tube_giou(pred_tubes, target_tubes) 67 | loss = 1 - gious 68 | loss = loss.clamp(min=0, max=2) 69 | return loss 70 | 71 | 72 | def tube_giou(pred_tubes, target_tubes): 73 | mid_t_pred, mid_bboxes_pred, fr_t_pred, fr_bboxes_pred, bk_t_pred, bk_bboxes_pred = get3bboxes_from_tube(pred_tubes) 74 | mid_t_gt, 
mid_bboxes_gt, fr_t_gt, fr_bboxes_gt, bk_t_gt, bk_bboxes_gt = get3bboxes_from_tube(target_tubes) 75 | 76 | # get giou of mid_frame 77 | tube_vol_pred = volume(area(mid_bboxes_pred), area(fr_bboxes_pred), fr_t_pred - mid_t_pred) + \ 78 | volume(area(mid_bboxes_pred), area(bk_bboxes_pred), mid_t_pred - bk_t_pred) 79 | tube_vol_gt = volume(area(mid_bboxes_gt), area(fr_bboxes_gt), fr_t_gt - mid_t_gt) + \ 80 | volume(area(mid_bboxes_gt), area(bk_bboxes_gt), mid_t_gt - bk_t_gt) 81 | 82 | mid_intersect = bbox_overlaps(mid_bboxes_pred, mid_bboxes_gt) 83 | mid_enclose = bbox_enclose(mid_bboxes_pred, mid_bboxes_gt) 84 | 85 | iou = mid_intersect / (area(mid_bboxes_gt) + area(mid_bboxes_pred) - mid_intersect) 86 | giou = iou - (mid_enclose - (area(mid_bboxes_gt) + area(mid_bboxes_pred) - mid_intersect)) / mid_enclose 87 | 88 | # get intersect of front and back frame 89 | dis_fr_min, fr_bboxes_pred_align_min, fr_bboxes_gt_align_min = \ 90 | align_bbox_on_frame(mid_bboxes_pred, fr_bboxes_pred, fr_t_pred - mid_t_pred, 91 | mid_bboxes_gt, fr_bboxes_gt, fr_t_gt - mid_t_gt) 92 | fr_intersect = bbox_overlaps(fr_bboxes_pred_align_min, fr_bboxes_gt_align_min) 93 | 94 | dis_bk_min, bk_bboxes_pred_align_min, bk_bboxes_gt_align_min = \ 95 | align_bbox_on_frame(mid_bboxes_pred, bk_bboxes_pred, mid_t_pred - bk_t_pred, 96 | mid_bboxes_gt, bk_bboxes_gt, mid_t_gt - bk_t_gt) 97 | bk_intersect = bbox_overlaps(bk_bboxes_pred_align_min, bk_bboxes_gt_align_min) 98 | 99 | # get enclose of front and back frame 100 | dis_fr_max, fr_bboxes_pred_align_max, fr_bboxes_gt_align_max = \ 101 | align_bbox_on_frame(mid_bboxes_pred, fr_bboxes_pred, fr_t_pred - mid_t_pred, 102 | mid_bboxes_gt, fr_bboxes_gt, fr_t_gt - mid_t_gt, mode='max') 103 | fr_enclose = bbox_enclose(fr_bboxes_pred_align_max, fr_bboxes_gt_align_max) 104 | 105 | dis_bk_max, bk_bboxes_pred_align_max, bk_bboxes_gt_align_max = \ 106 | align_bbox_on_frame(mid_bboxes_pred, bk_bboxes_pred, mid_t_pred - bk_t_pred, 107 | mid_bboxes_gt, bk_bboxes_gt, mid_t_gt - bk_t_gt, mode='max') 108 | bk_enclose = bbox_enclose(bk_bboxes_pred_align_max, bk_bboxes_gt_align_max) 109 | 110 | isTube = dis_fr_min + dis_bk_min != 0 111 | intersect = volume(mid_intersect[isTube], fr_intersect[isTube], dis_fr_min[isTube]) + \ 112 | volume(mid_intersect[isTube], bk_intersect[isTube], dis_bk_min[isTube]) 113 | iou[isTube] = intersect / (tube_vol_pred[isTube] + tube_vol_gt[isTube] - intersect) 114 | 115 | enclose = volume(mid_enclose[isTube], fr_enclose[isTube], dis_fr_max[isTube]) + \ 116 | volume(mid_enclose[isTube], bk_enclose[isTube], dis_bk_max[isTube]) 117 | 118 | giou[isTube] = iou[isTube] - (enclose - (tube_vol_pred[isTube] + tube_vol_gt[isTube] - intersect)) / enclose 119 | 120 | return giou 121 | 122 | 123 | def tube_iou(pred_tubes, target_tubes): 124 | mid_t_pred, mid_bboxes_pred, fr_t_pred, fr_bboxes_pred, bk_t_pred, bk_bboxes_pred = get3bboxes_from_tube(pred_tubes) 125 | mid_t_gt, mid_bboxes_gt, fr_t_gt, fr_bboxes_gt, bk_t_gt, bk_bboxes_gt = get3bboxes_from_tube(target_tubes) 126 | 127 | # get the tubes volume 128 | tube_vol_pred = volume(area(mid_bboxes_pred), area(fr_bboxes_pred), fr_t_pred - mid_t_pred) + \ 129 | volume(area(mid_bboxes_pred), area(bk_bboxes_pred), mid_t_pred - bk_t_pred) 130 | tube_vol_gt = volume(area(mid_bboxes_gt), area(fr_bboxes_gt), fr_t_gt - mid_t_gt) + \ 131 | volume(area(mid_bboxes_gt), area(bk_bboxes_gt), mid_t_gt - bk_t_gt) 132 | 133 | # overlap area on mid bbox 134 | mid_overlap = bbox_overlaps(mid_bboxes_pred, mid_bboxes_gt) 135 | 136 | # overlap area on 
front bbox 137 | dis_fr, fr_bboxes_pred_align, fr_bboxes_gt_align = \ 138 | align_bbox_on_frame(mid_bboxes_pred, fr_bboxes_pred, fr_t_pred - mid_t_pred, 139 | mid_bboxes_gt, fr_bboxes_gt, fr_t_gt - mid_t_gt) 140 | fr_overlap = bbox_overlaps(fr_bboxes_pred_align, fr_bboxes_gt_align) 141 | 142 | # overlap area on back bbox 143 | dis_bk, bk_bboxes_pred_align, bk_bboxes_gt_align = \ 144 | align_bbox_on_frame(mid_bboxes_pred, bk_bboxes_pred, mid_t_pred - bk_t_pred, 145 | mid_bboxes_gt, bk_bboxes_gt, mid_t_gt - bk_t_gt) 146 | bk_overlap = bbox_overlaps(bk_bboxes_pred_align, bk_bboxes_gt_align) 147 | 148 | # overlap volume 149 | res = mid_overlap / (area(mid_bboxes_gt) + area(mid_bboxes_pred) - mid_overlap) 150 | isTube = dis_fr + dis_bk != 0 151 | overlap = volume(mid_overlap[isTube], fr_overlap[isTube], dis_fr[isTube]) + \ 152 | volume(mid_overlap[isTube], bk_overlap[isTube], dis_bk[isTube]) 153 | res[isTube] = overlap / (tube_vol_pred[isTube] + tube_vol_gt[isTube] - overlap) 154 | 155 | res = res.clamp(min=1e-5, max=1) 156 | return res 157 | 158 | 159 | def get3bboxes_from_tube(tubes): 160 | mid_t = tubes[:, 0] 161 | mid_bboxes = tubes[:, 1:5] 162 | fr_t = tubes[:, 5] 163 | fr_bboxes = tubes[:, 6:10] 164 | bk_t = tubes[:, 10] 165 | bk_bboxes = tubes[:, 11:15] 166 | return mid_t, mid_bboxes, fr_t, fr_bboxes, bk_t, bk_bboxes 167 | 168 | 169 | def area(bboxes): 170 | a = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1]) 171 | if isinstance(a, np.ndarray): 172 | return np.abs(a) 173 | else: 174 | return torch.abs(a) 175 | 176 | 177 | def volume(bbox1_area, bbox2_area, dis): 178 | return (bbox1_area + bbox2_area + torch.sqrt(bbox1_area + 1e-5) * torch.sqrt(bbox2_area + 1e-5)) * dis 179 | 180 | 181 | def align_bbox_on_frame(mid1, bbox1, t1, mid2, bbox2, t2, mode='min'): 182 | if mode == 'min': 183 | t = torch.min(t1, t2) 184 | else: 185 | t = torch.max(t1, t2) 186 | 187 | t1_zero_ind = t1 == 0 188 | t1_notzero_ind = t1 != 0 189 | bbox1_aligned = torch.zeros(mid1.shape, device=mid1.device) 190 | bbox1_aligned[t1_zero_ind] = mid1[t1_zero_ind] 191 | bbox1_aligned[t1_notzero_ind] = mid1[t1_notzero_ind] * ((t1[t1_notzero_ind]-t[t1_notzero_ind])/(t1[t1_notzero_ind]+1e-4)).unsqueeze(1).repeat(1, 4) + \ 192 | bbox1[t1_notzero_ind] * (t[t1_notzero_ind]/(t1[t1_notzero_ind]+1e-4)).unsqueeze(1).repeat(1, 4) 193 | 194 | t2_zero_ind = t2 == 0 195 | t2_notzero_ind = t2 != 0 196 | bbox2_aligned = torch.zeros(mid2.shape, device=mid2.device) 197 | bbox2_aligned[t2_zero_ind] = mid2[t2_zero_ind] 198 | bbox2_aligned[t2_notzero_ind] = mid2[t2_notzero_ind] * ((t2[t2_notzero_ind]-t[t2_notzero_ind])/(t2[t2_notzero_ind]+1e-4)).unsqueeze(1).repeat(1, 4) + \ 199 | bbox2[t2_notzero_ind] * (t[t2_notzero_ind]/(t2[t2_notzero_ind]+1e-4)).unsqueeze(1).repeat(1, 4) 200 | 201 | return t, bbox1_aligned, bbox2_aligned 202 | 203 | 204 | def bbox_overlaps(bboxes1, bboxes2): 205 | rows = bboxes1.shape[0] 206 | cols = bboxes2.shape[0] 207 | 208 | if rows * cols == 0: 209 | return bboxes1.new(rows, 1) 210 | 211 | if isinstance(bboxes1, np.ndarray): 212 | # To avoid wrong pred bbox which is not left top cord and right bottom cord 213 | lt = np.maximum(np.minimum(bboxes1[:, :2], bboxes1[:, 2:]), np.minimum(bboxes2[:, :2], bboxes2[:, 2:])) 214 | rb = np.minimum(np.maximum(bboxes1[:, 2:], bboxes1[:, :2]), np.maximum(bboxes2[:, 2:], bboxes2[:, :2])) 215 | wh = np.clip(rb - lt, 0, None) 216 | else: 217 | lt = torch.max(torch.min(bboxes1[:, :2], bboxes1[:, 2:]), torch.min(bboxes2[:, :2], bboxes2[:, 2:])) 218 | rb = 
torch.min(torch.max(bboxes1[:, 2:], bboxes1[:, :2]), torch.max(bboxes2[:, 2:], bboxes2[:, :2])) 219 | wh = (rb - lt).clamp(min=0) 220 | overlap = wh[:, 0] * wh[:, 1] 221 | 222 | return overlap 223 | 224 | 225 | def bbox_enclose(bboxes1, bboxes2): 226 | rows = bboxes1.shape[0] 227 | cols = bboxes2.shape[0] 228 | 229 | if rows * cols == 0: 230 | return bboxes1.new(rows, 1) 231 | 232 | if isinstance(bboxes1, np.ndarray): 233 | # To avoid wrong pred bbox which is not left top cord and right bottom cord 234 | lt = np.minimum(np.minimum(bboxes1[:, :2], bboxes1[:, 2:]), 235 | np.minimum(bboxes2[:, :2], bboxes2[:, 2:])) 236 | rb = np.maximum(np.maximum(bboxes1[:, 2:], bboxes1[:, :2]), 237 | np.maximum(bboxes2[:, 2:], bboxes2[:, :2])) 238 | wh = np.clip(rb - lt, 0, None) 239 | else: 240 | lt = torch.min(torch.min(bboxes1[:, :2], bboxes1[:, 2:]), 241 | torch.min(bboxes2[:, :2], bboxes2[:, 2:])) 242 | rb = torch.max(torch.max(bboxes1[:, 2:], bboxes1[:, :2]), 243 | torch.max(bboxes2[:, 2:], bboxes2[:, :2])) 244 | wh = (rb - lt).clamp(min=0) 245 | overlap = wh[:, 0] * wh[:, 1] 246 | 247 | return overlap 248 | 249 | 250 | def bbox_iou_loss(bboxes1, bboxes2): 251 | iou = bbox_iou(bboxes1, bboxes2) 252 | return 1 - iou 253 | 254 | 255 | def bbox_iou(bboxes1, bboxes2): 256 | 257 | overlap = bbox_overlaps(bboxes1, bboxes2) 258 | 259 | area1 = area(bboxes1) 260 | area2 = area(bboxes2) 261 | 262 | ious = overlap / (area1 + area2 - overlap) 263 | 264 | return ious 265 | -------------------------------------------------------------------------------- /optim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoPang1996/TubeTK/bcca334c5348f9ae33e04595e1af93cf8351e50e/optim/__init__.py -------------------------------------------------------------------------------- /optim/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from bisect import bisect_right 3 | import torch 4 | import numpy as np 5 | 6 | 7 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 8 | def __init__( 9 | self, 10 | optimizer, 11 | milestones, 12 | gamma=0.1, 13 | warmup_factor=1.0 / 3, 14 | warmup_iters=500, 15 | warmup_method="linear", 16 | last_epoch=-1, 17 | ): 18 | if not list(milestones) == sorted(milestones): 19 | raise ValueError( 20 | "Milestones should be a list of" " increasing integers. 
Got {}", 21 | milestones, 22 | ) 23 | 24 | if warmup_method not in ("constant", "linear"): 25 | raise ValueError( 26 | "Only 'constant' or 'linear' warmup_method accepted" 27 | "got {}".format(warmup_method) 28 | ) 29 | self.milestones = milestones 30 | self.gamma = gamma 31 | self.warmup_factor = warmup_factor 32 | self.warmup_iters = warmup_iters 33 | self.warmup_method = warmup_method 34 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 35 | 36 | def get_lr(self): 37 | warmup_factor = 1 38 | if self.last_epoch < self.warmup_iters: 39 | if self.warmup_method == "constant": 40 | warmup_factor = self.warmup_factor 41 | elif self.warmup_method == "linear": 42 | alpha = float(self.last_epoch) / self.warmup_iters 43 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 44 | return [ 45 | base_lr 46 | * warmup_factor 47 | * self.gamma ** bisect_right(self.milestones, self.last_epoch) 48 | for base_lr in self.base_lrs 49 | ] 50 | 51 | 52 | class HalfPeriodCosStepLR(torch.optim.lr_scheduler._LRScheduler): 53 | def __init__( 54 | self, 55 | optimizer, 56 | warmup_factor=1.0 / 3, 57 | warmup_iters=8000, 58 | max_iters=93750, 59 | warmup_method="linear", 60 | last_epoch=-1, 61 | ): 62 | if warmup_method not in ("constant", "linear"): 63 | raise ValueError( 64 | "Only 'constant' or 'linear' warmup_method accepted" 65 | "got {}".format(warmup_method) 66 | ) 67 | self.warmup_factor = warmup_factor 68 | self.warmup_iters = warmup_iters 69 | self.max_iters = max_iters 70 | self.warmup_method = warmup_method 71 | super(HalfPeriodCosStepLR, self).__init__(optimizer, last_epoch) 72 | 73 | def get_lr(self): 74 | warmup_factor = 1 75 | if self.last_epoch < self.warmup_iters: 76 | if self.warmup_method == "constant": 77 | warmup_factor = self.warmup_factor 78 | elif self.warmup_method == "linear": 79 | alpha = float(self.last_epoch) / self.warmup_iters 80 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 81 | else: 82 | warmup_factor = 0.5 * (np.cos(self.last_epoch / self.max_iters * np.pi) + 1) 83 | return [ 84 | base_lr 85 | * warmup_factor 86 | for base_lr in self.base_lrs 87 | ] 88 | -------------------------------------------------------------------------------- /optim/solver.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | 4 | 5 | def make_optimizer(arg, model): 6 | params = [] 7 | bn_param_set = set() 8 | for name, module in model.named_modules(): 9 | if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)): 10 | bn_param_set.add(name+".weight") 11 | bn_param_set.add(name+".bias") 12 | for key, value in model.named_parameters(): 13 | if not value.requires_grad: 14 | continue 15 | lr = arg.lr 16 | weight_decay = arg.weight_decay 17 | if key in bn_param_set: 18 | weight_decay = arg.weight_decay * 0 19 | elif "bias" in key: 20 | lr = arg.lr * 1 21 | weight_decay = arg.weight_decay 22 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 23 | 24 | optimizer = torch.optim.SGD(params, arg.lr, momentum=0.9) 25 | return optimizer 26 | -------------------------------------------------------------------------------- /post_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoPang1996/TubeTK/bcca334c5348f9ae33e04595e1af93cf8351e50e/post_processing/__init__.py -------------------------------------------------------------------------------- /post_processing/nms/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoPang1996/TubeTK/bcca334c5348f9ae33e04595e1af93cf8351e50e/post_processing/nms/__init__.py -------------------------------------------------------------------------------- /post_processing/nms/setup.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from setuptools import setup, Extension 3 | 4 | import numpy as np 5 | from Cython.Build import cythonize 6 | from Cython.Distutils import build_ext 7 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 8 | 9 | ext_args = dict( 10 | include_dirs=[np.get_include()], 11 | language='c++', 12 | extra_compile_args={ 13 | 'cc': ['-Wno-unused-function', '-Wno-write-strings'], 14 | 'nvcc': ['-c', '--compiler-options', '-fPIC'], 15 | }, 16 | ) 17 | 18 | extensions = [ 19 | Extension('soft_nms_cpu', ['src/soft_nms_cpu.pyx'], **ext_args), 20 | ] 21 | 22 | 23 | def customize_compiler_for_nvcc(self): 24 | """inject deep into distutils to customize how the dispatch 25 | to cc/nvcc works. 26 | If you subclass UnixCCompiler, it's not trivial to get your subclass 27 | injected in, and still have the right customizations (i.e. 28 | distutils.sysconfig.customize_compiler) run on it. So instead of going 29 | the OO route, I have this. Note, it's kindof like a wierd functional 30 | subclassing going on.""" 31 | 32 | # tell the compiler it can processes .cu 33 | self.src_extensions.append('.cu') 34 | 35 | # save references to the default compiler_so and _comple methods 36 | default_compiler_so = self.compiler_so 37 | super = self._compile 38 | 39 | # now redefine the _compile method. This gets executed for each 40 | # object but distutils doesn't have the ability to change compilers 41 | # based on source extension: we add it. 42 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 43 | if osp.splitext(src)[1] == '.cu': 44 | # use the cuda for .cu files 45 | self.set_executable('compiler_so', 'nvcc') 46 | # use only a subset of the extra_postargs, which are 1-1 translated 47 | # from the extra_compile_args in the Extension class 48 | postargs = extra_postargs['nvcc'] 49 | else: 50 | postargs = extra_postargs['cc'] 51 | 52 | super(obj, src, ext, cc_args, postargs, pp_opts) 53 | # reset the default compiler_so, which we might have changed for cuda 54 | self.compiler_so = default_compiler_so 55 | 56 | # inject our redefined _compile method into the class 57 | self._compile = _compile 58 | 59 | 60 | class custom_build_ext(build_ext): 61 | 62 | def build_extensions(self): 63 | customize_compiler_for_nvcc(self.compiler) 64 | build_ext.build_extensions(self) 65 | 66 | 67 | setup( 68 | name='soft_nms', 69 | cmdclass={'build_ext': custom_build_ext}, 70 | ext_modules=cythonize(extensions), 71 | ) 72 | 73 | setup( 74 | name='nms_cuda', 75 | ext_modules=[ 76 | CUDAExtension('nms_cuda', [ 77 | 'src/nms_cuda.cpp', 78 | 'src/nms_kernel.cu', 79 | ]), 80 | CUDAExtension('nms_cpu', [ 81 | 'src/nms_cpu.cpp', 82 | ]), 83 | ], 84 | cmdclass={'build_ext': BuildExtension}) 85 | -------------------------------------------------------------------------------- /post_processing/nms/src/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
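// Greedy CPU non-maximum suppression: detections (x1, y1, x2, y2, score) are
// visited in descending score order, and every remaining box whose IoU with
// the current kept box reaches `threshold` is marked as suppressed. The
// function returns the indices of the surviving detections.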
2 | #include 3 | 4 | template 5 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) { 6 | AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); 7 | 8 | if (dets.numel() == 0) { 9 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 10 | } 11 | 12 | auto x1_t = dets.select(1, 0).contiguous(); 13 | auto y1_t = dets.select(1, 1).contiguous(); 14 | auto x2_t = dets.select(1, 2).contiguous(); 15 | auto y2_t = dets.select(1, 3).contiguous(); 16 | auto scores = dets.select(1, 4).contiguous(); 17 | 18 | at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); 19 | 20 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 21 | 22 | auto ndets = dets.size(0); 23 | at::Tensor suppressed_t = 24 | at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); 25 | 26 | auto suppressed = suppressed_t.data(); 27 | auto order = order_t.data(); 28 | auto x1 = x1_t.data(); 29 | auto y1 = y1_t.data(); 30 | auto x2 = x2_t.data(); 31 | auto y2 = y2_t.data(); 32 | auto areas = areas_t.data(); 33 | 34 | for (int64_t _i = 0; _i < ndets; _i++) { 35 | auto i = order[_i]; 36 | if (suppressed[i] == 1) continue; 37 | auto ix1 = x1[i]; 38 | auto iy1 = y1[i]; 39 | auto ix2 = x2[i]; 40 | auto iy2 = y2[i]; 41 | auto iarea = areas[i]; 42 | 43 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 44 | auto j = order[_j]; 45 | if (suppressed[j] == 1) continue; 46 | auto xx1 = std::max(ix1, x1[j]); 47 | auto yy1 = std::max(iy1, y1[j]); 48 | auto xx2 = std::min(ix2, x2[j]); 49 | auto yy2 = std::min(iy2, y2[j]); 50 | 51 | auto w = std::max(static_cast(0), xx2 - xx1 + 1); 52 | auto h = std::max(static_cast(0), yy2 - yy1 + 1); 53 | auto inter = w * h; 54 | auto ovr = inter / (iarea + areas[j] - inter); 55 | if (ovr >= threshold) suppressed[j] = 1; 56 | } 57 | } 58 | return at::nonzero(suppressed_t == 0).squeeze(1); 59 | } 60 | 61 | at::Tensor nms(const at::Tensor& dets, const float threshold) { 62 | at::Tensor result; 63 | AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { 64 | result = nms_cpu_kernel(dets, threshold); 65 | }); 66 | return result; 67 | } 68 | 69 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 70 | m.def("nms", &nms, "non-maximum suppression"); 71 | } -------------------------------------------------------------------------------- /post_processing/nms/src/nms_cuda.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #include 3 | 4 | #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") 5 | 6 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh, float side_nms_overlap_thresh); 7 | 8 | at::Tensor nms(const at::Tensor& dets, const float threshold, const float side_threshold) { 9 | CHECK_CUDA(dets); 10 | if (dets.numel() == 0) 11 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 12 | return nms_cuda(dets, threshold, side_threshold); 13 | } 14 | 15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 16 | m.def("nms", &nms, "non-maximum suppression"); 17 | } -------------------------------------------------------------------------------- /post_processing/nms/src/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
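// CUDA non-maximum suppression over TubeTK's 15-dim tubes
// (mid_t, mid box, front_t, front box, back_t, back box) plus a trailing
// score column. Tubes are sorted by score; each CUDA block compares a
// row-chunk of tubes against a column-chunk cached in shared memory. The
// front and back boxes of the two tubes are first linearly interpolated
// ("aligned") to the shorter temporal extent, and a tube is suppressed only
// when the mid, front and back IoUs all exceed their thresholds. Results are
// packed into one 64-bit mask per (row tube, column block) and resolved on
// the host in nms_cuda().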
2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 12 | 13 | int const bbox_dim = 15; 14 | 15 | 16 | struct res 17 | { 18 | float* bbox1_aligned; 19 | float* bbox2_aligned; 20 | }; 21 | 22 | __device__ inline res align_bbox_on_frame(float const * const mid1, float const *const bbox1, float const* t1, \ 23 | 24 | float const * const mid2, float const * const bbox2, float const* t2, float const* mid_t){ 25 | float d1 = abs(*t1 - *mid_t), d2 = abs(*t2 - *mid_t); 26 | float t = min(d1, d2); 27 | float bbox1_aligned[4], bbox2_aligned[4]; 28 | if (d1 != 0){ 29 | for (int i=0; i< 4; i++){ 30 | bbox1_aligned[i] = mid1[i] * ((d1-t)/ d1) + bbox1[i] * (t/ d1); 31 | } 32 | }else{ 33 | for (int i=0; i< 4; i++){ 34 | bbox1_aligned[i] = mid1[i]; 35 | } 36 | } 37 | if (d2 != 0){ 38 | for (int i=0; i< 4; i++){ 39 | bbox2_aligned[i] = mid2[i] * ((d2-t)/ d2) + bbox2[i] * (t/ d2); 40 | } 41 | }else{ 42 | for (int i=0; i< 4; i++){ 43 | bbox2_aligned[i] = mid2[i]; 44 | } 45 | } 46 | res aligned_bbox = {bbox1_aligned, bbox2_aligned}; 47 | return aligned_bbox; 48 | } 49 | 50 | __device__ inline float devIoU(float const * const a, float const * const b) { 51 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 52 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 53 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 54 | float interS = width * height; 55 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 56 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 57 | return interS / (Sa + Sb - interS); 58 | } 59 | 60 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, const float side_nms_overlap_thresh, 61 | const float *dev_boxes, unsigned long long *dev_mask) { 62 | const int row_start = blockIdx.y; 63 | const int col_start = blockIdx.x; 64 | 65 | // if (row_start > col_start) return; 66 | 67 | const int row_size = 68 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 69 | const int col_size = 70 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 71 | 72 | __shared__ float block_boxes[threadsPerBlock * (bbox_dim + 1)]; 73 | if (threadIdx.x < col_size) { 74 | int d = 0; 75 | for (;d <= bbox_dim; d ++){ 76 | block_boxes[threadIdx.x * (bbox_dim + 1) + d] = 77 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * (bbox_dim + 1) + d]; 78 | } 79 | } 80 | __syncthreads(); 81 | 82 | if (threadIdx.x < row_size) { 83 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 84 | const float *cur_box = dev_boxes + cur_box_idx * (bbox_dim + 1); 85 | int i = 0; 86 | unsigned long long t = 0; 87 | int start = 0; 88 | if (row_start == col_start) { 89 | start = threadIdx.x + 1; 90 | } 91 | for (i = start; i < col_size; i++) { 92 | if (bbox_dim == 4){ 93 | if (devIoU(cur_box, block_boxes + i * (bbox_dim + 1)) > nms_overlap_thresh) { 94 | t |= 1ULL << i; 95 | } 96 | } 97 | else if(bbox_dim == 15){ 98 | const float *cur_box_mid = cur_box + 1; 99 | const float *cur_box_fr = cur_box + 6; 100 | const float *cur_box_bk = cur_box + 11; 101 | 102 | const float *block_boxes_mid = block_boxes + i * (bbox_dim + 1) + 1; 103 | const float *block_boxes_fr = block_boxes + i * (bbox_dim + 1) + 6; 104 | const float *block_boxes_bk = block_boxes + i * (bbox_dim + 1) + 11; 105 | 106 | res aligned_bbox_fr = align_bbox_on_frame(cur_box_mid, cur_box_fr, cur_box + 5, 107 | block_boxes_mid, block_boxes_fr, block_boxes + i * (bbox_dim + 1) + 5, 
cur_box); 108 | const float * cur_box_fr_aligned = aligned_bbox_fr.bbox1_aligned; 109 | const float * block_boxes_fr_aligned = aligned_bbox_fr.bbox2_aligned; 110 | 111 | res aligned_bbox_bk = align_bbox_on_frame(cur_box_mid, cur_box_bk, cur_box + 10, 112 | block_boxes_mid, block_boxes_bk, block_boxes + i * (bbox_dim + 1) + 10, cur_box); 113 | const float * cur_box_bk_aligned = aligned_bbox_bk.bbox1_aligned; 114 | const float * block_boxes_bk_aligned = aligned_bbox_bk.bbox2_aligned; 115 | 116 | if (devIoU(cur_box_mid, block_boxes_mid) > nms_overlap_thresh && 117 | devIoU(cur_box_fr_aligned, block_boxes_fr_aligned) > side_nms_overlap_thresh && 118 | devIoU(cur_box_bk_aligned, block_boxes_bk_aligned) > side_nms_overlap_thresh) { 119 | t |= 1ULL << i; 120 | } 121 | } 122 | 123 | 124 | } 125 | const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); 126 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 127 | } 128 | } 129 | 130 | // boxes is a N x 5 tensor 131 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh, float side_nms_overlap_thresh) { 132 | using scalar_t = float; 133 | AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); 134 | auto scores = boxes.select(1, bbox_dim); 135 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 136 | auto boxes_sorted = boxes.index_select(0, order_t); 137 | 138 | int boxes_num = boxes.size(0); 139 | 140 | const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); 141 | 142 | scalar_t* boxes_dev = boxes_sorted.data(); 143 | 144 | THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState 145 | 146 | unsigned long long* mask_dev = NULL; 147 | //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, 148 | // boxes_num * col_blocks * sizeof(unsigned long long))); 149 | 150 | mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); 151 | 152 | dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), 153 | THCCeilDiv(boxes_num, threadsPerBlock)); 154 | dim3 threads(threadsPerBlock); 155 | nms_kernel<<>>(boxes_num, 156 | nms_overlap_thresh, 157 | side_nms_overlap_thresh, 158 | boxes_dev, 159 | mask_dev); 160 | 161 | std::vector mask_host(boxes_num * col_blocks); 162 | THCudaCheck(cudaMemcpy(&mask_host[0], 163 | mask_dev, 164 | sizeof(unsigned long long) * boxes_num * col_blocks, 165 | cudaMemcpyDeviceToHost)); 166 | 167 | std::vector remv(col_blocks); 168 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 169 | 170 | at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); 171 | int64_t* keep_out = keep.data(); 172 | 173 | int num_to_keep = 0; 174 | for (int i = 0; i < boxes_num; i++) { 175 | int nblock = i / threadsPerBlock; 176 | int inblock = i % threadsPerBlock; 177 | 178 | if (!(remv[nblock] & (1ULL << inblock))) { 179 | keep_out[num_to_keep++] = i; 180 | unsigned long long *p = &mask_host[0] + i * col_blocks; 181 | for (int j = nblock; j < col_blocks; j++) { 182 | remv[j] |= p[j]; 183 | } 184 | } 185 | } 186 | 187 | THCudaFree(state, mask_dev); 188 | // TODO improve this part 189 | return std::get<0>(order_t.index({ 190 | keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( 191 | order_t.device(), keep.scalar_type()) 192 | }).sort(0, false)); 193 | } -------------------------------------------------------------------------------- /post_processing/nms/src/soft_nms_cpu.pyx: -------------------------------------------------------------------------------- 1 | # 
---------------------------------------------------------- 2 | # Soft-NMS: Improving Object Detection With One Line of Code 3 | # Copyright (c) University of Maryland, College Park 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Navaneeth Bodla and Bharat Singh 6 | # Modified by Kai Chen 7 | # ---------------------------------------------------------- 8 | 9 | # cython: language_level=3, boundscheck=False 10 | 11 | import numpy as np 12 | cimport numpy as np 13 | 14 | 15 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 16 | return a if a >= b else b 17 | 18 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 19 | return a if a <= b else b 20 | 21 | 22 | def soft_nms_cpu( 23 | np.ndarray[float, ndim=2] boxes_in, 24 | float iou_thr, 25 | unsigned int method=1, 26 | float sigma=0.5, 27 | float min_score=0.001, 28 | ): 29 | boxes = boxes_in.copy() 30 | cdef unsigned int N = boxes.shape[0] 31 | cdef float iw, ih, box_area 32 | cdef float ua 33 | cdef int pos = 0 34 | cdef float maxscore = 0 35 | cdef int maxpos = 0 36 | cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov 37 | inds = np.arange(N) 38 | 39 | for i in range(N): 40 | maxscore = boxes[i, 4] 41 | maxpos = i 42 | 43 | tx1 = boxes[i, 0] 44 | ty1 = boxes[i, 1] 45 | tx2 = boxes[i, 2] 46 | ty2 = boxes[i, 3] 47 | ts = boxes[i, 4] 48 | ti = inds[i] 49 | 50 | pos = i + 1 51 | # get max box 52 | while pos < N: 53 | if maxscore < boxes[pos, 4]: 54 | maxscore = boxes[pos, 4] 55 | maxpos = pos 56 | pos = pos + 1 57 | 58 | # add max box as a detection 59 | boxes[i, 0] = boxes[maxpos, 0] 60 | boxes[i, 1] = boxes[maxpos, 1] 61 | boxes[i, 2] = boxes[maxpos, 2] 62 | boxes[i, 3] = boxes[maxpos, 3] 63 | boxes[i, 4] = boxes[maxpos, 4] 64 | inds[i] = inds[maxpos] 65 | 66 | # swap ith box with position of max box 67 | boxes[maxpos, 0] = tx1 68 | boxes[maxpos, 1] = ty1 69 | boxes[maxpos, 2] = tx2 70 | boxes[maxpos, 3] = ty2 71 | boxes[maxpos, 4] = ts 72 | inds[maxpos] = ti 73 | 74 | tx1 = boxes[i, 0] 75 | ty1 = boxes[i, 1] 76 | tx2 = boxes[i, 2] 77 | ty2 = boxes[i, 3] 78 | ts = boxes[i, 4] 79 | 80 | pos = i + 1 81 | # NMS iterations, note that N changes if detection boxes fall below 82 | # threshold 83 | while pos < N: 84 | x1 = boxes[pos, 0] 85 | y1 = boxes[pos, 1] 86 | x2 = boxes[pos, 2] 87 | y2 = boxes[pos, 3] 88 | s = boxes[pos, 4] 89 | 90 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 91 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 92 | if iw > 0: 93 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 94 | if ih > 0: 95 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 96 | ov = iw * ih / ua # iou between max box and detection box 97 | 98 | if method == 1: # linear 99 | if ov > iou_thr: 100 | weight = 1 - ov 101 | else: 102 | weight = 1 103 | elif method == 2: # gaussian 104 | weight = np.exp(-(ov * ov) / sigma) 105 | else: # original NMS 106 | if ov > iou_thr: 107 | weight = 0 108 | else: 109 | weight = 1 110 | 111 | boxes[pos, 4] = weight * boxes[pos, 4] 112 | 113 | # if box score falls below threshold, discard the box by 114 | # swapping with last box update N 115 | if boxes[pos, 4] < min_score: 116 | boxes[pos, 0] = boxes[N-1, 0] 117 | boxes[pos, 1] = boxes[N-1, 1] 118 | boxes[pos, 2] = boxes[N-1, 2] 119 | boxes[pos, 3] = boxes[N-1, 3] 120 | boxes[pos, 4] = boxes[N-1, 4] 121 | inds[pos] = inds[N - 1] 122 | N = N - 1 123 | pos = pos - 1 124 | 125 | pos = pos + 1 126 | 127 | return boxes[:N], inds[:N] 128 | 
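# -----------------------------------------------------------------------------
# Editor's illustrative sketch -- not part of soft_nms_cpu.pyx. It shows how the
# Cython extension declared in the setup.py above might be called once it has
# been built (for example with `python setup.py build_ext --inplace` run from
# post_processing/nms/, so the compiled module is importable). The CUDA
# counterparts nms_cuda / nms_cpu are wrapped by post_processing/tube_nms.py.
import numpy as np
import soft_nms_cpu  # compiled from src/soft_nms_cpu.pyx

# Detections as [x1, y1, x2, y2, score]; float32 is required because the .pyx
# signature types the buffer as C float.
dets = np.array([
    [10,  10,  60,  60, 0.95],
    [12,  12,  62,  62, 0.90],   # near-duplicate of the first box
    [100, 100, 150, 150, 0.80],
    [200, 200, 240, 240, 0.05],
], dtype=np.float32)

# method=1 -> linear decay, method=2 -> gaussian decay, otherwise hard NMS.
kept, kept_inds = soft_nms_cpu.soft_nms_cpu(
    dets, iou_thr=0.5, method=1, sigma=0.5, min_score=0.001)
# `kept` holds the surviving boxes in selection order: overlapping boxes are
# down-weighted rather than immediately dropped, and a box is removed only once
# its decayed score falls below min_score. `kept_inds` maps the survivors back
# to rows of the original `dets` array.
# -----------------------------------------------------------------------------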
-------------------------------------------------------------------------------- /post_processing/tube_iou_matching.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import cv2 3 | import os 4 | import datetime 5 | import numpy as np 6 | import pandas as pd 7 | from tqdm import tqdm 8 | from network.utils import bbox_iou 9 | from datetime import datetime 10 | import multiprocessing 11 | from scipy.optimize import linear_sum_assignment 12 | 13 | 14 | class Track: 15 | ''' 16 | Track is the class of track. it contains all the node and manages the node. it contains the following information: 17 | 1) all the nodes 18 | 2) track id. it is unique it identify each track 19 | 3) track pool id. it is a number to give a new id to a new track 20 | 4) age. age indicates how old is the track 21 | 5) max_age. indicates the dead age of this track 22 | ''' 23 | _id_pool = 1 24 | def __init__(self): 25 | self.nodes = list() 26 | self.frames = {} 27 | self.mid_frames = {} 28 | self.id = Track._id_pool 29 | Track._id_pool += 1 30 | self.color = tuple((np.random.rand(3) * 255).astype(int).tolist()) 31 | self.prev_direction = None 32 | 33 | def update_frames(self, all_tube_frames, tube_boxes, mid_frame, mid_box, score, tube_direction): 34 | 35 | for frame, frame_box in zip(all_tube_frames, tube_boxes): 36 | if frame not in self.frames: 37 | self.frames[frame] = [frame_box, 1, score] 38 | else: 39 | self.frames[frame][0] += frame_box.astype(np.float) 40 | self.frames[frame][1] += 1 41 | self.frames[frame][2] += score 42 | 43 | if mid_frame not in self.mid_frames: 44 | self.mid_frames[mid_frame] = [mid_box.astype(np.float), 1, score] 45 | else: 46 | self.mid_frames[mid_frame][0] += mid_box.astype(np.float) 47 | self.mid_frames[mid_frame][1] += 1 48 | self.mid_frames[mid_frame][2] += score 49 | 50 | def get_center(box): 51 | return np.array(((box[0] + box[2]) / 2, (box[1] + box[3]) / 2)) 52 | 53 | front_frame = np.max(all_tube_frames) 54 | back_frame = np.min(all_tube_frames) 55 | 56 | end_box = self.frames[front_frame][0] / self.frames[front_frame][1] 57 | start_box = self.frames[back_frame][0] / self.frames[back_frame][1] 58 | self.prev_direction = np.zeros(3) 59 | self.prev_direction[:2] = get_center(end_box) - get_center(start_box) 60 | self.prev_direction[2] = front_frame - back_frame 61 | 62 | 63 | def track_tube_iou(track_boxes, tube_boxes): 64 | track_boxes = np.atleast_3d(track_boxes).astype(np.float) # (n_track, n_tbbox, 4) 65 | tube_boxes = np.atleast_2d(tube_boxes).astype(np.float) # (n_tbbox, 4) 66 | 67 | def track_tube_overlaps(bboxes1, bboxes2): 68 | lt = np.maximum(np.minimum(bboxes1[:, :, :2], bboxes1[:, :, 2:]), np.minimum(bboxes2[:, :2], bboxes2[:, 2:])) # [rows, 2] 69 | rb = np.minimum(np.maximum(bboxes1[:, :, 2:], bboxes1[:, :, :2]), np.maximum(bboxes2[:, 2:], bboxes2[:, :2])) # [rows, 2] 70 | wh = np.clip(rb - lt, 0, None) 71 | overlap = wh[:, :, 0] * wh[:, :, 1] 72 | return overlap 73 | 74 | overlap = track_tube_overlaps(track_boxes, tube_boxes) 75 | 76 | area1 = (track_boxes[:, :, 2] - track_boxes[:, :, 0]) * (track_boxes[:, :, 3] - track_boxes[:, :, 1]) 77 | area1 = np.abs(area1) 78 | area2 = (tube_boxes[:, 2] - tube_boxes[:, 0]) * (tube_boxes[:, 3] - tube_boxes[:, 1]) 79 | area2 = np.abs(area2) 80 | 81 | ious = overlap / (area1 + area2 - overlap) 82 | 83 | return ious 84 | 85 | 86 | def get_shape_diff(track_boxes, tube_boxes): 87 | track_boxes = np.atleast_3d(track_boxes).astype(np.float) # (n_track, n_tbbox, 4) 88 | tube_boxes = 
np.atleast_2d(tube_boxes).astype(np.float) # (n_tbbox, 4) 89 | 90 | track_height = track_boxes[:, :, 2] - track_boxes[:, :, 0] 91 | track_width = track_boxes[:, :, 3] - track_boxes[:, :, 1] 92 | tube_height = tube_boxes[:, 2] - tube_boxes[:, 0] 93 | tube_width = tube_boxes[:, 3] - tube_boxes[:, 1] 94 | 95 | diff = np.abs(track_height - tube_height) / (track_height + tube_height) + \ 96 | np.abs(track_width - tube_width) / (track_width + tube_width) 97 | 98 | return np.exp(1.5 * -diff) 99 | 100 | 101 | def update_tracks_fast(tracks, tube, arg): 102 | mid_frame = tube[0].astype(np.int) 103 | mid_box = tube[1:5] 104 | end_frame = tube[5].astype(np.int) 105 | end_box = tube[6:10] 106 | start_frame = tube[10].astype(np.int) 107 | start_box = tube[11:15] 108 | score = tube[15] 109 | 110 | def get_center(box): 111 | return np.array(((box[0] + box[2]) / 2, (box[1] + box[3]) / 2)) 112 | 113 | back_frames = np.arange(start_frame, mid_frame) 114 | front_frames = np.arange(mid_frame + 1, end_frame + 1) 115 | all_tube_frames = np.arange(start_frame, end_frame + 1) 116 | 117 | back_start_coef = (mid_frame - back_frames) / (mid_frame - start_frame) 118 | back_mid_coef = (back_frames - start_frame) / (mid_frame - start_frame) 119 | front_mid_coef = (end_frame - front_frames) / (end_frame - mid_frame) 120 | front_end_coef = (front_frames - mid_frame) / (end_frame - mid_frame) 121 | 122 | back_frame_boxes = np.outer(back_start_coef, start_box) + np.outer(back_mid_coef, mid_box) 123 | front_frame_boxes = np.outer(front_end_coef, end_box) + np.outer(front_mid_coef, mid_box) 124 | 125 | tube_boxes = np.concatenate((back_frame_boxes, mid_box[None], front_frame_boxes)) 126 | tube_frame_num = len(all_tube_frames) 127 | 128 | depth_divider = 8 129 | 130 | tube_direction = np.zeros(3) 131 | tube_direction[:2] = get_center(end_box) - get_center(start_box) 132 | tube_direction[2] = np.max(all_tube_frames) - np.min(all_tube_frames) 133 | tube_direction[2] /= depth_divider 134 | 135 | if len(tracks) == 0: 136 | new_track = Track() 137 | new_track.update_frames(all_tube_frames, tube_boxes, mid_frame, mid_box, score, tube_direction) 138 | tracks.append(new_track) 139 | return 140 | 141 | all_has_frame = np.zeros((len(tracks), tube_frame_num), dtype=np.bool) 142 | all_track_boxes = np.zeros((len(tracks), *tube_boxes.shape)) 143 | track_direction = np.zeros((len(tracks), 3)) 144 | 145 | for track_idx, track in enumerate(tracks): 146 | # overlap_area = [1e8, -1] 147 | if track.prev_direction is not None: 148 | track_direction[track_idx, :] = track.prev_direction 149 | 150 | for i, frame in enumerate(all_tube_frames): 151 | if frame not in track.frames: 152 | continue 153 | all_has_frame[track_idx, i] = True 154 | all_track_boxes[track_idx, i, :] = \ 155 | track.frames[frame][0] / track.frames[frame][1] 156 | # overlap_area[0] = min(overlap_area[0], frame) 157 | # overlap_area[1] = max(overlap_area[1], frame) 158 | 159 | # if overlap_area[1] < 0: 160 | # continue 161 | # while overlap_area[0] - 1 in track.frames and overlap_area[1] - overlap_area[0] + 1 < tube_frame_num: 162 | # overlap_area[0] -= 1 163 | # while overlap_area[1] + 1 in track.frames and overlap_area[1] - overlap_area[0] + 1 < tube_frame_num: 164 | # overlap_area[1] += 1 165 | # track_direction[track_idx, :2] = get_center(track.frames[overlap_area[1]][0] / track.frames[overlap_area[1]][1]) - \ 166 | # get_center(track.frames[overlap_area[0]][0] / track.frames[overlap_area[0]][1]) 167 | # track_direction[track_idx, 2] = overlap_area[1] - overlap_area[0] 168 | 
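    # --- Editor's note (illustration only, not original code) -----------------
    # The linking score computed below is the tube/track IoU averaged over the
    # frames they share, scaled by direction agreement:
    #     score = mean_IoU * (1 + arg.cos_weight * angle_cos)
    # For example, with an assumed arg.cos_weight of 0.3, shared-frame IoUs of
    # [0.6, 0.5, 0.7] (mean 0.6) give 0.6 * 1.3 = 0.78 when the track moves in
    # the same direction as the tube (angle_cos = 1), but only 0.6 * 0.7 = 0.42
    # when it moves the opposite way (angle_cos = -1). The tube is appended to
    # the best-scoring track if that score exceeds arg.linking_min_iou,
    # otherwise it starts a new Track.
    # ---------------------------------------------------------------------------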
169 | track_direction[:, 2] /= depth_divider 170 | 171 | has_overlap = (np.sum(all_has_frame, axis=1) > 0) 172 | all_iou = np.zeros(all_has_frame.shape, dtype=np.float) 173 | shape_diff = np.zeros(all_has_frame.shape, dtype=np.float) 174 | all_iou[has_overlap] = track_tube_iou(all_track_boxes[has_overlap], tube_boxes) 175 | shape_diff[has_overlap] = get_shape_diff(all_track_boxes[has_overlap], tube_boxes) 176 | 177 | mean_all_iou = np.zeros(has_overlap.shape, dtype=np.float) 178 | mean_all_iou[has_overlap] = np.sum(all_iou[has_overlap], axis=1) / np.sum(all_has_frame[has_overlap], axis=1) 179 | 180 | angle_cos = np.ones_like(mean_all_iou) 181 | norm_mul = np.linalg.norm(track_direction, axis=1) * np.linalg.norm(tube_direction) 182 | 183 | cos_mask = np.logical_and(has_overlap, norm_mul > 0) 184 | angle_cos[cos_mask] = np.dot(track_direction[cos_mask], tube_direction) / norm_mul[cos_mask] 185 | 186 | mean_all_iou = mean_all_iou * (1 + arg.cos_weight * angle_cos) 187 | max_idx = np.argmax(mean_all_iou) 188 | 189 | if mean_all_iou[max_idx] > arg.linking_min_iou: 190 | tracks[max_idx].update_frames(all_tube_frames, tube_boxes, mid_frame, mid_box, score, tube_direction) 191 | else: 192 | new_track = Track() 193 | new_track.update_frames(all_tube_frames, tube_boxes, mid_frame, mid_box, score, tube_direction) 194 | tracks.append(new_track) 195 | 196 | 197 | def filt_bbox(save_path): 198 | def bboxfilt(res, l=8): 199 | max_frame = np.max(res[0]) 200 | range_mask = (res[6] >= l) | (res[0] <= 8) | (res[0] + 8 >= max_frame) 201 | return res[range_mask] 202 | 203 | def trackfilt(track, l=16): 204 | max_fid = int(np.max(track[0])) 205 | min_fid = int(np.min(track[0])) 206 | return max_fid - min_fid < 5 207 | # max_frame = np.max(track.iloc[:, 0]) 208 | # range_mask = (track[0] > 8) & (track[0] + 8 < max_frame) 209 | # if np.mean(track[range_mask][6]) < l: 210 | # return True 211 | # else: 212 | # return False 213 | 214 | def ip_linear(det1, det2, fid): 215 | fid1 = det1[0] 216 | fid2 = det2[0] 217 | w1 = 1.0 * (fid2 - fid) / (fid2 - fid1) 218 | w2 = 1.0 * (fid - fid1) / (fid2 - fid1) 219 | 220 | ip = np.copy(det1) 221 | ip[0] = fid 222 | ip[2:6] = w1 * det1[2:6] + w2 * det2[2:6] 223 | return np.array([ip]) 224 | 225 | def track_complete(track, gap_threshold=8): 226 | max_fid = int(np.max(track[:, 0])) 227 | min_fid = int(np.min(track[:, 0])) 228 | 229 | ips = [] 230 | ip_cnt = 0 231 | max_missing_len = 0 232 | for i, fid in enumerate(list(track[:-1, 0])): 233 | if track[i+1, 0] - 1 != track[i, 0]: 234 | if track[i+1, 0] - track[i, 0] - 1 > gap_threshold: 235 | continue 236 | cur_fid = track[i, 0] + 1 237 | missing_len = 0 238 | while cur_fid < track[i+1, 0]: 239 | ips.append(ip_linear(track[i+1], track[i], cur_fid)) 240 | cur_fid = cur_fid + 1 241 | missing_len = missing_len + 1 242 | ip_cnt = ip_cnt + missing_len 243 | max_missing_len = max(max_missing_len, missing_len) 244 | 245 | assert len(ips) == ip_cnt, (track, ips) 246 | ips.append(track) 247 | new_track = np.concatenate(ips, axis=0) 248 | new_track = new_track[new_track[:, 0].argsort()] 249 | if ip_cnt == 0: 250 | return track, 0 251 | else: 252 | return new_track, ip_cnt 253 | 254 | param_pairs = [ 255 | (['-05-'], [0, 4, 8]), 256 | (['-10-'], [0, 6, 8]), 257 | (['-11-'], [0, 6, 8]), 258 | (['-13-'], [0, 9, 8]), 259 | (['-02-'], [0, 6, 8]), 260 | (['-09-'], [0, 4, 8]), 261 | (['-04-'], [0, 12, 8]), 262 | (['-06-'], [0, 4, 8]), 263 | (['-07-'], [0, 6, 8]), 264 | (['-12-'], [0, 6, 8]), 265 | (['-14-'], [0, 9, 8]), 266 | (['-01-'], [0, 6, 30]), 
267 | (['-08-'], [0, 4, 30]), 268 | (['-03-'], [0, 12, 30]) 269 | ] 270 | params = {} 271 | for file_nums, param in param_pairs: 272 | params.update({x: param for x in file_nums}) 273 | file_num = None 274 | for k in params.keys(): 275 | if k in save_path and file_num is None: 276 | file_num = k 277 | elif k in save_path: 278 | assert False 279 | # assert file_num is not None 280 | res = pd.read_csv(save_path, header=None) 281 | 282 | if file_num is not None: 283 | min_num = params[file_num][0] 284 | min_bbox = params[file_num][1] 285 | res = bboxfilt(res, min_bbox) 286 | filtered_tracks = [x[0] for x in res.groupby(1) if trackfilt(x[1], min_num)] 287 | inds = [res.iloc[x, 1] not in filtered_tracks for x in range(len(res))] 288 | res = res[inds] 289 | inds = np.unique(res[1]) 290 | dict_map = {x: i + 1 for i, x in enumerate(inds)} 291 | res[1] = res[1].map(lambda x: dict_map[x]) 292 | # res.to_csv(save_path, header=None, index=False) 293 | 294 | # track complete part 295 | tracks = res.groupby(1) 296 | new_tracks = [] 297 | for tid in tracks.groups.keys(): 298 | res, _ = track_complete(tracks.get_group(tid).values, params[file_num][2]) 299 | if res is not None: 300 | new_tracks.append(res) 301 | 302 | new_tracks = np.concatenate(new_tracks) 303 | new_tracks = new_tracks[new_tracks[:, 0].argsort()] 304 | np.savetxt(save_path, new_tracks, fmt='%i,%i,%f,%f,%f,%f,%i,%f,%i,%i', delimiter=',') 305 | 306 | 307 | def final_processing(tracks, save_path, mid_only): 308 | res = [] 309 | assert len(tracks) != 0, 'No Tracks: ' + str(save_path) 310 | for track in tracks: 311 | if mid_only: 312 | frames = track.mid_frames 313 | else: 314 | frames = track.frames 315 | cur_res = np.zeros((len(track.mid_frames), 10)) 316 | for i, (frame, bbox) in enumerate(track.mid_frames.items()): 317 | cur_res[i, 0] = frame + 1 318 | cur_res[i, 2:6] = bbox[0] / bbox[1] 319 | cur_res[i, 6] = track.frames[frame][1] # num of average bbox, use all frames 320 | cur_res[i, 7] = track.frames[frame][2] / track.frames[frame][1] # average score, use all frames 321 | cur_res[:, 1] = track.id 322 | res.append(cur_res) 323 | res = np.concatenate(res) 324 | res = res[res[:, 0].argsort()] 325 | res[:, -2:] = -1 326 | res[:, 4:6] -= res[:, 2:4] 327 | if save_path is not None: 328 | try: 329 | if save_path[0] == '/': 330 | os.makedirs(os.path.join('/', *(save_path.split('/')[:-1]))) 331 | else: 332 | os.makedirs(os.path.join(*(save_path.split('/')[:-1]))) 333 | except: 334 | pass 335 | np.savetxt(save_path, res, fmt='%i,%i,%f,%f,%f,%f,%i,%f,%i,%i', delimiter=',') 336 | filt_bbox(save_path) 337 | # ? 
return res or track 338 | 339 | 340 | def archive_tracks(tracks, arch_tracks, cur_frame, forward_frames): 341 | track_ = [] 342 | for track in tracks: 343 | max_frame = max(track.frames.keys()) 344 | if max_frame + 2 * forward_frames < cur_frame: 345 | arch_tracks.append(track) 346 | else: 347 | track_.append(track) 348 | 349 | return track_ 350 | 351 | 352 | def adjust_poi_tubes(tubes, poi_tubes): 353 | def adjust_single_frame(tubes, poi_tubes): 354 | tubes_end_mask = tubes[:, 5] > tubes[:, 0] 355 | # Trans from end to mid 356 | trans_x_end = (tubes[tubes_end_mask][:, 6] + tubes[tubes_end_mask][:, 8]) / 2 - \ 357 | (tubes[tubes_end_mask][:, 1] + tubes[tubes_end_mask][:, 3]) / 2 358 | trans_y_end = (tubes[tubes_end_mask][:, 7] + tubes[tubes_end_mask][:, 9]) / 2 - \ 359 | (tubes[tubes_end_mask][:, 2] + tubes[tubes_end_mask][:, 4]) / 2 360 | # Trans Per Frame 361 | trans_x_end = trans_x_end / (tubes[tubes_end_mask][:, 5] - tubes[tubes_end_mask][:, 0]) 362 | trans_y_end = trans_y_end / (tubes[tubes_end_mask][:, 5] - tubes[tubes_end_mask][:, 0]) 363 | # Trans Per Height 364 | mean_trans_x_end = np.mean(trans_x_end / (tubes[tubes_end_mask][:, 7] - tubes[tubes_end_mask][:, 9])) 365 | mean_trans_y_end = np.mean(trans_y_end / (tubes[tubes_end_mask][:, 7] - tubes[tubes_end_mask][:, 9])) 366 | poi_tubes[:, [6, 8]] += (mean_trans_x_end * (poi_tubes[:, 5] - poi_tubes[:, 0]) 367 | * (poi_tubes[:, 7] - poi_tubes[:, 9]))[:, None] 368 | poi_tubes[:, [7, 9]] += (mean_trans_y_end * (poi_tubes[:, 5] - poi_tubes[:, 0]) 369 | * (poi_tubes[:, 7] - poi_tubes[:, 9]))[:, None] 370 | 371 | tubes_start_mask = tubes[:, 10] < tubes[:, 0] 372 | trans_x_start = (tubes[tubes_start_mask][:, 11] + tubes[tubes_start_mask][:, 13]) / 2 - \ 373 | (tubes[tubes_start_mask][:, 1] + tubes[tubes_start_mask][:, 3]) / 2 374 | trans_y_start = (tubes[tubes_start_mask][:, 12] + tubes[tubes_start_mask][:, 14]) / 2 - \ 375 | (tubes[tubes_start_mask][:, 2] + tubes[tubes_start_mask][:, 4]) / 2 376 | # Trans Per Frame 377 | trans_x_start = trans_x_start / (tubes[tubes_start_mask][:, 10] - tubes[tubes_start_mask][:, 0]) 378 | trans_y_start = trans_y_start / (tubes[tubes_start_mask][:, 10] - tubes[tubes_start_mask][:, 0]) 379 | # Trans Per Height 380 | mean_trans_x_start = np.mean(trans_x_start / (tubes[tubes_start_mask][:, 12] - tubes[tubes_start_mask][:, 14])) 381 | mean_trans_y_start = np.mean(trans_y_start / (tubes[tubes_start_mask][:, 12] - tubes[tubes_start_mask][:, 14])) 382 | poi_tubes[:, [11, 13]] += (mean_trans_x_start * (poi_tubes[:, 10] - poi_tubes[:, 0]) 383 | * (poi_tubes[:, 12] - poi_tubes[:, 14]))[:, None] 384 | poi_tubes[:, [12, 14]] += (mean_trans_y_start * (poi_tubes[:, 10] - poi_tubes[:, 0]) 385 | * (poi_tubes[:, 12] - poi_tubes[:, 14]))[:, None] 386 | 387 | return poi_tubes 388 | 389 | frame_idxs = np.unique(tubes[:, 0]) 390 | for frame_idx in frame_idxs: 391 | poi_tubes[poi_tubes[:, 0] == frame_idx] = adjust_single_frame( 392 | tubes[tubes[:, 0] == frame_idx], poi_tubes[poi_tubes[:, 0] == frame_idx]) 393 | 394 | return poi_tubes 395 | 396 | 397 | def matching(tubes, arg, save_path=None, verbose=False, mid_only=True, poi_tubes=None): 398 | """ 399 | tubes: All tubes in a video to match. (n, 15 + 1) [mid_frame, mid_box, front_frame, front_box, back_frame, back_box, value] 400 | save_path: File path to save formatted result. 
401 | """ 402 | tracks = [] 403 | if not isinstance(tubes, np.ndarray): 404 | tubes = tubes.cpu().data.numpy() 405 | 406 | if poi_tubes is not None: 407 | poi_tubes = adjust_poi_tubes(tubes, poi_tubes) 408 | tubes = np.concatenate((tubes, poi_tubes)) 409 | 410 | tubes = tubes[(-tubes[:, 15]).argsort()] 411 | tubes = tubes[tubes[:, 0].argsort(kind='stable')] 412 | arch_tracks = [] 413 | prev_frame = -1 414 | tubes_one_frame = 0 415 | 416 | for tube in tubes: 417 | update_tracks_fast(tracks, tube, arg) 418 | 419 | current_frame = tube[0] 420 | if prev_frame != current_frame and prev_frame != -1: # Switch Frame 421 | if verbose: 422 | print('{}\tFrame: {}\tTubes: {}\tCur tracks:{}\tArch tracks:{}'.format( 423 | datetime.now().time(), prev_frame, tubes_one_frame, len(tracks), len(arch_tracks))) 424 | tubes_one_frame = 0 425 | # Archive tracks 2*forward_frames frames away, they won't be useful anymore 426 | if int(current_frame) % 10 == 0: 427 | tracks = archive_tracks(tracks, arch_tracks, current_frame, arg.forward_frames * arg.frame_stride) 428 | 429 | prev_frame = current_frame 430 | tubes_one_frame += 1 431 | 432 | arch_tracks.extend(tracks) 433 | tracks = arch_tracks 434 | final_processing(tracks, save_path, mid_only) 435 | return tracks 436 | -------------------------------------------------------------------------------- /post_processing/tube_iou_matching_old.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import cv2 3 | import os 4 | import datetime 5 | import numpy as np 6 | import pandas as pd 7 | from tqdm import tqdm 8 | from network.utils import bbox_iou 9 | from datetime import datetime 10 | import multiprocessing 11 | 12 | 13 | class Node: 14 | 15 | def __init__(self, box): 16 | self.box = box 17 | 18 | 19 | class Track: 20 | ''' 21 | Track is the class of track. it contains all the node and manages the node. it contains the following information: 22 | 1) all the nodes 23 | 2) track id. it is unique it identify each track 24 | 3) track pool id. it is a number to give a new id to a new track 25 | 4) age. age indicates how old is the track 26 | 5) max_age. 
indicates the dead age of this track 27 | ''' 28 | _id_pool = 1 29 | ''' for mot 30 | ''' 31 | _max_num_node = 36 32 | '''for kitti 33 | _max_num_node = 5 34 | ''' 35 | 36 | def __init__(self): 37 | self.nodes = list() 38 | self.frames = {} 39 | self.mid_frames = {} 40 | self.id = Track._id_pool 41 | Track._id_pool += 1 42 | self.color = tuple((np.random.rand(3) * 255).astype(int).tolist()) 43 | 44 | def __del__(self): 45 | for n in self.nodes: 46 | del n 47 | 48 | def update_frames(self, node): 49 | tube = node.box 50 | 51 | mid_frame = tube[0].astype(np.int) 52 | mid_box = tube[1:5] 53 | end_frame = tube[5].astype(np.int) 54 | end_box = tube[6:10] 55 | start_frame = tube[10].astype(np.int) 56 | start_box = tube[11:15] 57 | score = tube[15] 58 | 59 | for frame in range(start_frame, mid_frame): 60 | frame_box = start_box * (mid_frame - frame) / (mid_frame - start_frame) + mid_box * ( 61 | frame - start_frame) / (mid_frame - start_frame) 62 | if frame not in self.frames: 63 | self.frames[frame] = [frame_box, 1, score] 64 | else: 65 | self.frames[frame][0] += frame_box.astype(np.float) 66 | self.frames[frame][1] += 1 67 | self.frames[frame][2] += score 68 | 69 | for frame in range(mid_frame + 1, end_frame + 1): 70 | frame_box = mid_box * (end_frame - frame) / (end_frame - mid_frame) + end_box * (frame - mid_frame) / ( 71 | end_frame - mid_frame) 72 | if frame not in self.frames: 73 | self.frames[frame] = [frame_box, 1, score] 74 | else: 75 | self.frames[frame][0] += frame_box.astype(np.float) 76 | self.frames[frame][1] += 1 77 | self.frames[frame][2] += score 78 | 79 | # Add middle frame 80 | if mid_frame not in self.frames: 81 | self.frames[mid_frame] = [mid_box.astype(np.float), 1, score] 82 | else: 83 | self.frames[mid_frame][0] += mid_box.astype(np.float) 84 | self.frames[mid_frame][1] += 1 85 | self.frames[mid_frame][2] += score 86 | 87 | if mid_frame not in self.mid_frames: 88 | self.mid_frames[mid_frame] = [mid_box.astype(np.float), 1, score] 89 | else: 90 | self.mid_frames[mid_frame][0] += mid_box.astype(np.float) 91 | self.mid_frames[mid_frame][1] += 1 92 | self.mid_frames[mid_frame][2] += score 93 | 94 | def add_node(self, node): 95 | # self.nodes.append(node) 96 | self.update_frames(node) 97 | # self._volatile_memory() 98 | 99 | def _volatile_memory(self): 100 | if len(self.nodes) > self._max_num_node: 101 | for i in range(int(self._max_num_node / 2)): 102 | del self.nodes[i] 103 | 104 | 105 | class Tracks: 106 | ''' 107 | Track set. It contains all the tracks and manage the tracks. it has the following information 108 | 1) tracks. 
the set of tracks 109 | 2) keep the previous image and features 110 | ''' 111 | 112 | def __init__(self): 113 | self.tracks = list() # the set of tracks 114 | self.max_drawing_track = 10 115 | 116 | def __getitem__(self, item): 117 | return self.tracks[item] 118 | 119 | def append(self, track): 120 | self.tracks.append(track) 121 | 122 | def get_track_by_id(self, id): 123 | for t in self.tracks: 124 | if t.id == id: 125 | return t 126 | return None 127 | 128 | def one_frame_pass(self): 129 | keep_track_set = list() 130 | for i, t in enumerate(self.tracks): 131 | t.add_age() 132 | if t.age < t._max_age: 133 | keep_track_set.append(i) 134 | 135 | self.tracks = [self.tracks[i] for i in keep_track_set] 136 | 137 | def show(self, image): 138 | h, w, _ = image.shape 139 | 140 | # draw rectangle 141 | for t in self.tracks: 142 | if len(t.nodes) > 0 and t.age < 2: 143 | b = t.nodes[-1].box 144 | image = cv2.putText(image, str(t.id), (int(b[0] * w), int((b[1]) * h)), cv2.FONT_HERSHEY_SIMPLEX, 1, 145 | t.color, 3) 146 | image = cv2.rectangle(image, (int(b[0] * w), int((b[1]) * h)), 147 | (int((b[0] + b[2]) * w), int((b[1] + b[3]) * h)), t.color, 2) 148 | 149 | # draw line 150 | for t in self.tracks: 151 | if t.age > 1: 152 | continue 153 | if len(t.nodes) > self.max_drawing_track: 154 | start = len(t.nodes) - self.max_drawing_track 155 | else: 156 | start = 0 157 | for n1, n2 in zip(t.nodes[start:], t.nodes[start + 1:]): 158 | c1 = (int((n1.box[0] + n1.box[2] / 2.0) * w), int((n1.box[1] + n1.box[3]) * h)) 159 | c2 = (int((n2.box[0] + n2.box[2] / 2.0) * w), int((n2.box[1] + n2.box[3]) * h)) 160 | image = cv2.line(image, c1, c2, t.color, 2) 161 | 162 | return image 163 | 164 | 165 | def update_tracks(tracks, tube, arg): 166 | mid_frame = tube[0].astype(np.int) 167 | mid_box = tube[1:5] 168 | end_frame = tube[5].astype(np.int) 169 | end_box = tube[6:10] 170 | start_frame = tube[10].astype(np.int) 171 | start_box = tube[11:15] 172 | score = tube[15] 173 | 174 | def get_center(box): 175 | 176 | return np.array([(box[0] + box[2]) / 2, (box[1] + box[3]) / 2]) 177 | 178 | tube_direction = get_center(end_box) - get_center(start_box) 179 | 180 | assert start_frame <= mid_frame and mid_frame <= end_frame 181 | 182 | # Pre-compute all inter frame_boxs in this tube 183 | back_frames = list(range(start_frame, mid_frame)) 184 | front_frames = list(range(mid_frame + 1, end_frame + 1)) 185 | all_tube_frames = back_frames + front_frames + [mid_frame] 186 | # ! 
CAUTION: all_tube_frames is not sorted, mid_frame is the last one 187 | 188 | back_start_coef = (mid_frame - back_frames) / (mid_frame - start_frame) 189 | back_mid_coef = (back_frames - start_frame) / (mid_frame - start_frame) 190 | front_mid_coef = (end_frame - front_frames) / (end_frame - mid_frame) 191 | front_end_coef = (front_frames - mid_frame) / (end_frame - mid_frame) 192 | frame_boxs = np.concatenate((np.outer(back_start_coef, start_box), np.outer(front_end_coef, end_box))) + \ 193 | np.outer(np.concatenate((back_mid_coef, front_mid_coef)), mid_box) 194 | frame_boxs = np.concatenate((frame_boxs, mid_box[None])) 195 | 196 | tube_frame_num = len(frame_boxs) 197 | 198 | # Above code computes bboxes in tube of corresponding frames 199 | # Equal to: 200 | # back_frame_boxs = np.outer((mid_frame - back_frames) / (mid_frame - start_frame), start_box) + \ 201 | # np.outer((back_frames - start_frame) / (mid_frame - start_frame), mid_box) 202 | # front_frame_boxs = np.outer((end_frame - front_frames) / (end_frame - mid_frame), mid_box) + \ 203 | # np.outer((front_frames - mid_frame) / (end_frame - mid_frame), end_box) 204 | # frame_boxs = np.concatenate((back_frame_boxs, front_frame_boxs)) 205 | 206 | # Preallocate array of bboxes in track 207 | track_boxs = np.zeros_like(frame_boxs) 208 | 209 | max_idx, max_iou = -1, -1 210 | 211 | for idx, track in enumerate(tracks): 212 | iou = [0, 0] 213 | 214 | has_frame = [(frame in track.frames) for frame in all_tube_frames] 215 | if np.sum(has_frame) == 0: # tube and track does not overlap 216 | continue 217 | 218 | # get the same length of area in the track that near to the tube 219 | overlap_frames = np.array(all_tube_frames)[np.where(has_frame)[0]] 220 | overlap_area = [min(overlap_frames), max(overlap_frames)] 221 | while overlap_area[1] - overlap_area[0] + 1 < tube_frame_num: 222 | if overlap_area[0] - 1 in track.frames: 223 | overlap_area[0] = overlap_area[0] - 1 224 | elif overlap_area[1] + 1 in track.frames: 225 | overlap_area[1] = overlap_area[1] + 1 226 | else: 227 | break 228 | # calculate the cos value 229 | track_direction = get_center(track.frames[overlap_area[1]][0] / track.frames[overlap_area[1]][1]) - \ 230 | get_center(track.frames[overlap_area[0]][0] / track.frames[overlap_area[0]][1]) 231 | 232 | if np.linalg.norm(tube_direction) < arg.noise_dis: 233 | tube_direction = np.array([0, 0]) 234 | if np.linalg.norm(track_direction) < arg.noise_dis: 235 | track_direction = np.array([0, 0]) 236 | if np.linalg.norm(track_direction) * np.linalg.norm(tube_direction) > 0: 237 | angle_cos = np.dot(track_direction, tube_direction) / ( 238 | np.linalg.norm(track_direction) * np.linalg.norm(tube_direction)) 239 | else: 240 | angle_cos = 1 241 | 242 | # calculate the IoU 243 | for i, frame in enumerate(all_tube_frames): 244 | if has_frame[i]: 245 | track_boxs[i] = track.frames[frame][0] / track.frames[frame][1] 246 | 247 | iou[0] = np.sum(bbox_iou(frame_boxs, track_boxs)[has_frame]) 248 | iou[1] = np.sum(has_frame) 249 | 250 | if iou[0] / iou[1] > arg.linking_min_iou + 0.2: 251 | angle_cos = 1 252 | 253 | # whether linking 254 | if iou[1] > 0 and iou[0] / iou[1] > max_iou and angle_cos > arg.cos_value: 255 | max_idx = idx 256 | max_iou = iou[0] / iou[1] 257 | 258 | if max_iou > arg.linking_min_iou: 259 | tracks[max_idx].update_frames(Node(tube)) 260 | else: 261 | new_tracks(tracks, [tube]) 262 | 263 | 264 | def new_tracks(tracks, tubes): 265 | for tube in tubes: 266 | track = Track() 267 | track.add_node(Node(tube)) 268 | tracks.append(track) 
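# -----------------------------------------------------------------------------
# Editor's illustrative sketch -- not part of this module. It spells out the
# 16-element tube record that update_tracks() above (and matching() below)
# unpack, and the linear interpolation used to recover a box for every frame
# the tube spans. All numbers are invented for the example.
import numpy as np

tube = np.array([
    4,                    # [0]     mid frame index
    100, 50, 140, 150,    # [1:5]   mid box  (x1, y1, x2, y2)
    8,                    # [5]     end (front) frame index
    120, 50, 160, 150,    # [6:10]  end box
    0,                    # [10]    start (back) frame index
    80, 50, 120, 150,     # [11:15] start box
    0.9,                  # [15]    confidence score
], dtype=np.float64)

mid_f, end_f, start_f = int(tube[0]), int(tube[5]), int(tube[10])
mid_box, end_box, start_box = tube[1:5], tube[6:10], tube[11:15]

# Box at frame 2 (between start and mid), exactly as update_tracks() computes it:
frame = 2
box = start_box * (mid_f - frame) / (mid_f - start_f) + \
      mid_box * (frame - start_f) / (mid_f - start_f)
# -> array([ 90.,  50., 130., 150.]), i.e. halfway between start_box and mid_box.
# -----------------------------------------------------------------------------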
269 | 270 | 271 | def final_processing(tracks, save_path, mid_only): 272 | res = [] 273 | assert len(tracks) != 0, 'No Tracks: ' + str(save_path) 274 | for track in tracks: 275 | if mid_only: 276 | frames = track.mid_frames 277 | else: 278 | frames = track.frames 279 | cur_res = np.zeros((len(track.mid_frames), 10)) 280 | for i, (frame, bbox) in enumerate(track.mid_frames.items()): 281 | cur_res[i, 0] = frame + 1 282 | cur_res[i, 2:6] = bbox[0] / bbox[1] 283 | cur_res[i, 6] = track.frames[frame][1] # num of average bbox, use all frames 284 | cur_res[i, 7] = track.frames[frame][2] / track.frames[frame][1] # average score, use all frames 285 | cur_res[:, 1] = track.id 286 | res.append(cur_res) 287 | res = np.concatenate(res) 288 | res = res[res[:, 0].argsort()] 289 | res[:, -2:] = -1 290 | res[:, 4:6] -= res[:, 2:4] 291 | if save_path is not None: 292 | try: 293 | if save_path[0] == '/': 294 | os.makedirs(os.path.join('/', *(save_path.split('/')[:-1]))) 295 | else: 296 | os.makedirs(os.path.join(*(save_path.split('/')[:-1]))) 297 | except: 298 | pass 299 | np.savetxt(save_path, res, fmt='%i,%i,%f,%f,%f,%f,%i,%f,%i,%i', delimiter=',') 300 | # ? return res or track 301 | 302 | 303 | def archive_tracks(tracks, arch_tracks, cur_frame, forward_frames): 304 | track_ = [] 305 | for track in tracks: 306 | max_frame = max(track.frames.keys()) 307 | if (max_frame + 2 * forward_frames < cur_frame): 308 | arch_tracks.append(track) 309 | else: 310 | track_.append(track) 311 | 312 | return track_ 313 | 314 | 315 | def matching(tubes, arg, save_path=None, verbose=False, mid_only=True): 316 | """ 317 | tubes: All tubes in a video to match. (n, 15 + 1) [mid_frame, mid_box, front_frame, front_box, back_frame, back_box, value] 318 | save_path: File path to save formatted result. 319 | """ 320 | tracks = [] 321 | if not isinstance(tubes, np.ndarray): 322 | tubes = tubes.cpu().data.numpy() 323 | # tubes = pd.DataFrame(tubes) 324 | # tubes = tubes.astype({0: int, 5: int, 10: int}) 325 | 326 | # tubes_group = tubes.groupby(0) # group by back_frame, i.e. 
start_frame 327 | 328 | # arch_tracks = [] 329 | # for frame in sorted(tubes_group.indices.keys()): 330 | # tubes_one_frame = tubes_group.get_group(frame).values 331 | 332 | # for tube in tubes_one_frame: 333 | # update_tracks(tracks, tube, arg) 334 | 335 | # if verbose: 336 | # print('{}\tFrame: {}\tTubes: {}\tCur tracks:{}\tArch tracks:{}'.format(\ 337 | # datetime.now().time(), frame, len(tubes_one_frame), len(tracks), len(arch_tracks))) 338 | 339 | # # Archive tracks 2*forward_frames frames away, they won't be useful anymore 340 | # # if frame % 10 == 0: 341 | # tracks = archive_tracks(tracks, arch_tracks, frame, arg.forward_frames * arg.frame_stride) 342 | 343 | tubes = tubes[tubes[:, 0].argsort()] 344 | arch_tracks = [] 345 | prev_frame = -1 346 | tubes_one_frame = 0 347 | 348 | for tube in tubes: 349 | update_tracks(tracks, tube, arg) 350 | 351 | current_frame = tube[0] 352 | if prev_frame != current_frame and prev_frame != -1: # Switch Frame 353 | if verbose: 354 | print('{}\tFrame: {}\tTubes: {}\tCur tracks:{}\tArch tracks:{}'.format( \ 355 | datetime.now().time(), prev_frame, tubes_one_frame, len(tracks), len(arch_tracks))) 356 | tubes_one_frame = 0 357 | # Archive tracks 2*forward_frames frames away, they won't be useful anymore 358 | if int(current_frame) % 10 == 0: 359 | tracks = archive_tracks(tracks, arch_tracks, current_frame, arg.forward_frames * arg.frame_stride) 360 | 361 | prev_frame = current_frame 362 | tubes_one_frame += 1 363 | 364 | arch_tracks.extend(tracks) 365 | tracks = arch_tracks 366 | final_processing(tracks, save_path, mid_only) 367 | return tracks -------------------------------------------------------------------------------- /post_processing/tube_iou_matching_super_old.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import cv2 3 | import os 4 | import datetime 5 | import numpy as np 6 | import pandas as pd 7 | from tqdm import tqdm 8 | from network.utils import bbox_iou 9 | from datetime import datetime 10 | 11 | 12 | class Node: 13 | 14 | def __init__(self, box): 15 | self.box = box 16 | 17 | 18 | class Track: 19 | ''' 20 | Track is the class of track. it contains all the node and manages the node. it contains the following information: 21 | 1) all the nodes 22 | 2) track id. it is unique it identify each track 23 | 3) track pool id. it is a number to give a new id to a new track 24 | 4) age. age indicates how old is the track 25 | 5) max_age. 
indicates the dead age of this track 26 | ''' 27 | _id_pool = 1 28 | ''' for mot 29 | ''' 30 | _max_num_node = 36 31 | '''for kitti 32 | _max_num_node = 5 33 | ''' 34 | def __init__(self): 35 | self.nodes = list() 36 | self.frames = {} 37 | self.id = Track._id_pool 38 | Track._id_pool += 1 39 | self.color = tuple((np.random.rand(3) * 255).astype(int).tolist()) 40 | 41 | def __del__(self): 42 | for n in self.nodes: 43 | del n 44 | 45 | def update_frames(self, node): 46 | tube = node.box 47 | 48 | mid_frame = tube[0].astype(np.int) 49 | mid_box = tube[1:5] 50 | end_frame = tube[5].astype(np.int) 51 | end_box = tube[6:10] 52 | start_frame = tube[10].astype(np.int) 53 | start_box = tube[11:15] 54 | score = tube[15] 55 | 56 | for frame in range(start_frame, mid_frame): 57 | frame_box = start_box * (mid_frame - frame) / (mid_frame - start_frame) + mid_box * (frame - start_frame) / (mid_frame - start_frame) 58 | if frame not in self.frames: 59 | self.frames[frame] = [frame_box, 1, score] 60 | else: 61 | self.frames[frame][0] += frame_box.astype(np.float) 62 | self.frames[frame][1] += 1 63 | self.frames[frame][2] += score 64 | 65 | for frame in range(mid_frame + 1, end_frame + 1): 66 | frame_box = mid_box * (end_frame - frame) / (end_frame - mid_frame) + end_box * (frame - mid_frame) / (end_frame - mid_frame) 67 | if frame not in self.frames: 68 | self.frames[frame] = [frame_box, 1, score] 69 | else: 70 | self.frames[frame][0] += frame_box.astype(np.float) 71 | self.frames[frame][1] += 1 72 | self.frames[frame][2] += score 73 | 74 | # Add middle frame 75 | if mid_frame not in self.frames: 76 | self.frames[mid_frame] = [mid_box.astype(np.float), 1, score] 77 | else: 78 | self.frames[mid_frame][0] += mid_box.astype(np.float) 79 | self.frames[mid_frame][1] += 1 80 | self.frames[mid_frame][2] += score 81 | 82 | def add_node(self, node): 83 | # self.nodes.append(node) 84 | self.update_frames(node) 85 | # self._volatile_memory() 86 | 87 | def _volatile_memory(self): 88 | if len(self.nodes) > self._max_num_node: 89 | for i in range(int(self._max_num_node/2)): 90 | del self.nodes[i] 91 | 92 | 93 | class Tracks: 94 | ''' 95 | Track set. It contains all the tracks and manage the tracks. it has the following information 96 | 1) tracks. 
the set of tracks 97 | 2) keep the previous image and features 98 | ''' 99 | def __init__(self): 100 | self.tracks = list() # the set of tracks 101 | self.max_drawing_track = 10 102 | 103 | def __getitem__(self, item): 104 | return self.tracks[item] 105 | 106 | def append(self, track): 107 | self.tracks.append(track) 108 | 109 | def get_track_by_id(self, id): 110 | for t in self.tracks: 111 | if t.id == id: 112 | return t 113 | return None 114 | 115 | def one_frame_pass(self): 116 | keep_track_set = list() 117 | for i, t in enumerate(self.tracks): 118 | t.add_age() 119 | if t.age < t._max_age: 120 | keep_track_set.append(i) 121 | 122 | self.tracks = [self.tracks[i] for i in keep_track_set] 123 | 124 | def show(self, image): 125 | h, w, _ = image.shape 126 | 127 | # draw rectangle 128 | for t in self.tracks: 129 | if len(t.nodes) > 0 and t.age<2: 130 | b = t.nodes[-1].box 131 | image = cv2.putText(image, str(t.id), (int(b[0]*w),int((b[1])*h)), cv2.FONT_HERSHEY_SIMPLEX, 1, t.color, 3) 132 | image = cv2.rectangle(image, (int(b[0]*w),int((b[1])*h)), (int((b[0]+b[2])*w), int((b[1]+b[3])*h)), t.color, 2) 133 | 134 | # draw line 135 | for t in self.tracks: 136 | if t.age > 1: 137 | continue 138 | if len(t.nodes) > self.max_drawing_track: 139 | start = len(t.nodes) - self.max_drawing_track 140 | else: 141 | start = 0 142 | for n1, n2 in zip(t.nodes[start:], t.nodes[start+1:]): 143 | c1 = (int((n1.box[0] + n1.box[2]/2.0)*w), int((n1.box[1] + n1.box[3])*h)) 144 | c2 = (int((n2.box[0] + n2.box[2] / 2.0) * w), int((n2.box[1] + n2.box[3]) * h)) 145 | image = cv2.line(image, c1, c2, t.color, 2) 146 | 147 | return image 148 | 149 | 150 | def update_tracks(tracks, tube, arg): 151 | mid_frame = tube[0].astype(np.int) 152 | mid_box = tube[1:5] 153 | end_frame = tube[5].astype(np.int) 154 | end_box = tube[6:10] 155 | start_frame = tube[10].astype(np.int) 156 | start_box = tube[11:15] 157 | score = tube[15] 158 | 159 | def get_center(box): 160 | 161 | return np.array([(box[0] + box[2])/2, (box[1] + box[3])/2]) 162 | 163 | tube_direction = get_center(end_box) - get_center(start_box) 164 | 165 | assert start_frame <= mid_frame and mid_frame <= end_frame 166 | 167 | # Pre-compute all inter frame_boxs in this tube 168 | back_frames = list(range(start_frame, mid_frame)) 169 | front_frames = list(range(mid_frame + 1, end_frame + 1)) 170 | all_tube_frames = back_frames + front_frames + [mid_frame] 171 | # ! 
CAUTION: all_tube_frames is not sorted, mid_frame is the last one 172 | 173 | back_start_coef = (mid_frame - back_frames) / (mid_frame - start_frame) 174 | back_mid_coef = (back_frames - start_frame) / (mid_frame - start_frame) 175 | front_mid_coef = (end_frame - front_frames) / (end_frame - mid_frame) 176 | front_end_coef = (front_frames - mid_frame) / (end_frame - mid_frame) 177 | frame_boxs = np.concatenate((np.outer(back_start_coef, start_box), np.outer(front_end_coef, end_box))) + \ 178 | np.outer(np.concatenate((back_mid_coef, front_mid_coef)), mid_box) 179 | frame_boxs = np.concatenate((frame_boxs, mid_box[None])) 180 | 181 | tube_frame_num = len(frame_boxs) 182 | 183 | # Above code computes bboxes in tube of corresponding frames 184 | # Equal to: 185 | # back_frame_boxs = np.outer((mid_frame - back_frames) / (mid_frame - start_frame), start_box) + \ 186 | # np.outer((back_frames - start_frame) / (mid_frame - start_frame), mid_box) 187 | # front_frame_boxs = np.outer((end_frame - front_frames) / (end_frame - mid_frame), mid_box) + \ 188 | # np.outer((front_frames - mid_frame) / (end_frame - mid_frame), end_box) 189 | # frame_boxs = np.concatenate((back_frame_boxs, front_frame_boxs)) 190 | 191 | # Preallocate array of bboxes in track 192 | track_boxs = np.zeros_like(frame_boxs) 193 | 194 | max_idx, max_iou = -1, -1 195 | for idx, track in enumerate(tracks): 196 | iou = [0, 0] 197 | 198 | has_frame = [(frame in track.frames) for frame in all_tube_frames] 199 | if sum(has_frame) == 0: # tube and track does not overlap 200 | continue 201 | 202 | # get the same length of area in the track that near to the tube 203 | overlap_frames = np.array(all_tube_frames)[np.where(has_frame)[0]] 204 | overlap_area = [min(overlap_frames), max(overlap_frames)] 205 | while overlap_area[1] - overlap_area[0] + 1 < tube_frame_num: 206 | if overlap_area[0] - 1 in track.frames: 207 | overlap_area[0] = overlap_area[0] - 1 208 | elif overlap_area[1] + 1 in track.frames: 209 | overlap_area[1] = overlap_area[1] + 1 210 | else: 211 | break 212 | # calculate the cos value 213 | track_direction = get_center(track.frames[overlap_area[1]][0] / track.frames[overlap_area[1]][1]) - \ 214 | get_center(track.frames[overlap_area[0]][0] / track.frames[overlap_area[0]][1]) 215 | 216 | if np.linalg.norm(tube_direction) < arg.noise_dis: 217 | tube_direction = np.array([0, 0]) 218 | if np.linalg.norm(track_direction) < arg.noise_dis: 219 | track_direction = np.array([0, 0]) 220 | if np.linalg.norm(track_direction) * np.linalg.norm(tube_direction) > 0: 221 | angle_cos = np.dot(track_direction, tube_direction) / (np.linalg.norm(track_direction) * np.linalg.norm(tube_direction)) 222 | else: 223 | angle_cos = 1 224 | 225 | # calculate the IoU 226 | for i, frame in enumerate(all_tube_frames): 227 | if has_frame[i]: 228 | track_boxs[i] = track.frames[frame][0] / track.frames[frame][1] 229 | 230 | iou[0] = sum(bbox_iou(frame_boxs, track_boxs)[has_frame]) 231 | iou[1] = sum(has_frame) 232 | 233 | if iou[0] / iou[1] > arg.linking_min_iou + 0.2: 234 | angle_cos = 1 235 | 236 | # whether linking 237 | if iou[1] > 0 and iou[0] / iou[1] > max_iou and angle_cos > arg.cos_value: 238 | max_idx = idx 239 | max_iou = iou[0] / iou[1] 240 | 241 | if max_iou > arg.linking_min_iou: 242 | tracks[max_idx].update_frames(Node(tube)) 243 | else: 244 | new_tracks(tracks, [tube]) 245 | 246 | 247 | def new_tracks(tracks, tubes): 248 | for tube in tubes: 249 | track = Track() 250 | track.add_node(Node(tube)) 251 | tracks.append(track) 252 | 253 | 254 | def 
final_processing(tracks, save_path): 255 | res = [] 256 | assert len(tracks) != 0, 'No Tracks: ' + str(save_path) 257 | for track in tracks: 258 | cur_res = np.zeros((len(track.frames), 10)) 259 | for i, (frame, bbox) in enumerate(track.frames.items()): 260 | cur_res[i, 0] = frame + 1 261 | cur_res[i, 2:6] = bbox[0] / bbox[1] 262 | cur_res[i, 6] = bbox[1] # num of average bbox 263 | cur_res[i, 7] = bbox[2] / bbox[1] # average score 264 | cur_res[:, 1] = track.id 265 | res.append(cur_res) 266 | res = np.concatenate(res) 267 | res = res[res[:, 0].argsort()] 268 | res[:, -2:] = -1 269 | res[:, 4:6] -= res[:, 2:4] 270 | if save_path is not None: 271 | try: 272 | if save_path[0] == '/': 273 | os.makedirs(os.path.join('/', *(save_path.split('/')[:-1]))) 274 | else: 275 | os.makedirs(os.path.join(*(save_path.split('/')[:-1]))) 276 | except: 277 | pass 278 | np.savetxt(save_path, res, fmt='%i,%i,%f,%f,%f,%f,%i,%f,%i,%i', delimiter=',') 279 | # ? return res or track 280 | 281 | 282 | def archive_tracks(tracks, arch_tracks, cur_frame, forward_frames): 283 | track_ = [] 284 | for track in tracks: 285 | max_frame = max(track.frames.keys()) 286 | if (max_frame + 2 * forward_frames < cur_frame): 287 | arch_tracks.append(track) 288 | else: 289 | track_.append(track) 290 | 291 | return track_ 292 | 293 | 294 | def matching(tubes, arg, save_path=None, verbose=False): 295 | """ 296 | tubes: All tubes in a video to match. (n, 15 + 1) [mid_frame, mid_box, front_frame, front_box, back_frame, back_box, value] 297 | save_path: File path to save formatted result. 298 | """ 299 | tracks = [] 300 | if not isinstance(tubes, np.ndarray): 301 | tubes = tubes.cpu().data.numpy() 302 | tubes = pd.DataFrame(tubes) 303 | tubes = tubes.astype({0: int, 5: int, 10: int}) 304 | tubes_group = tubes.groupby(0) # group by back_frame, i.e. start_frame 305 | 306 | arch_tracks = [] 307 | for frame in sorted(tubes_group.indices.keys()): 308 | tubes_one_frame = tubes_group.get_group(frame).values 309 | 310 | for tube in tubes_one_frame: 311 | update_tracks(tracks, tube, arg) 312 | 313 | if verbose: 314 | print('{}\tFrame: {}\tTubes: {}\tCur tracks:{}\tArch tracks:{}'.format(\ 315 | datetime.now().time(), frame, len(tubes_one_frame), len(tracks), len(arch_tracks))) 316 | 317 | # Archive tracks 2*forward_frames frames away, they won't be useful anymore 318 | tracks = archive_tracks(tracks, arch_tracks, frame, arg.forward_frames * arg.frame_stride) 319 | 320 | arch_tracks.extend(tracks) 321 | tracks = arch_tracks 322 | final_processing(tracks, save_path) 323 | return tracks 324 | -------------------------------------------------------------------------------- /post_processing/tube_nms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from .nms import nms_cuda, nms_cpu 4 | 5 | 6 | def multiclass_nms(multi_tubes, # n, 15 7 | multi_scores, # n, 1 + n_cls 8 | score_thr, 9 | iou_thre, 10 | max_num=-1, 11 | score_factors=None, # n 12 | frame_num=16): 13 | """NMS for multi-class tubes. 14 | 15 | Args: 16 | multi_tubes (Tensor): shape (n, #class*4) or (n, 4) 17 | multi_scores (Tensor): shape (n, 1+#class) 18 | score_thr (float): bbox threshold, bboxes with scores lower than it 19 | will not be considered. 20 | iou_thre (float): NMS IoU threshold 21 | max_num (int): if there are more than max_num bboxes after NMS, 22 | only top max_num will be kept. 
23 | score_factors (Tensor): The factors multiplied to scores before 24 | applying NMS 25 | frame_num (int): number of frames in input 26 | 27 | Returns: 28 | tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels 29 | are 0-based. 30 | """ 31 | num_classes = multi_scores.shape[1] 32 | tubes, labels = [], [] 33 | nms_op = nms 34 | for i in range(1, num_classes): 35 | cls_inds = multi_scores[:, i] > score_thr 36 | # print('before: ' + str(len(cls_inds))) 37 | if not cls_inds.any(): 38 | continue 39 | 40 | # get bboxes and scores of this class 41 | _tubes = multi_tubes[cls_inds, :] 42 | _scores = multi_scores[cls_inds, i] 43 | if score_factors is not None: 44 | _scores *= score_factors[cls_inds] 45 | pass 46 | 47 | # do nms in each frame 48 | for n_f in range(frame_num): 49 | frame_inds = torch.round(_tubes[:, 0]) == n_f 50 | if torch.sum(frame_inds) == 0: 51 | continue 52 | _tubes_single_frame = _tubes[frame_inds] 53 | # mid_frame = _bboxes_single_frame[:, 1:5] 54 | # cls_dets = torch.cat([mid_frame, _scores[frame_inds, None]], dim=1) # n, 4 + 1 55 | cls_dets = torch.cat([_tubes_single_frame, _scores[frame_inds, None]], dim=1) # n, 15 + 1 56 | _, inds = nms_op(cls_dets, iou_thre) 57 | # cls_dets = _bboxes_single_frame[inds] 58 | cls_dets = cls_dets[inds] 59 | cls_labels = multi_tubes.new_full( 60 | (cls_dets.shape[0], ), i - 1, dtype=torch.long) 61 | tubes.append(cls_dets) 62 | labels.append(cls_labels) 63 | if tubes: 64 | tubes = torch.cat(tubes) 65 | labels = torch.cat(labels) 66 | # print('middle: ' + str(len(bboxes))) 67 | 68 | # ===================================== 69 | # bboxes = bboxes[bboxes[:, -1] > score_thr] 70 | # ===================================== 71 | 72 | if tubes.shape[0] > max_num: 73 | _, inds = tubes[:, -1].sort(descending=True) 74 | inds = inds[:max_num] 75 | tubes = tubes[inds] 76 | labels = labels[inds] 77 | else: 78 | tubes = multi_tubes.new_zeros((0, multi_tubes.shape[1] + 1)) 79 | labels = multi_tubes.new_zeros((0,), dtype=torch.long) 80 | # print('after: ' + str(len(bboxes))) 81 | return tubes, labels 82 | 83 | 84 | def nms(dets, iou_thr, device_id=None): 85 | """Dispatch to either CPU or GPU NMS implementations. 86 | 87 | The input can be either a torch tensor or numpy array. GPU NMS will be used 88 | if the input is a gpu tensor or device_id is specified, otherwise CPU NMS 89 | will be used. The returned type will always be the same as inputs. 90 | 91 | Arguments: 92 | dets (torch.Tensor or np.ndarray): bboxes with scores. 93 | iou_thr (float): IoU threshold for NMS. 94 | device_id (int, optional): when `dets` is a numpy array, if `device_id` 95 | is None, then cpu nms is used, otherwise gpu_nms will be used. 96 | 97 | Returns: 98 | tuple: kept bboxes and indice, which is always the same data type as 99 | the input. 
100 | """ 101 | # convert dets (tensor or numpy array) to tensor 102 | if isinstance(dets, torch.Tensor): 103 | is_numpy = False 104 | dets_th = dets 105 | elif isinstance(dets, np.ndarray): 106 | is_numpy = True 107 | device = 'cpu' if device_id is None else 'cuda:{}'.format(device_id) 108 | dets_th = torch.from_numpy(dets).to(device) 109 | else: 110 | raise TypeError( 111 | 'dets must be either a Tensor or numpy array, but got {}'.format( 112 | type(dets))) 113 | 114 | # execute cpu or cuda nms 115 | if dets_th.shape[0] == 0: 116 | inds = dets_th.new_zeros(0, dtype=torch.long) 117 | else: 118 | if dets_th.is_cuda: 119 | inds = nms_cuda.nms(dets_th, iou_thr, iou_thr) 120 | else: 121 | inds = nms_cpu.nms(dets_th, iou_thr, iou_thr) 122 | 123 | if is_numpy: 124 | inds = inds.cpu().numpy() 125 | return dets[inds, :], inds 126 | -------------------------------------------------------------------------------- /pre_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoPang1996/TubeTK/bcca334c5348f9ae33e04595e1af93cf8351e50e/pre_processing/__init__.py -------------------------------------------------------------------------------- /pre_processing/get_tubes_MOT17.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import pandas as pd 4 | from network.utils import bbox_iou 5 | import pickle 6 | from tqdm import tqdm 7 | import shutil 8 | import multiprocessing 9 | from configs.default import __C, cfg_from_file 10 | from dataset.Parsers.structures import * 11 | import argparse 12 | 13 | 14 | class GTSingleParser: 15 | def __init__(self, folder, 16 | min_visibility, 17 | forward_frames, 18 | frame_stride, 19 | tube_thre): 20 | # 1. get the gt path and image folder 21 | gt_file_path = os.path.join(folder, 'gt/gt.txt') 22 | self.folder = folder 23 | self.forward_frames = forward_frames 24 | self.tube_thre = tube_thre 25 | self.min_visibility = min_visibility 26 | self.frame_stride = frame_stride 27 | 28 | # 2. read the gt data 29 | gt_file = pd.read_csv(gt_file_path, header=None) 30 | gt_file = gt_file[gt_file[6] == 1] # human class 31 | gt_file = gt_file[gt_file[8] > min_visibility] 32 | gt_group = gt_file.groupby(0) 33 | gt_group_keys = gt_group.indices.keys() 34 | self.max_frame_index = max(gt_group_keys) 35 | # 3. 
update tracks 36 | self.tracks = Tracks() 37 | self.recorder = {} 38 | for key in gt_group_keys: 39 | det = gt_group.get_group(key).values 40 | ids = np.array(det[:, 1]).astype(int) 41 | det = np.array(det[:, 2:6]) 42 | det[:, 2:4] += det[:, :2] 43 | 44 | self.recorder[key - 1] = list() 45 | # 3.1 update tracks 46 | for id, d in zip(ids, det): 47 | node = Node(d, key - 1) 48 | track_index, node_index = self.tracks.add_node(node, id) 49 | self.recorder[key - 1].append((track_index, node_index)) 50 | 51 | def bbox2tube(self, track, mid_id, direction, pos_in_video, thre): 52 | def get_true_z(mid_node, end_node): 53 | return end_node.frame_id - mid_node.frame_id 54 | 55 | def get_inter_box(start_box, end_box, inter_id, end_id): 56 | return start_box * (end_id - inter_id) / end_id + end_box * inter_id / end_id 57 | 58 | mid_node = track.get_node_by_index(mid_id) 59 | mid_box = mid_node.box 60 | inter_boxes = [] 61 | 62 | z = 1 if direction == 'front' else -1 63 | if mid_id + z >= len(track.nodes) or mid_id + z < 0: 64 | return np.array([0, 0, 0, 0, 0]) 65 | 66 | true_z = get_true_z(mid_node, track.get_node_by_index(mid_id + z)) 67 | 68 | max_len = (self.forward_frames * 2 - 1) * self.frame_stride + 1 69 | 70 | while -1 * pos_in_video <= true_z < max_len - pos_in_video: 71 | iou_total = 0 72 | end_node = track.get_node_by_index(mid_id + z) 73 | end_box = end_node.box 74 | for i, gt_box in enumerate(inter_boxes): 75 | iou = sum(bbox_iou(gt_box[None], get_inter_box(mid_box, end_box, i + 1, len(inter_boxes) + 1)[None])) 76 | iou_total += iou 77 | iou_total += 1 78 | iou_total /= (len(inter_boxes) + 1) 79 | 80 | if iou_total < thre: 81 | break 82 | 83 | inter_boxes.append(end_box) 84 | if z % self.frame_stride == 0: 85 | res_z = true_z 86 | 87 | z += 1 if direction == 'front' else -1 88 | if mid_id + z >= len(track.nodes) or mid_id + z < 0: 89 | break 90 | true_z = get_true_z(mid_node, track.get_node_by_index(mid_id + z)) 91 | 92 | if not inter_boxes or len(inter_boxes) < self.frame_stride: 93 | return np.array([0, 0, 0, 0, 0]) 94 | else: 95 | ret_ind = (len(inter_boxes) // self.frame_stride) * self.frame_stride - 1 96 | return np.concatenate((np.array([abs(res_z)]), inter_boxes[ret_ind] - mid_box)) 97 | 98 | def get_item(self, frame_index): 99 | start_frame = frame_index 100 | max_len = (self.forward_frames * 2 - 1) * self.frame_stride + 1 101 | if self.max_frame_index - start_frame < max_len: 102 | return 0 103 | # if not frame_index in self.recorder: 104 | # return 0 105 | 106 | tubes = [] 107 | for i in range(self.forward_frames * 2): 108 | frame_index = start_frame + i * self.frame_stride 109 | if frame_index not in self.recorder: 110 | continue 111 | 112 | det_ids = self.recorder[frame_index] 113 | 114 | # 1. 
get tubes 115 | for track_index, node_index in det_ids: 116 | t = self.tracks.get_track_by_index(track_index) 117 | n = t.get_node_by_index(node_index) 118 | mid_box = np.concatenate((n.box, np.array([frame_index - start_frame]))) 119 | # backward 120 | back_box = self.bbox2tube(track=t, mid_id=node_index, direction='back', 121 | pos_in_video=i * self.frame_stride, thre=self.tube_thre) 122 | # forward 123 | front_box = self.bbox2tube(track=t, mid_id=node_index, direction='front', 124 | pos_in_video=i * self.frame_stride, thre=self.tube_thre) 125 | tube = np.concatenate((mid_box, front_box, back_box)) 126 | tubes.append(tube) 127 | 128 | if len(tubes) == 0: 129 | return 0 130 | tubes = np.array(tubes) 131 | try: 132 | os.makedirs(os.path.join(self.folder, 'tubes_' + str(self.forward_frames) + '_' + str(self.frame_stride) + '_' + str(self.min_visibility))) 133 | except: 134 | pass 135 | pickle.dump(tubes, open(os.path.join(self.folder, 'tubes_' + str(self.forward_frames) + '_' + str(self.frame_stride) + '_' + str(self.min_visibility), str(start_frame)), 'wb')) 136 | return 0 137 | 138 | def clear(self): 139 | try: 140 | shutil.rmtree(os.path.join(self.folder, 'tubes_' + str(self.forward_frames) + '_' + str(self.frame_stride) + '_' + str(self.min_visibility))) 141 | except: 142 | pass 143 | 144 | def __len__(self): 145 | return self.max_frame_index 146 | 147 | 148 | class GTParser: 149 | def __init__(self, mot_root, 150 | arg, 151 | type='train', 152 | ): 153 | # analsis all the folder in mot_root 154 | # 1. get all the folders 155 | mot_root = os.path.join(mot_root, type) 156 | all_folders = sorted( 157 | [os.path.join(mot_root, i) for i in os.listdir(mot_root) 158 | if os.path.isdir(os.path.join(mot_root, i)) 159 | and i.find('FRCNN') != -1] 160 | ) 161 | # 2. create single parser 162 | self.parsers = [GTSingleParser(folder, forward_frames=arg.forward_frames, 163 | min_visibility=arg.min_visibility, 164 | frame_stride=arg.frame_stride, 165 | tube_thre=arg.tube_thre) for folder in all_folders] 166 | 167 | # 3. 
get some basic information 168 | self.lens = [len(p) for p in self.parsers] 169 | self.len = sum(self.lens) 170 | 171 | def __len__(self): 172 | # get the length of all the matching frame 173 | return self.len 174 | 175 | def clear(self): 176 | print('Clearing') 177 | for parser in tqdm(self.parsers, ncols=20): 178 | parser.clear() 179 | 180 | def run(self): 181 | print('Running') 182 | pool = multiprocessing.Pool(processes=40) 183 | pool_list = [] 184 | for item in tqdm(range(self.len), ncols=20): 185 | total_len = 0 186 | index = 0 187 | current_item = item 188 | for l in self.lens: 189 | total_len += l 190 | if item < total_len: 191 | break 192 | else: 193 | index += 1 194 | current_item -= l 195 | 196 | if index >= len(self.parsers): 197 | return 198 | pool_list.append(pool.apply_async(self.parsers[index].get_item, (current_item,))) 199 | # self.parsers[index].get_item(current_item) 200 | for p in tqdm(pool_list, ncols=20): 201 | p.get() 202 | pool.close() 203 | pool.join() 204 | 205 | 206 | if __name__ == '__main__': 207 | arg_parser = argparse.ArgumentParser() 208 | arg_parser.add_argument('--mot_root', default='./data', type=str, help="mot data root") 209 | arg, unparsed = arg_parser.parse_known_args() 210 | config = __C 211 | cfg_from_file('../configs/get_MOT17_tube.yaml') 212 | parser = GTParser(mot_root=arg.mot_root, arg=config) 213 | parser.clear() 214 | parser.run() 215 | -------------------------------------------------------------------------------- /pre_processing/get_tubes_jta.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import pandas as pd 4 | from network.utils import bbox_iou 5 | import pickle 6 | from tqdm import tqdm 7 | import argparse 8 | import multiprocessing 9 | from configs.default import __C, cfg_from_file 10 | from dataset.Parsers.structures import * 11 | 12 | 13 | class GTSingleParser: 14 | def __init__(self, folder, 15 | min_visibility, 16 | forward_frames, 17 | frame_stride, 18 | tube_thre, 19 | loose, 20 | height_clamp): 21 | # 1. get the gt path and image folder 22 | split_path = folder.split('/') 23 | if folder[0] == '/': 24 | jta_root = '/' + os.path.join(*split_path[:-3]) 25 | else: 26 | jta_root = os.path.join(*split_path[:-3]) 27 | type = split_path[-2] 28 | video_name = split_path[-1] 29 | gt_file_path = os.path.join(jta_root, 'gt_' + str(loose) + '_' + str(min_visibility) + '_' + str(height_clamp), type, video_name, 'gt.txt') 30 | # gt_file_path = os.path.join(folder, 'gt/gt.txt') 31 | 32 | self.folder = folder 33 | self.forward_frames = forward_frames 34 | self.tube_thre = tube_thre 35 | self.min_visibility = min_visibility 36 | self.frame_stride = frame_stride 37 | 38 | self.tube_res_path = os.path.join(jta_root, 39 | 'tubes_' + str(self.forward_frames) + '_' + str( 40 | self.frame_stride) + '_' + str(self.min_visibility), 41 | type, 42 | video_name) 43 | 44 | try: 45 | os.makedirs(self.tube_res_path) 46 | except: 47 | pass 48 | 49 | # 2. read the gt data 50 | gt_file = pd.read_csv(gt_file_path, header=None) 51 | gt_file = gt_file[gt_file[6] == 1] # human class 52 | gt_file = gt_file[gt_file[8] > min_visibility] 53 | gt_group = gt_file.groupby(0) 54 | gt_group_keys = gt_group.indices.keys() 55 | self.max_frame_index = max(gt_group_keys) 56 | # 3. 
update tracks 57 | self.tracks = Tracks() 58 | self.recorder = {} 59 | for key in gt_group_keys: 60 | det = gt_group.get_group(key).values 61 | ids = np.array(det[:, 1]).astype(int) 62 | det = np.array(det[:, 2:6]) 63 | det[:, 2:4] += det[:, :2] 64 | 65 | self.recorder[key - 1] = list() 66 | # 3.1 update tracks 67 | for id, d in zip(ids, det): 68 | node = Node(d, key - 1) 69 | track_index, node_index = self.tracks.add_node(node, id) 70 | self.recorder[key - 1].append((track_index, node_index)) 71 | 72 | def bbox2tube(self, track, mid_id, direction, pos_in_video, thre): 73 | def get_true_z(mid_node, end_node): 74 | return end_node.frame_id - mid_node.frame_id 75 | 76 | def get_inter_box(start_box, end_box, inter_id, end_id): 77 | return start_box * (end_id - inter_id) / end_id + end_box * inter_id / end_id 78 | 79 | mid_node = track.get_node_by_index(mid_id) 80 | mid_box = mid_node.box 81 | inter_boxes = [] 82 | 83 | z = 1 if direction == 'front' else -1 84 | if mid_id + z >= len(track.nodes) or mid_id + z < 0: 85 | return np.array([0, 0, 0, 0, 0]) 86 | 87 | true_z = get_true_z(mid_node, track.get_node_by_index(mid_id + z)) 88 | 89 | max_len = (self.forward_frames * 2 - 1) * self.frame_stride + 1 90 | 91 | while -1 * pos_in_video <= true_z < max_len - pos_in_video: 92 | iou_total = 0 93 | end_node = track.get_node_by_index(mid_id + z) 94 | end_box = end_node.box 95 | for i, gt_box in enumerate(inter_boxes): 96 | iou = sum(bbox_iou(gt_box[None], get_inter_box(mid_box, end_box, i + 1, len(inter_boxes) + 1)[None])) 97 | iou_total += iou 98 | iou_total += 1 99 | iou_total /= (len(inter_boxes) + 1) 100 | 101 | if iou_total < thre: 102 | break 103 | 104 | inter_boxes.append(end_box) 105 | if z % self.frame_stride == 0: 106 | res_z = true_z 107 | 108 | z += 1 if direction == 'front' else -1 109 | if mid_id + z >= len(track.nodes) or mid_id + z < 0: 110 | break 111 | true_z = get_true_z(mid_node, track.get_node_by_index(mid_id + z)) 112 | 113 | if not inter_boxes or len(inter_boxes) < self.frame_stride: 114 | return np.array([0, 0, 0, 0, 0]) 115 | else: 116 | ret_ind = (len(inter_boxes) // self.frame_stride) * self.frame_stride - 1 117 | return np.concatenate((np.array([abs(res_z)]), inter_boxes[ret_ind] - mid_box)) 118 | 119 | def get_item(self, frame_index): 120 | start_frame = frame_index 121 | max_len = (self.forward_frames * 2 - 1) * self.frame_stride + 1 122 | if self.max_frame_index - start_frame < max_len: 123 | return 0 124 | 125 | tubes = [] 126 | for i in range(self.forward_frames * 2): 127 | frame_index = start_frame + i * self.frame_stride 128 | if frame_index not in self.recorder: 129 | continue 130 | 131 | det_ids = self.recorder[frame_index] 132 | 133 | # 1. 
get tubes 134 | for track_index, node_index in det_ids: 135 | t = self.tracks.get_track_by_index(track_index) 136 | n = t.get_node_by_index(node_index) 137 | mid_box = np.concatenate((n.box, np.array([frame_index - start_frame]))) 138 | # backward 139 | back_box = self.bbox2tube(track=t, mid_id=node_index, direction='back', 140 | pos_in_video=i * self.frame_stride, thre=self.tube_thre) 141 | # forward 142 | front_box = self.bbox2tube(track=t, mid_id=node_index, direction='front', 143 | pos_in_video=i * self.frame_stride, thre=self.tube_thre) 144 | 145 | # remove the fast turning 146 | mid_box_w = mid_box[2] - mid_box[0] 147 | if abs(front_box[0] - back_box[0]) > 2.5 * mid_box_w: 148 | print('remove turning') 149 | continue 150 | 151 | tube = np.concatenate((mid_box, front_box, back_box)) 152 | tubes.append(tube) 153 | 154 | if len(tubes) == 0: 155 | return 0 156 | tubes = np.array(tubes) 157 | 158 | pickle.dump(tubes, open(os.path.join(self.tube_res_path, str(start_frame)), 'wb')) 159 | return 0 160 | 161 | def __len__(self): 162 | return self.max_frame_index 163 | 164 | 165 | class GTParser: 166 | def __init__(self, jta_root, 167 | arg, 168 | loose, 169 | height_clamp, 170 | type='train', 171 | ): 172 | # analsis all the folder in jta_root 173 | # 1. get all the folders 174 | self.jta_root = jta_root 175 | jta_root = os.path.join(jta_root, type) 176 | all_folders = sorted( 177 | [os.path.join(jta_root, i) for i in os.listdir(jta_root) 178 | if os.path.isdir(os.path.join(jta_root, i))] 179 | ) 180 | # 2. create single parser 181 | print('Init SingleParser') 182 | self.parsers = [GTSingleParser(folder, forward_frames=arg.forward_frames, 183 | min_visibility=arg.min_visibility, 184 | frame_stride=arg.frame_stride, 185 | tube_thre=arg.tube_thre, 186 | loose=loose, 187 | height_clamp=height_clamp) for folder in tqdm(all_folders, ncols=20)] 188 | 189 | # 3. 
get some basic information 190 | self.lens = [len(p) for p in self.parsers] 191 | self.len = sum(self.lens) 192 | 193 | def __len__(self): 194 | # get the length of all the matching frame 195 | return self.len 196 | 197 | def clear(self): 198 | print('Clearing') 199 | 200 | def run(self): 201 | print('Running') 202 | pool = multiprocessing.Pool(processes=40) 203 | pool_list = [] 204 | for item in tqdm(range(self.len), ncols=20): 205 | total_len = 0 206 | index = 0 207 | current_item = item 208 | for l in self.lens: 209 | total_len += l 210 | if item < total_len: 211 | break 212 | else: 213 | index += 1 214 | current_item -= l 215 | 216 | if index >= len(self.parsers): 217 | return 218 | pool_list.append(pool.apply_async(self.parsers[index].get_item, (current_item,))) 219 | # self.parsers[index].get_item(current_item) 220 | for p in tqdm(pool_list, ncols=20): 221 | p.get() 222 | pool.close() 223 | pool.join() 224 | 225 | 226 | def get_gt(json_path, frames_path, loose, min_visiblity, height_clamp): 227 | assert os.path.exists(json_path), 'File does not exist: {}'.format(json_path) 228 | assert os.path.exists(frames_path), 'Folder does not exist: {}'.format(frames_path) 229 | split_path = frames_path.split('/') 230 | if frames_path[0] == '/': 231 | jta_root = '/' + os.path.join(*split_path[:-3]) 232 | else: 233 | jta_root = os.path.join(*split_path[:-3]) 234 | type = split_path[-2] 235 | video_name = split_path[-1] 236 | gt_path = os.path.join(jta_root, 'gt_' + str(loose) + '_' + str(min_visiblity) + '_' + str(height_clamp), type, video_name) 237 | try: 238 | os.makedirs(gt_path) 239 | except: 240 | pass 241 | gt_file = os.path.join(gt_path, 'gt.txt') 242 | df = pd.read_json(json_path) 243 | df = df.iloc[:, [0, 1, 3, 4, 8]] # Frame, ID, x, y, occluded 244 | df_group = df.groupby([0, 1]) # Group by frame and id 245 | 246 | def get_bbox(g): 247 | assert len(g.columns) == 5 248 | if g.iloc[:, 4].sum() >= (1 - min_visiblity) * len(g): # Completely occluded 249 | return pd.Series([-1, 0, 0, 0, 0, 0, 0], dtype=np.int) 250 | x1 = np.maximum(0, g.iloc[:, 2].min()) 251 | y1 = np.maximum(0, g.iloc[:, 3].min()) 252 | x2 = np.minimum(1920, g.iloc[:, 2].max()) 253 | y2 = np.minimum(1080, g.iloc[:, 3].max()) 254 | w = x2 - x1 255 | h = y2 - y1 256 | # Loose a little bit 257 | x1 -= np.round(w * loose) 258 | y1 -= np.round(h * loose) 259 | x1 = np.maximum(0.0, x1) 260 | y1 = np.maximum(0.0, y1) 261 | w = np.round(w * (1 + loose*2)) 262 | h = np.round(h * (1 + loose*2)) 263 | w = np.minimum(1920 - x1, w) 264 | h = np.minimum(1080 - y1, h) 265 | 266 | return pd.Series([x1, y1, w, h, 1, 1, 1], dtype=np.int) 267 | 268 | res_df = df_group.apply(get_bbox) 269 | res_df = res_df[res_df.iloc[:, 0] != -1] 270 | 271 | # get mode and remove the small box 272 | ns, edges = np.histogram(res_df.iloc[:, 3], bins=50) 273 | max_n = np.argmax(ns) 274 | mode = np.mean(edges[[max_n, max_n + 1]]) 275 | res_df = res_df[res_df.iloc[:, 3] > height_clamp * mode] 276 | res_df = res_df[res_df.iloc[:, 3] > 7] 277 | 278 | res_df.to_csv(gt_file, header=False) 279 | 280 | 281 | def get_gts(jta_root, frames_dir, loose, min_vis, height_clamp): 282 | pool = multiprocessing.Pool(processes=20) 283 | pool_list = [] 284 | anno_path = os.path.join(jta_root, 'annotations') 285 | for type in os.listdir(anno_path): 286 | for json_file in os.listdir(os.path.join(anno_path, type)): 287 | json_path = os.path.join(anno_path, type, json_file) 288 | frames_path = os.path.join(jta_root, frames_dir, type, os.path.splitext(json_file)[0]) 289 | 
pool_list.append(pool.apply_async(get_gt, (json_path, frames_path, loose, min_vis, height_clamp, ))) 290 | 291 | for p in tqdm(pool_list, ncols=20): 292 | p.get() 293 | pool.close() 294 | pool.join() 295 | 296 | 297 | if __name__ == '__main__': 298 | parser = argparse.ArgumentParser() 299 | parser.add_argument('--jta_root', type=str, help="data path of jta") 300 | parser.add_argument('--loose', type=float, default=0.1, help="ratio to loose the bbox generated from keypoint") 301 | parser.add_argument('--height_clamp', type=float, default=0.6, help="get rid of the bboxes whose height is smaller " 302 | "than 0.6 of the mean height") 303 | arg_input, unparsed = parser.parse_known_args() 304 | 305 | arg = __C 306 | cfg_from_file('../configs/get_jta_tube.yaml') 307 | 308 | print('Generating GT files') 309 | get_gts(jta_root=arg_input.jta_root, frames_dir='imgs', loose=arg_input.loose, min_vis=arg.min_visibility, 310 | height_clamp=arg_input.height_clamp) 311 | print('Generating Tubes') 312 | parser = GTParser(jta_root=os.path.join(arg_input.jta_root, 'imgs'), arg=arg, type='train', loose=arg_input.loose, 313 | height_clamp=arg_input.height_clamp) 314 | parser.clear() 315 | parser.run() 316 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.1.0 2 | torchvision 3 | tqdm 4 | tensorboardX 5 | PyYAML==5.1.2 6 | opencv-python 7 | Pillow 8 | easydict -------------------------------------------------------------------------------- /seqmaps/AVG-TownCentre.txt: -------------------------------------------------------------------------------- 1 | AVG-TownCentre -------------------------------------------------------------------------------- /seqmaps/JTA_train_turning: -------------------------------------------------------------------------------- 1 | seq_9 2 | seq_15 3 | seq_5 -------------------------------------------------------------------------------- /seqmaps/MOT15_test.txt: -------------------------------------------------------------------------------- 1 | ADL-Rundle-6 2 | ADL-Rundle-8 3 | ETH-Bahnhof 4 | ETH-Pedcross2 5 | ETH-Sunnyday 6 | KITTI-13 7 | KITTI-17 8 | PETS09-S2L1 9 | TUD-Campus 10 | TUD-Stadtmitte 11 | Venice-2 -------------------------------------------------------------------------------- /seqmaps/MOT17-01-FRCNN.txt: -------------------------------------------------------------------------------- 1 | MOT17-01-FRCNN -------------------------------------------------------------------------------- /seqmaps/MOT17-02-FRCNN.txt: -------------------------------------------------------------------------------- 1 | MOT17-02-FRCNN -------------------------------------------------------------------------------- /seqmaps/MOT17-04-FRCNN.txt: -------------------------------------------------------------------------------- 1 | MOT17-04-FRCNN -------------------------------------------------------------------------------- /seqmaps/MOT17-13-FRCNN.txt: -------------------------------------------------------------------------------- 1 | MOT17-13-FRCNN -------------------------------------------------------------------------------- /seqmaps/MOT17-14-FRCNN.txt: -------------------------------------------------------------------------------- 1 | MOT17-14-FRCNN -------------------------------------------------------------------------------- /seqmaps/MOT17_test.txt: -------------------------------------------------------------------------------- 1 | 
MOT17-01-FRCNN 2 | MOT17-03-FRCNN 3 | MOT17-06-FRCNN 4 | MOT17-07-FRCNN 5 | MOT17-08-FRCNN 6 | MOT17-12-FRCNN 7 | MOT17-14-FRCNN -------------------------------------------------------------------------------- /seqmaps/MOT17_train.txt: -------------------------------------------------------------------------------- 1 | MOT17-02-FRCNN 2 | MOT17-04-FRCNN 3 | MOT17-05-FRCNN 4 | MOT17-09-FRCNN 5 | MOT17-10-FRCNN 6 | MOT17-11-FRCNN 7 | MOT17-13-FRCNN -------------------------------------------------------------------------------- /seqmaps/PETS09-S2L2.txt: -------------------------------------------------------------------------------- 1 | PETS09-S2L2 -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoPang1996/TubeTK/bcca334c5348f9ae33e04595e1af93cf8351e50e/utils/__init__.py -------------------------------------------------------------------------------- /utils/mem_track.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import datetime 3 | import pynvml 4 | 5 | import torch 6 | import numpy as np 7 | 8 | 9 | class MemTracker(object): 10 | """ 11 | Class used to track pytorch memory usage 12 | Arguments: 13 | frame: a frame to detect current py-file runtime 14 | detail(bool, default True): whether the function shows the detail gpu memory usage 15 | path(str): where to save log file 16 | verbose(bool, default False): whether show the trivial exception 17 | device(int): GPU number, default is 0 18 | """ 19 | def __init__(self, frame, detail=True, path='', verbose=False, device=0): 20 | self.frame = frame 21 | self.print_detail = detail 22 | self.last_tensor_sizes = set() 23 | self.gpu_profile_fn = path + f'{datetime.datetime.now():%d-%b-%y-%H:%M:%S}-gpu_mem_track.txt' 24 | self.verbose = verbose 25 | self.begin = True 26 | self.device = device 27 | 28 | self.func_name = frame.f_code.co_name 29 | self.filename = frame.f_globals["__file__"] 30 | if (self.filename.endswith(".pyc") or 31 | self.filename.endswith(".pyo")): 32 | self.filename = self.filename[:-1] 33 | self.module_name = self.frame.f_globals["__name__"] 34 | self.curr_line = self.frame.f_lineno 35 | 36 | def get_tensors(self): 37 | for obj in gc.get_objects(): 38 | try: 39 | if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): 40 | tensor = obj 41 | else: 42 | continue 43 | if tensor.is_cuda: 44 | yield tensor 45 | except Exception as e: 46 | if self.verbose: 47 | print('A trivial exception occured: {}'.format(e)) 48 | 49 | def track(self): 50 | """ 51 | Track the GPU memory usage 52 | """ 53 | pynvml.nvmlInit() 54 | handle = pynvml.nvmlDeviceGetHandleByIndex(self.device) 55 | meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) 56 | self.curr_line = self.frame.f_lineno 57 | where_str = self.module_name + ' ' + self.func_name + ':' + ' line ' + str(self.curr_line) 58 | 59 | with open(self.gpu_profile_fn, 'a+') as f: 60 | 61 | if self.begin: 62 | f.write(f"GPU Memory Track | {datetime.datetime.now():%d-%b-%y-%H:%M:%S} |" 63 | f" Total Used Memory:{meminfo.used/1000**2:<7.1f}Mb\n\n") 64 | self.begin = False 65 | 66 | if self.print_detail is True: 67 | ts_list = [tensor.size() for tensor in self.get_tensors()] 68 | new_tensor_sizes = {(type(x), tuple(x.size()), ts_list.count(x.size()), np.prod(np.array(x.size()))*4/1000**2) 69 | for x in self.get_tensors()} 70 | for t, s, n, m in new_tensor_sizes - 
self.last_tensor_sizes: 71 | f.write(f'+ | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20}\n') 72 | for t, s, n, m in self.last_tensor_sizes - new_tensor_sizes: 73 | f.write(f'- | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20} \n') 74 | self.last_tensor_sizes = new_tensor_sizes 75 | 76 | f.write(f"\nAt {where_str:<50}" 77 | f"Total Used Memory:{meminfo.used/1000**2:<7.1f}Mb\n\n") 78 | 79 | pynvml.nvmlShutdown() -------------------------------------------------------------------------------- /utils/util.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Computes and stores the average and current value""" 3 | 4 | def __init__(self): 5 | self.val = 0 6 | self.avg = 0 7 | self.sum = 0 8 | self.count = 0 9 | self.reset() 10 | 11 | def reset(self): 12 | self.val = 0 13 | self.avg = 0 14 | self.sum = 0 15 | self.count = 0 16 | 17 | def update(self, val, n=1): 18 | self.val = val 19 | self.sum += val * n 20 | self.count += n 21 | self.avg = self.sum / self.count 22 | 23 | --------------------------------------------------------------------------------
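
A minimal usage sketch for the `AverageMeter` defined in `utils/util.py` above. This is not part of the repository; the loop values are dummy numbers, and it assumes the repository root is on `PYTHONPATH` so that `utils.util` is importable:

```python
# Hypothetical snippet (not in the repo): running average of a per-batch loss with AverageMeter.
from utils.util import AverageMeter

loss_meter = AverageMeter()
for batch_loss, batch_size in [(0.9, 8), (0.7, 8), (0.5, 4)]:  # dummy values
    # update() stores the latest value and accumulates sum += val * n, count += n, avg = sum / count
    loss_meter.update(batch_loss, n=batch_size)
    print(f'loss {loss_meter.val:.3f} (avg {loss_meter.avg:.3f})')

loss_meter.reset()  # clear the statistics, e.g. at the start of a new epoch
```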