├── .gitignore ├── LICENSE ├── README.md ├── benchmark.py ├── configs ├── detracker_reidV3.yaml ├── mot_detectron2 │ ├── d2_2080ti_p3aformer_train.sh │ ├── d2_2080ti_p3aformer_val.sh │ ├── d2_debug_p3aformer_train.sh │ ├── d2_v100_p3aformer_train.sh │ ├── d2_v100_p3aformer_val.sh │ ├── p3aformer_base.yaml │ ├── p3aformer_big.yaml │ ├── p3aformer_config_init.py │ └── p3aformer_small.yaml ├── reid.yaml └── standard │ ├── 2080ti_debug_train_coco.sh │ ├── 2080ti_debug_train_crowdhuman.sh │ ├── 2080ti_debug_train_mot17.sh │ ├── interpolation.sh │ ├── v100_mot17_coco.sh │ ├── v100_mot17_crowdhuman.sh │ ├── v100_mot17_fine_tune_mot17.sh │ ├── v100_submit_mot17.sh │ ├── v100_test_mot15.sh │ └── v100_test_mot17.sh ├── d2_main.py ├── datasets ├── __init__.py ├── byte_mot_half.py ├── coco.py ├── coco_eval.py ├── coco_panoptic.py ├── d2_p3aformer_dataset │ ├── d2_generic_dataset_val.py │ ├── d2_mot15_val_dataset.py │ ├── d2_mot17_mixed_dataset.py │ └── d2_mot17_val_dataset.py ├── data_path │ ├── bdd100k.train │ ├── bdd100k.val │ ├── crowdhuman.train │ ├── crowdhuman.val │ ├── detmot16.train │ ├── detmot17.train │ ├── gen_bdd100k_mot.py │ ├── gen_labels_15.py │ ├── gen_labels_16.py │ ├── joint.train │ ├── mot16.train │ ├── mot17.train │ └── prepare.py ├── data_prefetcher.py ├── detmot.py ├── joint.py ├── p3aformer_dataset │ ├── __init__.py │ ├── coco.py │ ├── crowdhuman.py │ ├── generic_dataset_test_save_mem.py │ ├── generic_dataset_train.py │ ├── mot15_val_save_mem.py │ ├── mot17_train.py │ ├── mot17_val_save_mem.py │ └── mot20_val_save_mem.py ├── p3aformer_eval.py ├── panoptic_eval.py ├── samplers.py ├── static_detmot.py ├── torchvision_datasets │ ├── __init__.py │ └── coco.py └── transforms.py ├── engine.py ├── eval.py ├── exps ├── figs ├── P3AFormerModel_v12.png ├── model_mind_flow.png └── pixelwise_association_v8.png ├── interpolation.py ├── main.py ├── models ├── __init__.py ├── backbone.py ├── d2_p3aformer │ ├── __init__.py │ ├── d2_p3aformer_model.py │ ├── d2_postprocess.py │ ├── mask2former_modeling │ │ ├── __init__.py │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ └── swin.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ ├── meta_arch │ │ │ ├── __init__.py │ │ │ ├── mask_former_head.py │ │ │ └── per_pixel_baseline.py │ │ ├── pixel_decoder │ │ │ ├── __init__.py │ │ │ ├── fpn.py │ │ │ ├── msdeformattn.py │ │ │ └── ops │ │ │ │ ├── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── make.sh │ │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── setup.py │ │ │ │ ├── src │ │ │ │ ├── cpu │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ ├── cuda │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ ├── ms_deform_attn.h │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── mask2former_transformer_decoder.py │ │ │ ├── maskformer_transformer_decoder.py │ │ │ ├── position_encoding.py │ │ │ └── transformer.py │ ├── p3aformer_deformable_transformer.py │ ├── transcenter_backbone.py │ ├── transcenter_dla.py │ ├── transcenter_losses │ │ ├── losses.py │ │ └── utils.py │ ├── transcenter_position_encoding.py │ └── transcenter_post_processing │ │ ├── decode.py │ │ ├── post_process.py │ │ └── utils.py ├── deformable_detr.py ├── deformable_transformer.py ├── deformable_transformer_plus.py ├── matcher.py ├── memory_bank.py ├── motr.py ├── ops │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ ├── 
make.sh │ ├── modules │ │ ├── __init__.py │ │ └── ms_deform_attn.py │ ├── server_make.sh │ ├── setup.py │ ├── src │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn_cpu.h │ │ ├── cuda │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ ├── ms_deform_attn_cuda.h │ │ │ └── ms_deform_im2col_cuda.cuh │ │ ├── ms_deform_attn.h │ │ └── vision.cpp │ └── test.py ├── p3aformer │ ├── __init__.py │ ├── p3aformer.py │ ├── p3aformer_backbone.py │ ├── p3aformer_deformable_transformer.py │ ├── p3aformer_dla.py │ ├── p3aformer_liteflownet │ │ ├── __init__.py │ │ ├── correlation_package │ │ │ ├── __init__.py │ │ │ ├── correlation.py │ │ │ ├── correlation_cuda.cc │ │ │ ├── correlation_cuda_kernel.cu │ │ │ ├── correlation_cuda_kernel.cuh │ │ │ ├── pyproject.toml │ │ │ └── setup.py │ │ └── light_flownet.py │ ├── p3aformer_losses │ │ ├── losses.py │ │ └── utils.py │ ├── p3aformer_post_processing │ │ ├── decode.py │ │ ├── post_process.py │ │ └── utils.py │ └── p3aformer_reid │ │ ├── resnet.py │ │ ├── slover.py │ │ └── triplet_loss.py ├── position_encoding.py ├── qim.py ├── segmentation.py └── structures │ ├── __init__.py │ ├── boxes.py │ └── instances.py ├── motr_demo.py ├── preprocess ├── convert_cityperson_to_coco.py ├── convert_crowdhuman_to_coco.py ├── convert_ethz_to_coco.py ├── convert_mot17_to_coco.py ├── convert_mot20_to_coco.py ├── data_preprocess.sh ├── make_mixed_dirs.sh ├── mix_data_ablation.py ├── mix_data_test_mot17.py └── mix_data_test_mot20.py ├── pretrained ├── requirements.txt ├── submit.py ├── tools ├── __init__.py ├── add_train_for_submission.py ├── combine_labels_mot.py ├── gen_labels_MOT17.py ├── gen_labels_mot15.py ├── launch.py ├── run_dist_launch.sh ├── run_dist_slurm.sh ├── transcenter_mot15_to_coco.py ├── visualization_tool.py └── visualize_validation_gt_mot17.py ├── tracker ├── .DS_Store ├── __init__.py ├── byte_tracker │ ├── __init__.py │ ├── byte_tracker.py │ └── mot_online │ │ ├── __init__.py │ │ ├── basetrack.py │ │ ├── kalman_filter.py │ │ └── matching.py ├── common │ ├── __init__.py │ └── track_structure_transfer.py ├── d2_p3aformer │ ├── d2_p3aformer_tracker.py │ └── write_results.py └── dense_tracker │ └── dense_tracker.py └── util ├── __init__.py ├── box_ops.py ├── evaluation.py ├── image.py ├── misc.py ├── motdet_eval.py ├── p3aformer ├── __init__.py ├── p3aformer_misc.py └── tracker_util.py ├── plot_utils.py ├── system.py ├── tool.py └── vis_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Benchmark inference speed of Deformable DETR. 
3 | """ 4 | import os 5 | import time 6 | import argparse 7 | 8 | import torch 9 | 10 | from main import get_args_parser as get_main_args_parser 11 | from models import build_model 12 | from datasets import build_dataset 13 | from util.misc import nested_tensor_from_tensor_list 14 | 15 | 16 | def get_benckmark_arg_parser(): 17 | parser = argparse.ArgumentParser('Benchmark inference speed of Deformable DETR.') 18 | parser.add_argument('--num_iters', type=int, default=300, help='total iters to benchmark speed') 19 | parser.add_argument('--warm_iters', type=int, default=5, help='ignore first several iters that are very slow') 20 | parser.add_argument('--batch_size', type=int, default=1, help='batch size in inference') 21 | parser.add_argument('--resume', type=str, help='load the pre-trained checkpoint') 22 | return parser 23 | 24 | 25 | @torch.no_grad() 26 | def measure_average_inference_time(model, inputs, num_iters=100, warm_iters=5): 27 | ts = [] 28 | for iter_ in range(num_iters): 29 | torch.cuda.synchronize() 30 | t_ = time.perf_counter() 31 | model(inputs) 32 | torch.cuda.synchronize() 33 | t = time.perf_counter() - t_ 34 | if iter_ >= warm_iters: 35 | ts.append(t) 36 | print(ts) 37 | return sum(ts) / len(ts) 38 | 39 | 40 | def benchmark(): 41 | args, _ = get_benckmark_arg_parser().parse_known_args() 42 | main_args = get_main_args_parser().parse_args(_) 43 | assert args.warm_iters < args.num_iters and args.num_iters > 0 and args.warm_iters >= 0 44 | assert args.batch_size > 0 45 | assert args.resume is None or os.path.exists(args.resume) 46 | dataset = build_dataset('val', main_args) 47 | model, _, _ = build_model(main_args) 48 | model.cuda() 49 | model.eval() 50 | if args.resume is not None: 51 | ckpt = torch.load(args.resume, map_location=lambda storage, loc: storage) 52 | model.load_state_dict(ckpt['model']) 53 | inputs = nested_tensor_from_tensor_list([dataset.__getitem__(0)[0].cuda() for _ in range(args.batch_size)]) 54 | t = measure_average_inference_time(model, inputs, args.num_iters, args.warm_iters) 55 | return 1.0 / t * args.batch_size 56 | 57 | 58 | if __name__ == '__main__': 59 | fps = benchmark() 60 | print(f'Inference Speed: {fps:.1f} FPS') 61 | 62 | -------------------------------------------------------------------------------- /configs/detracker_reidV3.yaml: -------------------------------------------------------------------------------- 1 | tracktor: 2 | name: P3AFormer 3 | module_name: MOT 4 | desription: transformer 5 | seed: 12345 6 | network: transformer 7 | 8 | output_dir: . 9 | obj_detect_model: . 10 | 11 | reid_weights: ResNet_iter_25245.pth 12 | 13 | frame_split: [0.0, 1.0] 14 | 15 | tracker: 16 | track_thresh: 0.5 17 | out_thresh: 0.3 18 | pre_thresh: 0.5 19 | new_thresh: 0.3 20 | detection_nms_thresh: 0.5 21 | motion_model: 22 | enabled: False 23 | # average velocity over last n_steps steps 24 | n_steps: 1 25 | # if true, only model the movement of the bounding box center. If false, width and height are also modeled. 
-------------------------------------------------------------------------------- /configs/mot_detectron2/d2_2080ti_p3aformer_train.sh: --------------------------------------------------------------------------------
export OUTPUT_DIR="output/jun3_2080ti"
python d2_main.py \
    --config-file configs/mot_detectron2/p3aformer_small.yaml \
    --num-gpus 8 SOLVER.IMS_PER_BATCH 16 SOLVER.MAX_ITER 83100 OUTPUT_DIR ${OUTPUT_DIR} INPUT.VAL_DATA_DIR "/data/dataset/mot" MODEL.DENSETRACK.ENC_LAYERS 2 MODEL.DENSETRACK.DEC_LAYERS 3
-------------------------------------------------------------------------------- /configs/mot_detectron2/d2_2080ti_p3aformer_val.sh: --------------------------------------------------------------------------------
# validation mot 15
MODEL_DIR="output/jun3_2080ti"
python d2_main.py \
    --config-file configs/mot_detectron2/p3aformer_small.yaml --eval-only \
    --num-gpus 1 DATALOADER.NUM_WORKERS 0 SOLVER.IMS_PER_BATCH 2 SOLVER.MAX_ITER 83100 MODEL.WEIGHTS ${MODEL_DIR}"/model_final.pth" INPUT.VAL_DATA_DIR "/data/dataset/MOT15" MODEL.DENSETRACK.ENC_LAYERS 2 MODEL.DENSETRACK.DEC_LAYERS 3 OUTPUT_DIR ${MODEL_DIR} TRACK.DENSETRACK.TRACK_THRE 0.2

# validation mot17
MODEL_DIR="output/jun3_2080ti"
SPLIT="val_half"
MODEL_NAME=${MODEL_DIR}"/model_final.pth"
python d2_main.py \
    --config-file configs/mot_detectron2/p3aformer_small.yaml --eval-only \
    --num-gpus 1 DATALOADER.NUM_WORKERS 0 SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHTS ${MODEL_NAME} INPUT.VAL_DATA_DIR "/data/dataset/mot" DATASETS.TEST '("MOT17",)' MODEL.DENSETRACK.ENC_LAYERS 2 MODEL.DENSETRACK.DEC_LAYERS 3 OUTPUT_DIR ${MODEL_DIR} TRACK.DENSETRACK.TRACK_THRE 0.5 TRACK.DENSETRACK.PRE_THRE 0.0 TRACK.DENSETRACK.OUT_THRE 0.0

# TRACK.VIS True

# debug only
MODEL_DIR="output/april17_2080ti"
SPLIT="val_half"
MODEL_NAME=${MODEL_DIR}"/model_0039999.pth"
python d2_main.py \
    --config-file configs/mot_detectron2/p3aformer_small.yaml --eval-only \
    --num-gpus 1 DATALOADER.NUM_WORKERS 0 SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHTS ${MODEL_NAME} INPUT.VAL_DATA_DIR "/data/dataset/mot" DATASETS.TEST '("MOT17",)' MODEL.DENSETRACK.ENC_LAYERS 2 MODEL.DENSETRACK.DEC_LAYERS 3 OUTPUT_DIR ${MODEL_DIR} TRACK.DENSETRACK.TRACK_THRE 0.0 TRACK.DENSETRACK.PRE_THRE 0.0 TRACK.DENSETRACK.OUT_THRE 0.0 TRACK.VIS True
-------------------------------------------------------------------------------- /configs/mot_detectron2/d2_debug_p3aformer_train.sh: --------------------------------------------------------------------------------
python d2_main.py \
    --config-file configs/mot_detectron2/p3aformer_small.yaml \
    --num-gpus 1 DATALOADER.NUM_WORKERS 0 DATASETS.TEST '("MOT17",)' INPUT.VAL_DATA_DIR "/data/dataset/mot" TRACK.VIS True
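The d2_*.sh scripts above drive d2_main.py with a --config-file plus trailing KEY VALUE overrides, which is the standard detectron2 launch pattern. A minimal sketch of such an entry point follows; it is an assumption about how d2_main.py is wired, not a copy of it, and add_p3aformer_config is a hypothetical stand-in for whatever configs/mot_detectron2/p3aformer_config_init.py registers (the custom MODEL.DENSETRACK and TRACK keys must exist in the config before merging):

from detectron2.config import get_cfg
from detectron2.engine import default_argument_parser, default_setup, launch


def setup(args):
    cfg = get_cfg()
    # add_p3aformer_config(cfg)  # hypothetical: register MODEL.DENSETRACK / TRACK keys first
    cfg.merge_from_file(args.config_file)  # e.g. p3aformer_small.yaml, which pulls in p3aformer_base.yaml via _BASE_
    cfg.merge_from_list(args.opts)         # the trailing KEY VALUE pairs from the shell scripts
    cfg.freeze()
    default_setup(cfg, args)
    return cfg


def main(args):
    cfg = setup(args)
    # build the model / trainer / evaluator from cfg here
    return cfg


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    launch(main, args.num_gpus, num_machines=args.num_machines,
           machine_rank=args.machine_rank, dist_url=args.dist_url, args=(args,))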
/configs/mot_detectron2/d2_v100_p3aformer_train.sh: -------------------------------------------------------------------------------- 1 | export OUTPUT_DIR="output/June2Mixed" 2 | python d2_main.py \ 3 | --config-file configs/mot_detectron2/p3aformer_big.yaml \ 4 | --num-gpus 4 SOLVER.IMS_PER_BATCH 16 SOLVER.MAX_ITER 83100 OUTPUT_DIR ${OUTPUT_DIR} MODEL.DENSETRACK.ENC_LAYERS 5 MODEL.DENSETRACK.DEC_LAYERS 5 -------------------------------------------------------------------------------- /configs/mot_detectron2/d2_v100_p3aformer_val.sh: -------------------------------------------------------------------------------- 1 | # # validation MOT15 2 | MODEL_NAME="output/feb15_v100/model_final.pth" 3 | python d2_main.py \ 4 | --config-file configs/mot_detectron2/p3aformer_small.yaml --eval-only \ 5 | --num-gpus 1 DATALOADER.NUM_WORKERS 0 SOLVER.IMS_PER_BATCH 1 SOLVER.MAX_ITER 83100 MODEL.WEIGHTS ${MODEL_NAME} INPUT.VAL_DATA_DIR "/data/dataset/MOT15" MODEL.DENSETRACK.ENC_LAYERS 6 MODEL.DENSETRACK.DEC_LAYERS 6 6 | 7 | # open visualization on MOT17 8 | MODEL_DIR="output/June2Mixed" 9 | SPLIT="val_half" 10 | MODEL_NAME=${MODEL_DIR}"/model_final.pth" 11 | OUTPUT_DIR="output/June2Mixed/model_final" 12 | python d2_main.py \ 13 | --config-file configs/mot_detectron2/p3aformer_big.yaml --eval-only \ 14 | --num-gpus 1 DATALOADER.NUM_WORKERS 0 SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHTS ${MODEL_NAME} INPUT.VAL_DATA_DIR "/data/dataset/mot" DATASETS.TEST '("MOT17",)' MODEL.DENSETRACK.ENC_LAYERS 5 MODEL.DENSETRACK.DEC_LAYERS 5 OUTPUT_DIR ${OUTPUT_DIR} 15 | # TRACK.VIS True 16 | -------------------------------------------------------------------------------- /configs/mot_detectron2/p3aformer_base.yaml: -------------------------------------------------------------------------------- 1 | # MODEL: 2 | # BACKBONE: 3 | # FREEZE_AT: 0 4 | # NAME: "build_resnet_backbone" 5 | # WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | # PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | # PIXEL_STD: [58.395, 57.120, 57.375] 8 | # RESNETS: 9 | # DEPTH: 50 10 | # STEM_TYPE: "basic" # not used 11 | # STEM_OUT_CHANNELS: 64 12 | # STRIDE_IN_1X1: False 13 | # OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # # NORM: "SyncBN" 15 | # RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: # not used 17 | TRAIN: ("coco_2017_train_panoptic",) # not used 18 | TEST: ("MOT15",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | LR_BACKBONE: 2e-5 21 | OUTPUT_DIR: "output/debug" 22 | # IMS_PER_BATCH: 16 23 | # BASE_LR: 0.0001 24 | # STEPS: (327778, 355092) 25 | # MAX_ITER: 368750 26 | # WARMUP_FACTOR: 1.0 27 | # WARMUP_ITERS: 10 28 | # WEIGHT_DECAY: 0.05 29 | # OPTIMIZER: "ADAMW" 30 | # BACKBONE_MULTIPLIER: 0.1 31 | # CLIP_GRADIENTS: 32 | # ENABLED: True 33 | # CLIP_TYPE: "full_model" 34 | # CLIP_VALUE: 0.01 35 | # NORM_TYPE: 2.0 36 | # AMP: 37 | # ENABLED: True 38 | # INPUT: 39 | # IMAGE_SIZE: 1024 40 | # MIN_SCALE: 0.1 41 | # MAX_SCALE: 2.0 42 | # FORMAT: "RGB" 43 | # DATASET_MAPPER_NAME: "coco_panoptic_lsj" 44 | # TEST: 45 | # EVAL_PERIOD: 5000 46 | # DATALOADER: 47 | # FILTER_EMPTY_ANNOTATIONS: True 48 | # NUM_WORKERS: 4 49 | VERSION: 2 50 | -------------------------------------------------------------------------------- /configs/mot_detectron2/p3aformer_big.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: p3aformer_base.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "D2P3AFormer" 4 | DENSETRACK: 5 | ENC_LAYERS: 2 6 | DEC_LAYERS: 2 7 | NUM_FEATURE_LEVELS: 4 8 | 
DIM_FEEDFORWARD: 1024 9 | HIDDEN_DIM: 256 10 | POSITION_EMBEDDING: "sine" 11 | BACKBONE: "resnet50" 12 | DILATION: False 13 | DROPOUT: 0.1 14 | DEC_N_POINTS: 4 15 | ENC_N_POINTS: 4 16 | TRACKING: True 17 | SAME_AUG_PRE: True 18 | PRE_HM: True 19 | HM_WEIGHT: 1.0 20 | OFF_WEIGHT: 1.0 21 | WH_WEIGHT: 0.1 22 | BOXES_WEIGHT: 0.5 23 | GIOU_WEIGHT: 0.4 24 | CT_OFFSET_WEIGHT: 0.1 25 | TRACKING_WEIGHT: 1.0 26 | NORM_FACTOR: 1.0 27 | DEFAULT_RESOLUTION: [640, 1088] 28 | SOLVER: 29 | OPTIMIZER: "ADAMW" 30 | AUX_LOSS: False 31 | IMS_PER_BATCH: 2 32 | BASE_LR: 1e-4 33 | MAX_ITER: 160000 34 | WARMUP_FACTOR: 1.0 35 | WARMUP_ITERS: 0 36 | WEIGHT_DECAY: 0.05 37 | OPTIMIZER: "ADAMW" 38 | LR_SCHEDULER_NAME: "WarmupPolyLR" 39 | BACKBONE_MULTIPLIER: 0.1 40 | # ''' 41 | # python main.py \ 42 | # --meta_arch p3aformer \ 43 | # --data_dir /data/dataset/mix_det \ 44 | # --dataset_name MOT17 \ 45 | # --dataset_file p3aformer_mixed \ 46 | # --batch_size=2 \ 47 | # --output_dir=./output/whole_MOT17_from_CH \ 48 | # --num_workers=16 \ 49 | # --pre_hm \ 50 | # --tracking \ 51 | # --same_aug_pre \ 52 | # --image_blur_aug \ 53 | # --lr 1e-4 \ 54 | # --lr_backbone_names ["backbone.0"] \ 55 | # --lr_backbone 2e-5 \ 56 | # --lr_linear_proj_names ['reference_points', 'sampling_offsets',] \ 57 | # --lr_linear_proj_mult 0.1 \ 58 | # --lr_drop 40 \ 59 | # --epochs 5 \ 60 | # --weight_decay 1e-4 \ 61 | # --clip_max_norm 0.1 \ 62 | # --backbone 'resnet50' \ 63 | # --position_embedding 'sine' \ 64 | # --num_feature_levels 3 \ 65 | # --enc_layers 2 \ 66 | # --dec_layers 2 \ 67 | # --dim_feedforward 1024 \ 68 | # --hidden_dim 256 \ 69 | # --shift 0.05 \ 70 | # --scale 0.05 \ 71 | # --rotate 0 \ 72 | # --flip 0.5 \ 73 | # --hm_disturb 0.05 \ 74 | # --lost_disturb 0.4 \ 75 | # --fp_disturb 0.1 \ 76 | # --track_thresh 0.3 \ 77 | # --new_thresh 0.3 \ 78 | # --ltrb_amodal_weight 0.1 79 | # ''' 80 | # SEM_SEG_HEAD: 81 | # NAME: "MaskFormerHead" 82 | # IN_FEATURES: ["res2", "res3", "res4", "res5"] 83 | # IGNORE_VALUE: 255 84 | # NUM_CLASSES: 133 85 | # LOSS_WEIGHT: 1.0 86 | # CONVS_DIM: 256 87 | # MASK_DIM: 256 88 | # MASK_DIM: 256 89 | # NORM: "GN" 90 | # # pixel decoder 91 | # PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 92 | # IN_FEATURES: ["res2", "res3", "res4", "res5"] 93 | # DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 94 | # COMMON_STRIDE: 4 95 | # TRANSFORMER_ENC_LAYERS: 6 96 | # MASK_FORMER: 97 | # TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 98 | # TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 99 | # DEEP_SUPERVISION: True 100 | # NO_OBJECT_WEIGHT: 0.1 101 | # CLASS_WEIGHT: 2.0 102 | # MASK_WEIGHT: 5.0 103 | # DICE_WEIGHT: 5.0 104 | # HIDDEN_DIM: 256 105 | # NUM_OBJECT_QUERIES: 100 106 | # NHEADS: 8 107 | # DROPOUT: 0.0 108 | # DIM_FEEDFORWARD: 2048 109 | # ENC_LAYERS: 0 110 | # PRE_NORM: False 111 | # ENFORCE_INPUT_PROJ: False 112 | # SIZE_DIVISIBILITY: 32 113 | # DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 114 | # TRAIN_NUM_POINTS: 12544 115 | # OVERSAMPLE_RATIO: 3.0 116 | # IMPORTANCE_SAMPLE_RATIO: 0.75 117 | # TEST: 118 | # SEMANTIC_ON: True 119 | # INSTANCE_ON: True 120 | # PANOPTIC_ON: True 121 | # OVERLAP_THRESHOLD: 0.8 122 | # OBJECT_MASK_THRESHOLD: 0.8 123 | -------------------------------------------------------------------------------- /configs/mot_detectron2/p3aformer_small.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: p3aformer_base.yaml 2 | MODEL: 3 | META_ARCHITECTURE: 
"D2P3AFormer" 4 | DENSETRACK: 5 | ENC_LAYERS: 2 6 | DEC_LAYERS: 2 7 | NUM_FEATURE_LEVELS: 4 8 | DIM_FEEDFORWARD: 1024 9 | HIDDEN_DIM: 256 10 | POSITION_EMBEDDING: "sine" 11 | BACKBONE: "resnet50" 12 | DILATION: False 13 | DROPOUT: 0.1 14 | DEC_N_POINTS: 4 15 | ENC_N_POINTS: 4 16 | TRACKING: True 17 | SAME_AUG_PRE: True 18 | PRE_HM: True 19 | HM_WEIGHT: 1.0 20 | OFF_WEIGHT: 1.0 21 | WH_WEIGHT: 0.1 22 | BOXES_WEIGHT: 0.5 23 | GIOU_WEIGHT: 0.4 24 | CT_OFFSET_WEIGHT: 0.1 25 | TRACKING_WEIGHT: 1.0 26 | NORM_FACTOR: 1.0 27 | DEFAULT_RESOLUTION: [640, 1088] 28 | SOLVER: 29 | OPTIMIZER: "ADAMW" 30 | AUX_LOSS: False 31 | IMS_PER_BATCH: 2 32 | BASE_LR: 1e-4 33 | MAX_ITER: 160000 34 | WARMUP_FACTOR: 1.0 35 | WARMUP_ITERS: 0 36 | WEIGHT_DECAY: 0.05 37 | OPTIMIZER: "ADAMW" 38 | LR_SCHEDULER_NAME: "WarmupPolyLR" 39 | BACKBONE_MULTIPLIER: 0.1 40 | # ''' 41 | # python main.py \ 42 | # --meta_arch p3aformer \ 43 | # --data_dir /data/dataset/mix_det \ 44 | # --dataset_name MOT17 \ 45 | # --dataset_file p3aformer_mixed \ 46 | # --batch_size=2 \ 47 | # --output_dir=./output/whole_MOT17_from_CH \ 48 | # --num_workers=16 \ 49 | # --pre_hm \ 50 | # --tracking \ 51 | # --same_aug_pre \ 52 | # --image_blur_aug \ 53 | # --lr 1e-4 \ 54 | # --lr_backbone_names ["backbone.0"] \ 55 | # --lr_backbone 2e-5 \ 56 | # --lr_linear_proj_names ['reference_points', 'sampling_offsets',] \ 57 | # --lr_linear_proj_mult 0.1 \ 58 | # --lr_drop 40 \ 59 | # --epochs 5 \ 60 | # --weight_decay 1e-4 \ 61 | # --clip_max_norm 0.1 \ 62 | # --backbone 'resnet50' \ 63 | # --position_embedding 'sine' \ 64 | # --num_feature_levels 3 \ 65 | # --enc_layers 2 \ 66 | # --dec_layers 2 \ 67 | # --dim_feedforward 1024 \ 68 | # --hidden_dim 256 \ 69 | # --shift 0.05 \ 70 | # --scale 0.05 \ 71 | # --rotate 0 \ 72 | # --flip 0.5 \ 73 | # --hm_disturb 0.05 \ 74 | # --lost_disturb 0.4 \ 75 | # --fp_disturb 0.1 \ 76 | # --track_thresh 0.3 \ 77 | # --new_thresh 0.3 \ 78 | # --ltrb_amodal_weight 0.1 79 | # ''' 80 | # SEM_SEG_HEAD: 81 | # NAME: "MaskFormerHead" 82 | # IN_FEATURES: ["res2", "res3", "res4", "res5"] 83 | # IGNORE_VALUE: 255 84 | # NUM_CLASSES: 133 85 | # LOSS_WEIGHT: 1.0 86 | # CONVS_DIM: 256 87 | # MASK_DIM: 256 88 | # MASK_DIM: 256 89 | # NORM: "GN" 90 | # # pixel decoder 91 | # PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 92 | # IN_FEATURES: ["res2", "res3", "res4", "res5"] 93 | # DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 94 | # COMMON_STRIDE: 4 95 | # TRANSFORMER_ENC_LAYERS: 6 96 | # MASK_FORMER: 97 | # TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 98 | # TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 99 | # DEEP_SUPERVISION: True 100 | # NO_OBJECT_WEIGHT: 0.1 101 | # CLASS_WEIGHT: 2.0 102 | # MASK_WEIGHT: 5.0 103 | # DICE_WEIGHT: 5.0 104 | # HIDDEN_DIM: 256 105 | # NUM_OBJECT_QUERIES: 100 106 | # NHEADS: 8 107 | # DROPOUT: 0.0 108 | # DIM_FEEDFORWARD: 2048 109 | # ENC_LAYERS: 0 110 | # PRE_NORM: False 111 | # ENFORCE_INPUT_PROJ: False 112 | # SIZE_DIVISIBILITY: 32 113 | # DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 114 | # TRAIN_NUM_POINTS: 12544 115 | # OVERSAMPLE_RATIO: 3.0 116 | # IMPORTANCE_SAMPLE_RATIO: 0.75 117 | # TEST: 118 | # SEMANTIC_ON: True 119 | # INSTANCE_ON: True 120 | # PANOPTIC_ON: True 121 | # OVERLAP_THRESHOLD: 0.8 122 | # OBJECT_MASK_THRESHOLD: 0.8 123 | -------------------------------------------------------------------------------- /configs/reid.yaml: -------------------------------------------------------------------------------- 1 | 
reid: 2 | name: test 3 | module_name: reid 4 | desription: 5 | seed: 12345 6 | # smth like MOT_train, KITTI_train_Pedestrian 7 | db_train: mot_reid_small_train 8 | db_val: False 9 | 10 | model_args: 11 | # Recommended for loss: batch_all, batch_hard 12 | # Unstable, no guarantee they are working: weighted_triplet, cross_entropy 13 | loss: batch_hard 14 | margin: 0.2 15 | # Plot prec at k to tensorboard, 0 for off 16 | prec_at_k: 3 17 | 18 | solver: 19 | optim: Adam 20 | optim_args: 21 | lr: 0.0003 22 | weight_decay: 0.0000 23 | 24 | dataloader: 25 | # all targets with visibility lower than this are filtered out, for kitti set it to 26 | # a sequence with maximal [truncation, occlusion] levels 27 | vis_threshold: 0.3 28 | P: 18 29 | K: 4 30 | # limit maximum number of images per identity 31 | max_per_person: 1000 32 | crop_H: 256 33 | crop_W: 128 34 | # center: just a center crop, random: random crop and 0.5 horizontal flip probability 35 | transform: random 36 | normalize_mean: 37 | - 0.485 38 | - 0.456 39 | - 0.406 40 | normalize_std: 41 | - 0.229 42 | - 0.224 43 | - 0.225 44 | 45 | cnn: 46 | output_dim: 128 -------------------------------------------------------------------------------- /configs/standard/2080ti_debug_train_coco.sh: -------------------------------------------------------------------------------- 1 | python main.py \ 2 | --dataset_name MOT17 --dataset_file coco \ 3 | --output_dir=./output/jul19_whole_coco --batch_size=1 --num_workers=0 --pre_hm --tracking --data_dir=/data/dataset/coco --scale 0.05 --shift 0.05 --flip 0.5 --meta_arch p3aformer --resume="" --num_feature_levels 3 --enc_layers 2 --dec_layers 2 -------------------------------------------------------------------------------- /configs/standard/2080ti_debug_train_crowdhuman.sh: -------------------------------------------------------------------------------- 1 | python main.py --output_dir=./output/debug_train_ch \ 2 | --dataset_name MOT17 --dataset_file crowdHuman --meta_arch p3aformer --batch_size=1 --num_workers=1 --pre_hm --tracking --data_dir=/data/dataset/crowdhuman --num_feature_levels 3 --enc_layers 2 --dec_layers 2 -------------------------------------------------------------------------------- /configs/standard/2080ti_debug_train_mot17.sh: -------------------------------------------------------------------------------- 1 | # debug, new dataset, reduced model size 2 | python main.py \ 3 | --meta_arch p3aformer \ 4 | --data_dir /data/dataset/mot \ 5 | --dataset_name MOT17 \ 6 | --dataset_file p3aformer_mot \ 7 | --batch_size=2 \ 8 | --output_dir=./output/debug \ 9 | --num_workers=20 \ 10 | --resume="" \ 11 | --pre_hm \ 12 | --tracking \ 13 | --same_aug_pre \ 14 | --image_blur_aug \ 15 | --lr 1e-4 \ 16 | --lr_backbone_names ["backbone.0"] \ 17 | --lr_backbone 2e-5 \ 18 | --lr_linear_proj_names ['reference_points', 'sampling_offsets',] \ 19 | --lr_linear_proj_mult 0.1 \ 20 | --lr_drop 40 \ 21 | --epochs 23 \ 22 | --weight_decay 1e-4 \ 23 | --clip_max_norm 0.1 \ 24 | --backbone 'resnet50' \ 25 | --position_embedding 'sine' \ 26 | --num_feature_levels 3 \ 27 | --enc_layers 2 \ 28 | --dec_layers 2 \ 29 | --dim_feedforward 1024 \ 30 | --hidden_dim 256 \ 31 | --shift 0.05 \ 32 | --scale 0.05 \ 33 | --rotate 0 \ 34 | --flip 0.5 \ 35 | --hm_disturb 0.05 \ 36 | --lost_disturb 0.4 \ 37 | --fp_disturb 0.1 \ 38 | --track_thresh 0.3 \ 39 | --new_thresh 0.3 \ 40 | --ltrb_amodal_weight 0.1 -------------------------------------------------------------------------------- /configs/standard/interpolation.sh: 
-------------------------------------------------------------------------------- 1 | # MOT 15 2 | EXP_DIR=exps/p3aformer_trained 3 | EXP_ID='' 4 | python3 interpolation.py \ 5 | --dataset_name MOT15 \ 6 | --data_dir /data/dataset/MOT15/ \ 7 | --input_txt_dir ${EXP_DIR}/${EXP_ID}/txt \ 8 | --output_txt_dir ${EXP_DIR}/${EXP_ID}/txt_interpolated 9 | 10 | # MOT 17 11 | EXP_DIR=exps/p3aformer_trained 12 | EXP_ID='p3aformer_trained' 13 | python3 interpolation.py \ 14 | --dataset_name MOT17 \ 15 | --data_dir /data/dataset/mot/ \ 16 | --input_txt_dir ${EXP_DIR}/${EXP_ID}/txt \ 17 | --output_txt_dir ${EXP_DIR}/${EXP_ID}/txt_interpolated -------------------------------------------------------------------------------- /configs/standard/v100_mot17_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch \ 2 | --nproc_per_node=8 --use_env main.py \ 3 | --dataset_name MOT17 --dataset_file coco \ 4 | --output_dir=./output/jul19_whole_coco --batch_size=3 --num_workers=20 --pre_hm \ 5 | --tracking --data_dir=/data/dataset/coco --scale 0.05 --shift 0.05 --flip 0.5 --meta_arch p3aformer --resume="" -------------------------------------------------------------------------------- /configs/standard/v100_mot17_crowdhuman.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch \ 2 | --nproc_per_node=8 --use_env main.py \ 3 | --dataset_name MOT17 --dataset_file crowdHuman --output_dir=./output/jul14_whole_ch_from_COCO --batch_size=1 \ 4 | --num_workers=4 --resume=./output/whole_coco/checkpoint0049.pth --pre_hm --tracking \ 5 | --data_dir=/data/dataset/crowdhuman --meta_arch p3aformer 6 | -------------------------------------------------------------------------------- /configs/standard/v100_mot17_fine_tune_mot17.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch --nproc_per_node=8 \ 2 | --use_env main.py \ 3 | --meta_arch p3aformer \ 4 | --data_dir /data/dataset/mot \ 5 | --dataset_name MOT17 \ 6 | --dataset_file p3aformer_mot \ 7 | --batch_size=3 \ 8 | --output_dir=./output/jul16_mot17_finetune \ 9 | --num_workers=20 \ 10 | --resume=output/jul14_whole_ch_from_COCO/checkpoint0049.pth \ 11 | --pre_hm \ 12 | --tracking \ 13 | --same_aug_pre \ 14 | --image_blur_aug \ 15 | --lr 1e-4 \ 16 | --lr_backbone_names ["backbone.0"] \ 17 | --lr_backbone 2e-5 \ 18 | --lr_linear_proj_names ['reference_points', 'sampling_offsets',] \ 19 | --lr_linear_proj_mult 0.1 \ 20 | --lr_drop 40 \ 21 | --epochs 50 \ 22 | --weight_decay 1e-4 \ 23 | --clip_max_norm 0.1 \ 24 | --backbone 'resnet50' \ 25 | --position_embedding 'sine' \ 26 | --num_feature_levels 4 \ 27 | --enc_layers 6 \ 28 | --dec_layers 6 \ 29 | --dim_feedforward 1024 \ 30 | --hidden_dim 256 \ 31 | --shift 0.05 \ 32 | --scale 0.05 \ 33 | --rotate 0 \ 34 | --flip 0.5 \ 35 | --hm_disturb 0.05 \ 36 | --lost_disturb 0.4 \ 37 | --fp_disturb 0.1 \ 38 | --track_thresh 0.3 \ 39 | --new_thresh 0.3 \ 40 | --ltrb_amodal_weight 0.1 -------------------------------------------------------------------------------- /configs/standard/v100_submit_mot17.sh: -------------------------------------------------------------------------------- 1 | EXP_DIR=output/july5_mot17_finetune 2 | EXP_ID='Jul7WholeFineProcedure' 3 | MODEL_NAME=checkpoint0049.pth # our trained 4 | python3 eval.py \ 5 | --meta_arch p3aformer \ 6 | --dataset_file e2e_joint \ 7 | --dataset_name MOT17 \ 8 | --epoch 200 \ 9 | 
    --with_box_refine \
    --lr_drop 100 \
    --lr 2e-4 \
    --lr_backbone 2e-5 \
    --pretrained ${EXP_DIR}/${MODEL_NAME} \
    --output_dir ${EXP_DIR}/${EXP_ID} \
    --batch_size 1 \
    --sample_mode 'random_interval' \
    --sample_interval 10 \
    --sampler_steps 50 90 120 \
    --sampler_lengths 2 3 4 5 \
    --update_query_pos \
    --merger_dropout 0 \
    --dropout 0 \
    --random_drop 0.1 \
    --fp_ratio 0.3 \
    --query_interaction_layer 'QIM' \
    --extra_track_attn \
    --resume ${EXP_DIR}/${MODEL_NAME} \
    --mot_path datasets \
    --detr_path ${EXP_DIR}/${MODEL_NAME} \
    --reid_path ${EXP_DIR}/ResNet_iter_25245.pth \
    --data_dir=/data/dataset/mot/ \
    --track_thre 0.5 \
    --low_thre 0.2 \
    --submit
-------------------------------------------------------------------------------- /configs/standard/v100_test_mot15.sh: --------------------------------------------------------------------------------
# validate our trained
# for MOT15
EXP_DIR=output/feb9_long_epoch
EXP_ID='mot17_p3aformer_long_epoch'
MODEL_NAME='checkpoint0199.pth'
python3 eval.py \
    --meta_arch p3aformer \
    --dataset_name MOT15 \
    --epoch 200 \
    --with_box_refine \
    --lr_drop 100 \
    --lr 2e-4 \
    --lr_backbone 2e-5 \
    --pretrained ${EXP_DIR}/${MODEL_NAME} \
    --output_dir ${EXP_DIR}/${EXP_ID} \
    --batch_size 1 \
    --sample_mode 'random_interval' \
    --sample_interval 10 \
    --sampler_steps 50 90 120 \
    --sampler_lengths 2 3 4 5 \
    --update_query_pos \
    --merger_dropout 0 \
    --dropout 0 \
    --random_drop 0.1 \
    --fp_ratio 0.3 \
    --query_interaction_layer 'QIM' \
    --extra_track_attn \
    --resume ${EXP_DIR}/${MODEL_NAME} \
    --mot_path datasets \
    --detr_path ${EXP_DIR}/${MODEL_NAME} \
    --reid_path ${EXP_DIR}/ResNet_iter_25245.pth \
    --data_dir=/data/dataset/MOT15/ \
    --track_thre 0.65 \
    --low_thre 0.2 \
    --first_assign_thre 0.9 \
    --second_assign_thre 0.5
-------------------------------------------------------------------------------- /configs/standard/v100_test_mot17.sh: --------------------------------------------------------------------------------
# for MOT17
EXP_DIR=output/jul16_mot17_finetune
EXP_ID='Jul18Validate'
MODEL_NAME=checkpoint0049.pth # our trained
python3 eval.py \
    --dataset_file p3aformer_mot \
    --meta_arch p3aformer \
    --dataset_name MOT17 \
    --epoch 200 \
    --with_box_refine \
    --lr_drop 100 \
    --lr 2e-4 \
    --lr_backbone 2e-5 \
    --pretrained ${EXP_DIR}/${MODEL_NAME} \
    --output_dir ${EXP_DIR}/${EXP_ID} \
    --batch_size 1 \
    --sample_mode 'random_interval' \
    --sample_interval 10 \
    --sampler_steps 50 90 120 \
    --sampler_lengths 2 3 4 5 \
    --update_query_pos \
    --merger_dropout 0 \
    --dropout 0 \
    --random_drop 0.1 \
    --fp_ratio 0.3 \
    --query_interaction_layer 'QIM' \
    --extra_track_attn \
    --resume ${EXP_DIR}/${MODEL_NAME} \
    --mot_path datasets \
    --detr_path ${EXP_DIR}/${MODEL_NAME} \
    --reid_path ${EXP_DIR}/ResNet_iter_25245.pth \
    --data_dir=/data/dataset/mot/ \
    --track_thre 0.5 \
    --low_thre 0.2
-------------------------------------------------------------------------------- /datasets/__init__.py: --------------------------------------------------------------------------------
import torch.utils.data
from .detmot import build as build_e2e_mot
from .static_detmot import
build as build_e2e_static_mot 4 | from .joint import build as build_e2e_joint 5 | from .torchvision_datasets import CocoDetection 6 | from .byte_mot_half import build as build_byte_mot_val 7 | from .p3aformer_dataset.coco import build as build_coco 8 | from .p3aformer_dataset.mot17_train import build as build_p3aformer_mot 9 | from .p3aformer_dataset.crowdhuman import CrowdHuman, build_crowdhuman 10 | from .p3aformer_dataset.mot17_train import build as build_p3aformer_mot_mixed 11 | 12 | 13 | def get_coco_api_from_dataset(dataset): 14 | for _ in range(10): 15 | # if isinstance(dataset, torchvision.datasets.CocoDetection): 16 | # break 17 | if isinstance(dataset, torch.utils.data.Subset): 18 | dataset = dataset.dataset 19 | if isinstance(dataset, CocoDetection): 20 | return dataset.coco 21 | 22 | 23 | def build_dataset(image_set, args): 24 | if args.dataset_file == "coco": 25 | return build_coco(image_set, args) 26 | if args.dataset_file == "coco_panoptic": 27 | # to avoid making panopticapi required for coco 28 | from .coco_panoptic import build as build_coco_panoptic 29 | 30 | return build_coco_panoptic(image_set, args) 31 | if args.dataset_file == "e2e_joint": # default dataset for MOT task 32 | return build_e2e_joint(image_set, args) 33 | if args.dataset_file == "e2e_static_mot": 34 | return build_e2e_static_mot(image_set, args) 35 | if args.dataset_file == "e2e_mot": 36 | return build_e2e_mot(image_set, args) 37 | if args.dataset_file == "byte_mot_half": 38 | return build_byte_mot_val(image_set, args) 39 | if args.dataset_file == "p3aformer_mot": 40 | return build_p3aformer_mot(image_set, args) 41 | if args.dataset_file == "p3aformer_mixed": 42 | return build_p3aformer_mot_mixed(image_set, args) 43 | if args.dataset_file == "crowdHuman": 44 | return build_crowdhuman(image_set, args) 45 | raise ValueError(f"dataset {args.dataset_file} not supported") 46 | -------------------------------------------------------------------------------- /datasets/coco_panoptic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | import json 13 | from pathlib import Path 14 | 15 | import numpy as np 16 | import torch 17 | from PIL import Image 18 | 19 | from panopticapi.utils import rgb2id 20 | from util.box_ops import masks_to_boxes 21 | 22 | from .coco import make_coco_transforms 23 | 24 | 25 | class CocoPanoptic: 26 | def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True): 27 | with open(ann_file, 'r') as f: 28 | self.coco = json.load(f) 29 | 30 | # sort 'images' field so that they are aligned with 'annotations' 31 | # i.e., in alphabetical order 32 | self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id']) 33 | # sanity check 34 | if "annotations" in self.coco: 35 | for img, ann in zip(self.coco['images'], self.coco['annotations']): 36 | assert img['file_name'][:-4] == ann['file_name'][:-4] 37 | 38 | self.img_folder = img_folder 39 | self.ann_folder = ann_folder 40 | self.ann_file = ann_file 41 | self.transforms = transforms 42 | self.return_masks = return_masks 43 | 44 | def __getitem__(self, idx): 45 | ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx] 46 | img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg') 47 | ann_path = Path(self.ann_folder) / ann_info['file_name'] 48 | 49 | img = Image.open(img_path).convert('RGB') 50 | w, h = img.size 51 | if "segments_info" in ann_info: 52 | masks = np.asarray(Image.open(ann_path), dtype=np.uint32) 53 | masks = rgb2id(masks) 54 | 55 | ids = np.array([ann['id'] for ann in ann_info['segments_info']]) 56 | masks = masks == ids[:, None, None] 57 | 58 | masks = torch.as_tensor(masks, dtype=torch.uint8) 59 | labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64) 60 | 61 | target = {} 62 | target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]]) 63 | if self.return_masks: 64 | target['masks'] = masks 65 | target['labels'] = labels 66 | 67 | target["boxes"] = masks_to_boxes(masks) 68 | 69 | target['size'] = torch.as_tensor([int(h), int(w)]) 70 | target['orig_size'] = torch.as_tensor([int(h), int(w)]) 71 | if "segments_info" in ann_info: 72 | for name in ['iscrowd', 'area']: 73 | target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']]) 74 | 75 | if self.transforms is not None: 76 | img, target = self.transforms(img, target) 77 | 78 | return img, target 79 | 80 | def __len__(self): 81 | return len(self.coco['images']) 82 | 83 | def get_height_and_width(self, idx): 84 | img_info = self.coco['images'][idx] 85 | height = img_info['height'] 86 | width = img_info['width'] 87 | return height, width 88 | 89 | 90 | def build(image_set, args): 91 | img_folder_root = Path(args.coco_path) 92 | ann_folder_root = Path(args.coco_panoptic_path) 93 | assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist' 94 | assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist' 95 | mode = 'panoptic' 96 | PATHS = { 97 | "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'), 98 | "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'), 99 | } 100 | 101 | img_folder, ann_file = PATHS[image_set] 102 | img_folder_path = img_folder_root / img_folder 103 | ann_folder = ann_folder_root / f'{mode}_{img_folder}' 104 | ann_file = ann_folder_root / ann_file 105 | 106 | dataset = 
CocoPanoptic(img_folder_path, ann_folder, ann_file, 107 | transforms=make_coco_transforms(image_set), return_masks=args.masks) 108 | 109 | return dataset 110 | -------------------------------------------------------------------------------- /datasets/d2_p3aformer_dataset/d2_mot15_val_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import json 6 | import os 7 | 8 | try: 9 | from datasets.d2_p3aformer_dataset.d2_generic_dataset_val import GenericDataset_val 10 | except: 11 | from datasets.d2_p3aformer_dataset.d2_generic_dataset_val import GenericDataset_val 12 | from detectron2.config import configurable 13 | 14 | 15 | class MOT15_val(GenericDataset_val): 16 | num_classes = 1 17 | default_resolution = [640, 1088] 18 | max_objs = 300 19 | class_name = ["person"] 20 | cat_ids = {1: 1} 21 | 22 | @configurable 23 | def __init__(self, data_dir, split, input_w, input_h, output_w, output_h, private): 24 | assert split == "train", "We use MOT15 training split for validation." 25 | img_dir = os.path.join(data_dir, "images", "train") 26 | if split == "train": 27 | ann_path = os.path.join(data_dir, "annotations", "{}.json").format(split) 28 | elif split == "val": 29 | ann_path = os.path.join(data_dir, "annotations", "{}_last25.json").format( 30 | split 31 | ) 32 | else: # testset 33 | ann_path = os.path.join(data_dir, "annotations", "{}.json").format(split) 34 | print(f"==> initializing MOT15 {split} data from ann_path {ann_path}.") 35 | self.is_mot17 = False 36 | self.images = None 37 | super(MOT15_val, self).__init__( 38 | input_w=input_w, 39 | input_h=input_h, 40 | output_w=output_w, 41 | output_h=output_h, 42 | split=split, 43 | ann_path=ann_path, 44 | img_dir=img_dir, 45 | private=private, 46 | ) 47 | # load image list and coco 48 | self.num_samples = len(self.video_list) 49 | print("Loaded {} {} videos.".format(split, self.num_samples)) 50 | 51 | @classmethod 52 | def from_config(cls, cfg): 53 | input_h, input_w = ( 54 | cfg.MODEL.DENSETRACK.DEFAULT_RESOLUTION[0], 55 | cfg.MODEL.DENSETRACK.DEFAULT_RESOLUTION[1], 56 | ) 57 | output_h = input_h // cfg.MODEL.DENSETRACK.DOWN_RATIO 58 | output_w = input_w // cfg.MODEL.DENSETRACK.DOWN_RATIO 59 | ret = { 60 | "data_dir": cfg.INPUT.VAL_DATA_DIR, 61 | "split": "train", 62 | "input_w": input_w, 63 | "input_h": input_h, 64 | "output_w": output_w, 65 | "output_h": output_h, 66 | "private": cfg.TRACK.DENSETRACK.PRIVATE, 67 | } 68 | return ret 69 | 70 | def _to_float(self, x): 71 | return float("{:.2f}".format(x)) 72 | 73 | def _save_results(self, records, fpath): 74 | with open(fpath, "w") as fid: 75 | for record in records: 76 | line = json.dumps(record) + "\n" 77 | fid.write(line) 78 | return fpath 79 | 80 | def convert_eval_format(self, all_bboxes): 81 | detections = [] 82 | person_id = 1 83 | for image_id in all_bboxes: 84 | if type(all_bboxes[image_id]) != type({}): 85 | # newest format 86 | dtboxes = [] 87 | for j in range(len(all_bboxes[image_id])): 88 | item = all_bboxes[image_id][j] 89 | if item["class"] != person_id: 90 | continue 91 | bbox = item["bbox"] 92 | bbox[2] -= bbox[0] 93 | bbox[3] -= bbox[1] 94 | bbox_out = list(map(self._to_float, bbox[0:4])) 95 | detection = { 96 | "tag": 1, 97 | "box": bbox_out, 98 | "score": float("{:.2f}".format(item["score"])), 99 | } 100 | dtboxes.append(detection) 101 | img_info = self.coco.loadImgs(ids=[image_id])[0] 102 | file_name = 
img_info["file_name"] 103 | detections.append({"ID": file_name[:-4], "dtboxes": dtboxes}) 104 | return detections 105 | 106 | def __len__(self): 107 | return self.num_samples 108 | 109 | def save_results(self, results, save_dir): 110 | self._save_results( 111 | self.convert_eval_format(results), 112 | "{}/results_crowdhuman.odgt".format(save_dir), 113 | ) 114 | 115 | def run_eval(self, results, save_dir): 116 | self.save_results(results, save_dir) 117 | try: 118 | os.system( 119 | "python tools/crowdhuman_eval/demo.py " 120 | + "../data/crowdhuman/annotation_val.odgt " 121 | + "{}/results_crowdhuman.odgt".format(save_dir) 122 | ) 123 | except: 124 | print("Crowdhuman evaluation not setup!") 125 | -------------------------------------------------------------------------------- /datasets/data_path/gen_labels_15.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import os 3 | import numpy as np 4 | import cv2 5 | from tqdm import tqdm 6 | 7 | def mkdirs(d): 8 | if not osp.exists(d): 9 | os.makedirs(d) 10 | 11 | seq_root = '/data/workspace/datasets/mot/MOT15/images/train' 12 | label_root = '/data/workspace/datasets/mot/MOT15/labels_with_ids/train' 13 | mkdirs(label_root) 14 | seqs = ['ADL-Rundle-6', 'ETH-Bahnhof', 'KITTI-13', 'PETS09-S2L1', 'TUD-Stadtmitte', 'ADL-Rundle-8', 'KITTI-17', 15 | 'ETH-Pedcross2', 'ETH-Sunnyday', 'TUD-Campus', 'Venice-2'] 16 | 17 | tid_curr = 0 18 | tid_last = -1 19 | for seq in tqdm(seqs): 20 | 21 | # seq_info = open(osp.join(seq_root, seq, 'seqinfo.ini')).read() 22 | # seq_width = int(seq_info[seq_info.find('imWidth=') + 8:seq_info.find('\nimHeight')]) 23 | # seq_height = int(seq_info[seq_info.find('imHeight=') + 9:seq_info.find('\nimExt')]) 24 | 25 | all_imgs = os.listdir(osp.join(seq_root, seq, 'img1')) 26 | fm = cv2.imread(osp.join(seq_root, seq, 'img1', all_imgs[0])) 27 | seq_height, seq_width, c = fm.shape 28 | 29 | gt_txt = osp.join(seq_root, seq, 'gt', 'gt.txt') 30 | gt = np.loadtxt(gt_txt, dtype=np.float64, delimiter=',') 31 | idx = np.lexsort(gt.T[:2, :]) 32 | gt = gt[idx, :] 33 | 34 | seq_label_root = osp.join(label_root, seq, 'img1') 35 | mkdirs(seq_label_root) 36 | 37 | for fid, tid, x, y, w, h, mark, _, _, _ in gt: 38 | if mark == 0: 39 | continue 40 | fid = int(fid) 41 | tid = int(tid) 42 | if not tid == tid_last: 43 | tid_curr += 1 44 | tid_last = tid 45 | x += w / 2 46 | y += h / 2 47 | label_fpath = osp.join(seq_label_root, '{:06d}.txt'.format(fid)) 48 | label_str = '0 {:d} {:.6f} {:.6f} {:.6f} {:.6f}\n'.format( 49 | tid_curr, x / seq_width, y / seq_height, w / seq_width, h / seq_height) 50 | with open(label_fpath, 'a') as f: 51 | f.write(label_str) -------------------------------------------------------------------------------- /datasets/data_path/gen_labels_16.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import os 3 | import numpy as np 4 | def mkdirs(d): 5 | if not osp.exists(d): 6 | os.makedirs(d) 7 | 8 | seq_root = '/data/workspace/datasets/mot/MOT16/images/train' 9 | label_root = '/data/workspace/datasets/mot/MOT16/labels_with_ids/train' 10 | mkdirs(label_root) 11 | seqs = [s for s in os.listdir(seq_root)] 12 | 13 | tid_curr = 0 14 | tid_last = -1 15 | for seq in seqs: 16 | seq_info = open(osp.join(seq_root, seq, 'seqinfo.ini')).read() 17 | seq_width = int(seq_info[seq_info.find('imWidth=') + 8:seq_info.find('\nimHeight')]) 18 | seq_height = int(seq_info[seq_info.find('imHeight=') + 9:seq_info.find('\nimExt')]) 
19 | 20 | gt_txt = osp.join(seq_root, seq, 'gt', 'gt.txt') 21 | gt = np.loadtxt(gt_txt, dtype=np.float64, delimiter=',') 22 | idx = np.lexsort(gt.T[:2, :]) 23 | gt = gt[idx, :] 24 | 25 | seq_label_root = osp.join(label_root, seq, 'img1') 26 | mkdirs(seq_label_root) 27 | 28 | for fid, tid, x, y, w, h, mark, _, _ in gt: 29 | if mark == 0: 30 | continue 31 | fid = int(fid) 32 | tid = int(tid) 33 | if not tid == tid_last: 34 | tid_curr += 1 35 | tid_last = tid 36 | x += w / 2 37 | y += h / 2 38 | label_fpath = osp.join(seq_label_root, '{:06d}.txt'.format(fid)) 39 | label_str = '0 {:d} {:.6f} {:.6f} {:.6f} {:.6f}\n'.format( 40 | tid_curr, x / seq_width, y / seq_height, w / seq_width, h / seq_height) 41 | with open(label_fpath, 'a') as f: 42 | f.write(label_str) -------------------------------------------------------------------------------- /datasets/data_path/prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import partial 3 | from typing import List 4 | 5 | 6 | def solve_MOT_train(root, year): 7 | assert year in [15, 16, 17] 8 | dataset_path = 'MOT{}/images/train'.format(year) 9 | data_root = os.path.join(root, dataset_path) 10 | if year == 17: 11 | video_paths = [] 12 | for video_name in os.listdir(data_root): 13 | if 'SDP' in video_name: 14 | video_paths.append(video_name) 15 | else: 16 | video_paths = os.listdir(data_root) 17 | 18 | frames = [] 19 | for video_name in video_paths: 20 | files = os.listdir(os.path.join(data_root, video_name, 'img1')) 21 | files.sort() 22 | for i in range(1, len(files) + 1): 23 | frames.append(os.path.join(dataset_path, video_name, 'img1', '%06d.jpg' % i)) 24 | return frames 25 | 26 | 27 | def solve_CUHK(root): 28 | dataset_path = 'ethz/CUHK-SYSU' 29 | data_root = os.path.join(root, dataset_path) 30 | file_names = os.listdir(os.path.join(data_root, 'images')) 31 | file_names.sort() 32 | 33 | frames = [] 34 | for i in range(len(file_names)): 35 | if os.path.exists(os.path.join(root, 'ethz/CUHK-SYSU/labels_with_ids', f's{i + 1}.txt')): 36 | if os.path.exists(os.path.join(root, 'ethz/CUHK-SYSU/images', f's{i + 1}.jpg')): 37 | frames.append(os.path.join('ethz/CUHK-SYSU/images', f's{i + 1}.jpg')) 38 | return frames 39 | 40 | def solve_ETHZ(root): 41 | dataset_path = 'ethz/ETHZ' 42 | data_root = os.path.join(root, dataset_path) 43 | video_paths = [] 44 | for name in os.listdir(data_root): 45 | if name not in ['eth01', 'eth03']: 46 | video_paths.append(name) 47 | 48 | frames = [] 49 | for video_path in video_paths: 50 | files = os.listdir(os.path.join(data_root, video_path, 'images')) 51 | files.sort() 52 | for img_name in files: 53 | if os.path.exists(os.path.join(data_root, video_path, 'labels_with_ids', img_name.replace('.png', '.txt'))): 54 | if os.path.exists(os.path.join(data_root, video_path, 'images', img_name)): 55 | frames.append(os.path.join('ethz/ETHZ', video_path, 'images', img_name)) 56 | return frames 57 | 58 | 59 | def solve_PRW(root): 60 | dataset_path = 'ethz/PRW' 61 | data_root = os.path.join(root, dataset_path) 62 | frame_paths = os.listdir(os.path.join(data_root, 'images')) 63 | frame_paths.sort() 64 | frames = [] 65 | for i in range(len(frame_paths)): 66 | if os.path.exists(os.path.join(data_root, 'labels_with_ids', frame_paths[i].split('.')[0] + '.txt')): 67 | if os.path.exists(os.path.join(data_root, 'images', frame_paths[i])): 68 | frames.append(os.path.join(dataset_path, 'images', frame_paths[i])) 69 | return frames 70 | 71 | 72 | dataset_catalog = { 73 | 'MOT15': 
partial(solve_MOT_train, year=15), 74 | 'MOT16': partial(solve_MOT_train, year=16), 75 | 'MOT17': partial(solve_MOT_train, year=17), 76 | 'CUHK-SYSU': solve_CUHK, 77 | 'ETHZ': solve_ETHZ, 78 | 'PRW': solve_PRW, 79 | } 80 | 81 | 82 | def solve(dataset_list: List[str], root, save_path): 83 | all_frames = [] 84 | for dataset_name in dataset_list: 85 | dataset_frames = dataset_catalog[dataset_name](root) 86 | print("solve {} frames from dataset:{} ".format(len(dataset_frames), dataset_name)) 87 | all_frames.extend(dataset_frames) 88 | print("totally {} frames are solved.".format(len(all_frames))) 89 | with open(save_path, 'w') as f: 90 | for u in all_frames: 91 | line = '{}'.format(u) + '\n' 92 | f.writelines(line) 93 | 94 | root = '/data/workspace/datasets/mot' 95 | save_path = '/data/workspace/detr-mot/datasets/data_path/mot17.train' # for fangao 96 | dataset_list = ['MOT17', ] 97 | 98 | solve(dataset_list, root, save_path) 99 | -------------------------------------------------------------------------------- /datasets/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | import torch 13 | from functools import partial 14 | from models.structures import Instances 15 | 16 | def to_cuda(samples, targets, device): 17 | samples = samples.to(device, non_blocking=True) 18 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 19 | return samples, targets 20 | 21 | 22 | def tensor_to_cuda(tensor: torch.Tensor, device): 23 | return tensor.to(device) 24 | 25 | 26 | def is_tensor_or_instances(data): 27 | return isinstance(data, torch.Tensor) or isinstance(data, Instances) 28 | 29 | 30 | def data_apply(data, check_func, apply_func): 31 | if isinstance(data, dict): 32 | for k in data.keys(): 33 | if check_func(data[k]): 34 | data[k] = apply_func(data[k]) 35 | elif isinstance(data[k], dict) or isinstance(data[k], list): 36 | data_apply(data[k], check_func, apply_func) 37 | else: 38 | raise ValueError() 39 | elif isinstance(data, list): 40 | for i in range(len(data)): 41 | if check_func(data[i]): 42 | data[i] = apply_func(data[i]) 43 | elif isinstance(data[i], dict) or isinstance(data[i], list): 44 | data_apply(data[i], check_func, apply_func) 45 | else: 46 | raise ValueError("invalid type {}".format(type(data[i]))) 47 | else: 48 | raise ValueError("invalid type {}".format(type(data))) 49 | return data 50 | 51 | 52 | def data_dict_to_cuda(data_dict, device): 53 | return data_apply(data_dict, is_tensor_or_instances, partial(tensor_to_cuda, device=device)) 54 | 55 | 56 | class data_prefetcher(): 57 | def __init__(self, loader, device, prefetch=True): 58 | self.loader = iter(loader) 59 | self.prefetch = prefetch 60 | self.device = device 61 | if prefetch: 62 | self.stream = torch.cuda.Stream() 63 | self.preload() 64 | 65 | def preload(self): 66 | try: 67 | 
self.next_samples, self.next_targets = next(self.loader) 68 | except StopIteration: 69 | self.next_samples = None 70 | self.next_targets = None 71 | return 72 | # if record_stream() doesn't work, another option is to make sure device inputs are created 73 | # on the main stream. 74 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 75 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 76 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 77 | # at the time we start copying to next_*: 78 | # self.stream.wait_stream(torch.cuda.current_stream()) 79 | with torch.cuda.stream(self.stream): 80 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 81 | # more code for the alternative if record_stream() doesn't work: 82 | # copy_ will record the use of the pinned source tensor in this side stream. 83 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 84 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 85 | # self.next_input = self.next_input_gpu 86 | # self.next_target = self.next_target_gpu 87 | 88 | # With Amp, it isn't necessary to manually convert data to half. 89 | # if args.fp16: 90 | # self.next_input = self.next_input.half() 91 | # else: 92 | 93 | def next(self): 94 | if self.prefetch: 95 | torch.cuda.current_stream().wait_stream(self.stream) 96 | samples = self.next_samples 97 | targets = self.next_targets 98 | if samples is not None: 99 | samples.record_stream(torch.cuda.current_stream()) 100 | if targets is not None: 101 | for t in targets: 102 | for k, v in t.items(): 103 | v.record_stream(torch.cuda.current_stream()) 104 | self.preload() 105 | else: 106 | try: 107 | samples, targets = next(self.loader) 108 | samples, targets = to_cuda(samples, targets, self.device) 109 | except StopIteration: 110 | print("catch_stop_iter") 111 | samples = None 112 | targets = None 113 | 114 | return samples, targets 115 | -------------------------------------------------------------------------------- /datasets/p3aformer_dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/datasets/p3aformer_dataset/__init__.py -------------------------------------------------------------------------------- /datasets/p3aformer_dataset/coco.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from pycocotools.cocoeval import COCOeval 6 | import json 7 | import os 8 | 9 | try: 10 | from .generic_dataset_train import GenericDataset 11 | except: 12 | from generic_dataset_train import GenericDataset 13 | 14 | 15 | class COCO(GenericDataset): 16 | default_resolution = [640, 1088] 17 | num_categories = 1 18 | class_name = ["person"] 19 | _valid_ids = [1] 20 | cat_ids = {v: i + 1 for i, v in enumerate(_valid_ids)} 21 | num_joints = 17 22 | flip_idx = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]] 23 | edges = [ 24 | [0, 1], 25 | [0, 2], 26 | [1, 3], 27 | [2, 4], 28 | [4, 6], 29 | [3, 5], 30 | [5, 6], 31 | [5, 7], 32 | [7, 9], 33 | [6, 8], 34 | [8, 10], 35 | [6, 12], 36 | [5, 11], 37 | [11, 12], 38 | [12, 14], 39 | [14, 16], 40 | [11, 13], 41 | [13, 15], 42 | ] 43 | max_objs = 
300 44 | 45 | def __init__(self, opt, split): 46 | # load annotations 47 | data_dir = os.path.join(opt.data_dir) 48 | img_dir = os.path.join(data_dir, "{}2017".format(split)) 49 | ann_path = os.path.join( 50 | data_dir, "annotations", "instances_{}2017_person.json" 51 | ).format(split) 52 | 53 | self.images = None 54 | # load image list and coco 55 | super(COCO, self).__init__(opt, split, ann_path, img_dir) 56 | self.sf = 0.3 57 | self.num_samples = len(self.images) 58 | 59 | print("Loaded {} {} samples".format(split, self.num_samples)) 60 | 61 | def _to_float(self, x): 62 | return float("{:.2f}".format(x)) 63 | 64 | def convert_eval_format(self, all_bboxes): 65 | detections = [] 66 | for image_id in all_bboxes: 67 | if type(all_bboxes[image_id]) != type({}): 68 | # newest format 69 | for j in range(len(all_bboxes[image_id])): 70 | item = all_bboxes[image_id][j] 71 | cat_id = item["class"] - 1 72 | category_id = self._valid_ids[cat_id] 73 | bbox = item["bbox"] 74 | bbox[2] -= bbox[0] 75 | bbox[3] -= bbox[1] 76 | bbox_out = list(map(self._to_float, bbox[0:4])) 77 | detection = { 78 | "image_id": int(image_id), 79 | "category_id": int(category_id), 80 | "bbox": bbox_out, 81 | "score": float("{:.2f}".format(item["score"])), 82 | } 83 | detections.append(detection) 84 | return detections 85 | 86 | def __len__(self): 87 | return self.num_samples 88 | 89 | def save_results(self, results, save_dir): 90 | json.dump( 91 | self.convert_eval_format(results), 92 | open("{}/results_coco.json".format(save_dir), "w"), 93 | ) 94 | 95 | def run_eval(self, results, save_dir): 96 | self.save_results(results, save_dir) 97 | coco_dets = self.coco.loadRes("{}/results_coco.json".format(save_dir)) 98 | coco_eval = COCOeval(self.coco, coco_dets, "bbox") 99 | coco_eval.evaluate() 100 | coco_eval.accumulate() 101 | coco_eval.summarize() 102 | 103 | 104 | def build(image_set, args): 105 | d = COCO(args, image_set) 106 | # input output shapes 107 | args.input_h, args.input_w = d.default_resolution[0], d.default_resolution[1] 108 | args.output_h = args.input_h // args.down_ratio 109 | args.output_w = args.input_w // args.down_ratio 110 | args.input_res = max(args.input_h, args.input_w) 111 | args.output_res = max(args.output_h, args.output_w) 112 | # threshold 113 | args.out_thresh = max(args.track_thresh, args.out_thresh) 114 | args.pre_thresh = max(args.track_thresh, args.pre_thresh) 115 | args.new_thresh = max(args.track_thresh, args.new_thresh) 116 | return d 117 | -------------------------------------------------------------------------------- /datasets/p3aformer_dataset/crowdhuman.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import json 6 | import os 7 | 8 | try: 9 | from .generic_dataset_train import GenericDataset 10 | except: 11 | from generic_dataset_train import GenericDataset 12 | 13 | 14 | class CrowdHuman(GenericDataset): 15 | num_classes = 1 16 | num_joints = 17 17 | default_resolution = [640, 1088] 18 | max_objs = 300 19 | class_name = ["person"] 20 | cat_ids = {1: 1} 21 | 22 | def __init__(self, opt, split): 23 | super(CrowdHuman, self).__init__() 24 | data_dir = opt.data_dir 25 | img_dir = os.path.join(data_dir, "Images") 26 | ann_path = os.path.join(data_dir, "annotations", "{}.json").format(split) 27 | 28 | print("==> initializing CrowdHuman {} data.".format(split)) 29 | 30 | self.images = None 31 | # load image list and coco 32 | 
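# Illustrative on-disk layout (an assumption inferred from the img_dir and
# ann_path built above, not a documented requirement):
#
#     <opt.data_dir>/
#         Images/               # CrowdHuman frames
#         annotations/
#             train.json        # COCO-style annotations selected by `split`
#             val.json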
super(CrowdHuman, self).__init__(opt, split, ann_path, img_dir) 33 | self.sf = 0.3 34 | 35 | self.num_samples = len(self.images) 36 | 37 | print("Loaded {} {} samples".format(split, self.num_samples)) 38 | 39 | def _to_float(self, x): 40 | return float("{:.2f}".format(x)) 41 | 42 | def _save_results(self, records, fpath): 43 | with open(fpath, "w") as fid: 44 | for record in records: 45 | line = json.dumps(record) + "\n" 46 | fid.write(line) 47 | return fpath 48 | 49 | def convert_eval_format(self, all_bboxes): 50 | detections = [] 51 | person_id = 1 52 | for image_id in all_bboxes: 53 | if type(all_bboxes[image_id]) != type({}): 54 | # newest format 55 | dtboxes = [] 56 | for j in range(len(all_bboxes[image_id])): 57 | item = all_bboxes[image_id][j] 58 | if item["class"] != person_id: 59 | continue 60 | bbox = item["bbox"] 61 | bbox[2] -= bbox[0] 62 | bbox[3] -= bbox[1] 63 | bbox_out = list(map(self._to_float, bbox[0:4])) 64 | detection = { 65 | "tag": 1, 66 | "box": bbox_out, 67 | "score": float("{:.2f}".format(item["score"])), 68 | } 69 | dtboxes.append(detection) 70 | img_info = self.coco.loadImgs(ids=[image_id])[0] 71 | file_name = img_info["file_name"] 72 | detections.append({"ID": file_name[:-4], "dtboxes": dtboxes}) 73 | return detections 74 | 75 | def __len__(self): 76 | return self.num_samples 77 | 78 | def save_results(self, results, save_dir): 79 | self._save_results( 80 | self.convert_eval_format(results), 81 | "{}/results_crowdhuman.odgt".format(save_dir), 82 | ) 83 | 84 | def run_eval(self, results, save_dir): 85 | self.save_results(results, save_dir) 86 | try: 87 | os.system( 88 | "python tools/crowdhuman_eval/demo.py " 89 | + "../data/crowdhuman/annotation_val.odgt " 90 | + "{}/results_crowdhuman.odgt".format(save_dir) 91 | ) 92 | except: 93 | print("Crowdhuman evaluation not setup!") 94 | 95 | 96 | def build_crowdhuman(image_set, args): 97 | d = CrowdHuman(args, image_set) 98 | # input output shapes 99 | args.input_h, args.input_w = d.default_resolution[0], d.default_resolution[1] 100 | args.output_h = args.input_h // args.down_ratio 101 | args.output_w = args.input_w // args.down_ratio 102 | args.input_res = max(args.input_h, args.input_w) 103 | args.output_res = max(args.output_h, args.output_w) 104 | # threshold 105 | args.out_thresh = max(args.track_thresh, args.out_thresh) 106 | args.pre_thresh = max(args.track_thresh, args.pre_thresh) 107 | args.new_thresh = max(args.track_thresh, args.new_thresh) 108 | args.adaptive_clip = True 109 | return d 110 | -------------------------------------------------------------------------------- /datasets/p3aformer_dataset/mot17_train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import json 6 | import os 7 | try: 8 | from .generic_dataset_train import GenericDataset 9 | except: 10 | from generic_dataset_train import GenericDataset 11 | 12 | 13 | class MOT17(GenericDataset): 14 | num_classes = 1 15 | num_joints = 17 16 | default_resolution = [640, 1088] 17 | max_objs = 300 18 | class_name = ['person'] 19 | cat_ids = {1: 1} 20 | 21 | def __init__(self, opt, split): 22 | super(MOT17, self).__init__() 23 | data_dir = opt.data_dir 24 | if split == 'test': 25 | img_dir = os.path.join( 26 | data_dir, 'test') 27 | else: 28 | img_dir = os.path.join( 29 | data_dir, 'train') 30 | if opt.half_train: 31 | print("==> Using half of the MOT 17 data!") 32 | if split == 'train' and 
not opt.half_train: 33 | ann_path = os.path.join(data_dir, 'annotations_onlySDP', '{}.json').format(split) 34 | else: 35 | ann_path = os.path.join(data_dir, 'annotations_onlySDP', '{}_half.json').format(split) 36 | 37 | print('==> initializing MOT17 {} data.'.format(split)) 38 | 39 | self.images = None 40 | # load image list and coco 41 | super(MOT17, self).__init__(opt, split, ann_path, img_dir) 42 | 43 | self.num_samples = len(self.images) 44 | 45 | print('Loaded {} {} samples'.format(split, self.num_samples)) 46 | 47 | def _to_float(self, x): 48 | return float("{:.2f}".format(x)) 49 | 50 | def _save_results(self, records, fpath): 51 | with open(fpath,'w') as fid: 52 | for record in records: 53 | line = json.dumps(record)+'\n' 54 | fid.write(line) 55 | return fpath 56 | 57 | def convert_eval_format(self, all_bboxes): 58 | detections = [] 59 | person_id = 1 60 | for image_id in all_bboxes: 61 | if type(all_bboxes[image_id]) != type({}): 62 | # newest format 63 | dtboxes = [] 64 | for j in range(len(all_bboxes[image_id])): 65 | item = all_bboxes[image_id][j] 66 | if item['class'] != person_id: 67 | continue 68 | bbox = item['bbox'] 69 | bbox[2] -= bbox[0] 70 | bbox[3] -= bbox[1] 71 | bbox_out = list(map(self._to_float, bbox[0:4])) 72 | detection = { 73 | "tag": 1, 74 | "box": bbox_out, 75 | "score": float("{:.2f}".format(item['score'])) 76 | } 77 | dtboxes.append(detection) 78 | img_info = self.coco.loadImgs(ids=[image_id])[0] 79 | file_name = img_info['file_name'] 80 | detections.append({'ID': file_name[:-4], 'dtboxes': dtboxes}) 81 | return detections 82 | 83 | def __len__(self): 84 | return self.num_samples 85 | 86 | def save_results(self, results, save_dir): 87 | self._save_results(self.convert_eval_format(results), 88 | '{}/results_crowdhuman.odgt'.format(save_dir)) 89 | def run_eval(self, results, save_dir): 90 | self.save_results(results, save_dir) 91 | try: 92 | os.system('python tools/crowdhuman_eval/demo.py ' + \ 93 | '../data/crowdhuman/annotation_val.odgt ' + \ 94 | '{}/results_crowdhuman.odgt'.format(save_dir)) 95 | except: 96 | print('Crowdhuman evaluation not setup!') 97 | 98 | 99 | def build(image_set, args): 100 | d = MOT17(args, image_set) 101 | # input output shapes 102 | args.input_h, args.input_w = d.default_resolution[0], d.default_resolution[1] 103 | args.output_h = args.input_h // args.down_ratio 104 | args.output_w = args.input_w // args.down_ratio 105 | args.input_res = max(args.input_h, args.input_w) 106 | args.output_res = max(args.output_h, args.output_w) 107 | # threshold 108 | args.out_thresh = max(args.track_thresh, args.out_thresh) 109 | args.pre_thresh = max(args.track_thresh, args.pre_thresh) 110 | args.new_thresh = max(args.track_thresh, args.new_thresh) 111 | args.adaptive_clip = True 112 | return d 113 | -------------------------------------------------------------------------------- /datasets/p3aformer_dataset/mot17_val_save_mem.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import json 6 | import os 7 | 8 | try: 9 | from datasets.p3aformer_dataset.generic_dataset_test_save_mem import ( 10 | GenericDataset_val, 11 | ) 12 | except: 13 | from datasets.p3aformer_dataset.generic_dataset_test_save_mem import ( 14 | GenericDataset_val, 15 | ) 16 | 17 | 18 | class MOT17_val(GenericDataset_val): 19 | num_classes = 1 20 | default_resolution = [640, 1088] 21 | max_objs = 300 22 | class_name = 
["person"] 23 | cat_ids = {1: 1} 24 | 25 | def __init__(self, opt, split): 26 | super(MOT17_val, self).__init__() 27 | data_dir = opt.data_dir 28 | if split == "test": 29 | img_dir = os.path.join(data_dir, "test") 30 | else: 31 | img_dir = os.path.join(data_dir, "train") 32 | 33 | if split == "train": 34 | ann_path = os.path.join(data_dir, "annotations", "{}.json").format(split) 35 | elif split == "val": 36 | ann_path = os.path.join(data_dir, "annotations", "{}_last25.json").format( 37 | split 38 | ) 39 | else: # testset 40 | ann_path = os.path.join(data_dir, "annotations", "{}.json").format(split) 41 | 42 | print("ann_path: ", ann_path) 43 | 44 | print("==> initializing MOT17 {} data.".format(split)) 45 | 46 | self.images = None 47 | # load image list and coco 48 | super(MOT17_val, self).__init__(opt, split, ann_path, img_dir) 49 | 50 | self.num_samples = len(self.video_list) 51 | self.is_mot17 = True 52 | print("Loaded {} {} samples".format(split, self.num_samples)) 53 | 54 | def _to_float(self, x): 55 | return float("{:.2f}".format(x)) 56 | 57 | def _save_results(self, records, fpath): 58 | with open(fpath, "w") as fid: 59 | for record in records: 60 | line = json.dumps(record) + "\n" 61 | fid.write(line) 62 | return fpath 63 | 64 | def convert_eval_format(self, all_bboxes): 65 | detections = [] 66 | person_id = 1 67 | for image_id in all_bboxes: 68 | if type(all_bboxes[image_id]) != type({}): 69 | # newest format 70 | dtboxes = [] 71 | for j in range(len(all_bboxes[image_id])): 72 | item = all_bboxes[image_id][j] 73 | if item["class"] != person_id: 74 | continue 75 | bbox = item["bbox"] 76 | bbox[2] -= bbox[0] 77 | bbox[3] -= bbox[1] 78 | bbox_out = list(map(self._to_float, bbox[0:4])) 79 | detection = { 80 | "tag": 1, 81 | "box": bbox_out, 82 | "score": float("{:.2f}".format(item["score"])), 83 | } 84 | dtboxes.append(detection) 85 | img_info = self.coco.loadImgs(ids=[image_id])[0] 86 | file_name = img_info["file_name"] 87 | detections.append({"ID": file_name[:-4], "dtboxes": dtboxes}) 88 | return detections 89 | 90 | def __len__(self): 91 | return self.num_samples 92 | 93 | def save_results(self, results, save_dir): 94 | self._save_results( 95 | self.convert_eval_format(results), 96 | "{}/results_crowdhuman.odgt".format(save_dir), 97 | ) 98 | 99 | def run_eval(self, results, save_dir): 100 | self.save_results(results, save_dir) 101 | try: 102 | os.system( 103 | "python tools/crowdhuman_eval/demo.py " 104 | + "../data/crowdhuman/annotation_val.odgt " 105 | + "{}/results_crowdhuman.odgt".format(save_dir) 106 | ) 107 | except: 108 | print("Crowdhuman evaluation not setup!") 109 | -------------------------------------------------------------------------------- /datasets/p3aformer_dataset/mot20_val_save_mem.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import json 6 | import os 7 | try: 8 | from .generic_dataset_test_save_mem import GenericDataset_val 9 | except: 10 | from generic_dataset_test_save_mem import GenericDataset_val 11 | 12 | 13 | class MOT20_val(GenericDataset_val): 14 | num_classes = 1 15 | default_resolution = [640, 1088] 16 | max_objs = 300 17 | class_name = ['person'] 18 | cat_ids = {1: 1} 19 | 20 | def __init__(self, opt, split): 21 | super(MOT20_val, self).__init__() 22 | data_dir = opt.data_dir 23 | if split == 'test': 24 | img_dir = os.path.join( 25 | data_dir, 'test') 26 | else: 27 | img_dir = 
os.path.join( 28 | data_dir, 'train') 29 | 30 | if split == 'train': 31 | ann_path = os.path.join(data_dir, 'annotations', 32 | '{}.json').format(split) 33 | elif split == 'val': 34 | ann_path = os.path.join(data_dir, 'annotations', 35 | '{}_last25.json').format(split) 36 | else: #testset 37 | ann_path = os.path.join(data_dir, 'annotations', 38 | '{}.json').format(split) 39 | 40 | print("ann_path: ", ann_path) 41 | 42 | print('==> initializing MOT20 {} data.'.format(split)) 43 | 44 | self.images = None 45 | # load image list and coco 46 | super(MOT20_val, self).__init__(opt, split, ann_path, img_dir) 47 | 48 | self.num_samples = len(self.video_list) 49 | 50 | print('Loaded {} {} samples'.format(split, self.num_samples)) 51 | 52 | def _to_float(self, x): 53 | return float("{:.2f}".format(x)) 54 | 55 | def _save_results(self, records, fpath): 56 | with open(fpath,'w') as fid: 57 | for record in records: 58 | line = json.dumps(record)+'\n' 59 | fid.write(line) 60 | return fpath 61 | 62 | def convert_eval_format(self, all_bboxes): 63 | detections = [] 64 | person_id = 1 65 | for image_id in all_bboxes: 66 | if type(all_bboxes[image_id]) != type({}): 67 | # newest format 68 | dtboxes = [] 69 | for j in range(len(all_bboxes[image_id])): 70 | item = all_bboxes[image_id][j] 71 | if item['class'] != person_id: 72 | continue 73 | bbox = item['bbox'] 74 | bbox[2] -= bbox[0] 75 | bbox[3] -= bbox[1] 76 | bbox_out = list(map(self._to_float, bbox[0:4])) 77 | detection = { 78 | "tag": 1, 79 | "box": bbox_out, 80 | "score": float("{:.2f}".format(item['score'])) 81 | } 82 | dtboxes.append(detection) 83 | img_info = self.coco.loadImgs(ids=[image_id])[0] 84 | file_name = img_info['file_name'] 85 | detections.append({'ID': file_name[:-4], 'dtboxes': dtboxes}) 86 | return detections 87 | 88 | def __len__(self): 89 | return self.num_samples 90 | 91 | def save_results(self, results, save_dir): 92 | self._save_results(self.convert_eval_format(results), 93 | '{}/results_crowdhuman.odgt'.format(save_dir)) 94 | def run_eval(self, results, save_dir): 95 | self.save_results(results, save_dir) 96 | try: 97 | os.system('python tools/crowdhuman_eval/demo.py ' + \ 98 | '../data/crowdhuman/annotation_val.odgt ' + \ 99 | '{}/results_crowdhuman.odgt'.format(save_dir)) 100 | except: 101 | print('Crowdhuman evaluation not setup!') 102 | -------------------------------------------------------------------------------- /datasets/panoptic_eval.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | import json 13 | import os 14 | 15 | import util.misc as utils 16 | 17 | try: 18 | from panopticapi.evaluation import pq_compute 19 | except ImportError: 20 | pass 21 | 22 | 23 | class PanopticEvaluator(object): 24 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): 25 | self.gt_json = ann_file 26 | self.gt_folder = ann_folder 27 | if utils.is_main_process(): 28 | if not os.path.exists(output_dir): 29 | os.mkdir(output_dir) 30 | self.output_dir = output_dir 31 | self.predictions = [] 32 | 33 | def update(self, predictions): 34 | for p in predictions: 35 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: 36 | f.write(p.pop("png_string")) 37 | 38 | self.predictions += predictions 39 | 40 | def synchronize_between_processes(self): 41 | all_predictions = utils.all_gather(self.predictions) 42 | merged_predictions = [] 43 | for p in all_predictions: 44 | merged_predictions += p 45 | self.predictions = merged_predictions 46 | 47 | def summarize(self): 48 | if utils.is_main_process(): 49 | json_data = {"annotations": self.predictions} 50 | predictions_json = os.path.join(self.output_dir, "predictions.json") 51 | with open(predictions_json, "w") as f: 52 | f.write(json.dumps(json_data)) 53 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) 54 | return None 55 | -------------------------------------------------------------------------------- /datasets/torchvision_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .coco import CocoDetection 13 | -------------------------------------------------------------------------------- /datasets/torchvision_datasets/coco.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Copy-Paste from torchvision, but add utility of caching images on memory 14 | """ 15 | from torchvision.datasets.vision import VisionDataset 16 | from PIL import Image 17 | import os 18 | import os.path 19 | import tqdm 20 | from io import BytesIO 21 | 22 | 23 | class CocoDetection(VisionDataset): 24 | """`MS Coco Detection `_ Dataset. 25 | Args: 26 | root (string): Root directory where images are downloaded to. 27 | annFile (string): Path to json annotation file. 28 | transform (callable, optional): A function/transform that takes in an PIL image 29 | and returns a transformed version. E.g, ``transforms.ToTensor`` 30 | target_transform (callable, optional): A function/transform that takes in the 31 | target and transforms it. 32 | transforms (callable, optional): A function/transform that takes input sample and its target as entry 33 | and returns a transformed version. 34 | """ 35 | 36 | def __init__(self, root, annFile, transform=None, target_transform=None, transforms=None, 37 | cache_mode=False, local_rank=0, local_size=1): 38 | super(CocoDetection, self).__init__(root, transforms, transform, target_transform) 39 | from pycocotools.coco import COCO 40 | self.coco = COCO(annFile) 41 | self.ids = list(sorted(self.coco.imgs.keys())) 42 | self.cache_mode = cache_mode 43 | self.local_rank = local_rank 44 | self.local_size = local_size 45 | if cache_mode: 46 | self.cache = {} 47 | self.cache_images() 48 | 49 | def cache_images(self): 50 | self.cache = {} 51 | for index, img_id in zip(tqdm.trange(len(self.ids)), self.ids): 52 | if index % self.local_size != self.local_rank: 53 | continue 54 | path = self.coco.loadImgs(img_id)[0]['file_name'] 55 | with open(os.path.join(self.root, path), 'rb') as f: 56 | self.cache[path] = f.read() 57 | 58 | def get_image(self, path): 59 | if self.cache_mode: 60 | if path not in self.cache.keys(): 61 | with open(os.path.join(self.root, path), 'rb') as f: 62 | self.cache[path] = f.read() 63 | return Image.open(BytesIO(self.cache[path])).convert('RGB') 64 | return Image.open(os.path.join(self.root, path)).convert('RGB') 65 | 66 | def __getitem__(self, index): 67 | """ 68 | Args: 69 | index (int): Index 70 | Returns: 71 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. 
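            Example (illustrative): for a detection-style annotation file,
            ``target`` is typically a list of COCO annotation dicts such as
            ``[{"image_id": 42, "category_id": 1, "bbox": [x, y, w, h],
            "area": ..., "iscrowd": 0, "id": ...}, ...]``, where ``bbox`` uses
            the COCO ``[x, y, width, height]`` pixel convention.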
72 | """ 73 | coco = self.coco 74 | img_id = self.ids[index] 75 | ann_ids = coco.getAnnIds(imgIds=img_id) 76 | target = coco.loadAnns(ann_ids) 77 | 78 | path = coco.loadImgs(img_id)[0]['file_name'] 79 | 80 | img = self.get_image(path) 81 | if self.transforms is not None: 82 | img, target = self.transforms(img, target) 83 | 84 | return img, target 85 | 86 | def __len__(self): 87 | return len(self.ids) 88 | -------------------------------------------------------------------------------- /exps: -------------------------------------------------------------------------------- 1 | /data/P3AFormer/exps -------------------------------------------------------------------------------- /figs/P3AFormerModel_v12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/figs/P3AFormerModel_v12.png -------------------------------------------------------------------------------- /figs/model_mind_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/figs/model_mind_flow.png -------------------------------------------------------------------------------- /figs/pixelwise_association_v8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/figs/pixelwise_association_v8.png -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | from .deformable_detr import build as build_deformable_detr 11 | from .motr import build as build_motr 12 | from .p3aformer.p3aformer import build as build_p3aformer 13 | 14 | 15 | def build_model(args): 16 | arch_catalog = { 17 | "deformable_detr": build_deformable_detr, 18 | "motr": build_motr, 19 | "p3aformer": build_p3aformer, 20 | } 21 | assert args.meta_arch in arch_catalog, "invalid arch: {}".format(args.meta_arch) 22 | build_func = arch_catalog[args.meta_arch] 23 | return build_func(args) 24 | -------------------------------------------------------------------------------- /models/d2_p3aformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .d2_p3aformer_model import D2P3AFormer 2 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
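# Shape reference (illustrative; these are the shapes exercised by the ops
# test script later in this folder, not an exhaustive specification):
#   value:                   (N, S, M, D)         flattened multi-scale features
#   value_spatial_shapes:    (L, 2)               (H_l, W_l) of each feature level
#   value_level_start_index: (L,)                 offset of each level within S
#   sampling_locations:      (N, Lq, M, L, P, 2)  normalized sampling points
#   attention_weights:       (N, Lq, M, L, P)     normalized over the last two dims
# with N batch size, S total spatial size, M attention heads, D channels per
# head, L feature levels, Lq queries, and P sampling points per level.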
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 
SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 
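    # Illustrative build commands (paths are an assumption based on this repo's
    # layout; make.sh in this folder runs the same setup.py invocation):
    #
    #     cd models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops
    #     FORCE_CUDA=1 python setup.py build install   # force the CUDA extension
    #     python test.py                               # optional sanity check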
40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
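 * Python-side usage sketch (an assumption based on setup.py and
 * ms_deform_attn_func.py in this repo, which build and import this extension):
 *
 *     import MultiScaleDeformableAttention as MSDA
 *     out = MSDA.ms_deform_attn_forward(
 *         value, spatial_shapes, level_start_index,
 *         sampling_loc, attn_weight, im2col_step)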
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, 
shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
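    Shape note (illustrative): given features x of shape (B, C, H, W) and an
    optional boolean mask of shape (B, H, W), forward() returns a positional
    encoding of shape (B, 2 * num_pos_feats, H, W).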
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /models/d2_p3aformer/transcenter_dla.py: -------------------------------------------------------------------------------- 1 | ## TransCenter has code derived from 2 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 3 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 4 | ## (3) 2020 Facebook. (Apache License Version 2.0: https://github.com/facebookresearch/detr/) 5 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 6 | ## 7 | ## TransCenter uses packages from 8 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 9 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 10 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 11 | ## (4) 2018 Tak-Wai Hui. 
(Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 12 | import torch.nn as nn 13 | from dcn_v2 import DCN 14 | import math 15 | import numpy as np 16 | import torch.nn.functional as F 17 | 18 | BN_MOMENTUM = 0.1 19 | 20 | 21 | class DeformConv(nn.Module): 22 | def __init__(self, chi, cho): 23 | super(DeformConv, self).__init__() 24 | self.actf = nn.Sequential( 25 | nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), 26 | nn.ReLU(inplace=True) 27 | ) 28 | self.conv = DCN(chi, cho, kernel_size=(3,3), stride=1, padding=1, dilation=1, deformable_groups=1) 29 | 30 | def forward(self, x): 31 | x = self.conv(x) 32 | x = self.actf(x) 33 | return x 34 | 35 | DLA_NODE = { 36 | 'dcn': (DeformConv, DeformConv) 37 | } 38 | 39 | 40 | def fill_fc_weights(layers): 41 | for m in layers.modules(): 42 | if isinstance(m, nn.Conv2d): 43 | if m.bias is not None: 44 | nn.init.constant_(m.bias, 0) 45 | 46 | 47 | def fill_up_weights(up): 48 | w = up.weight.data 49 | f = math.ceil(w.size(2) / 2) 50 | c = (2 * f - 1 - f % 2) / (2. * f) 51 | for i in range(w.size(2)): 52 | for j in range(w.size(3)): 53 | w[0, 0, i, j] = \ 54 | (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) 55 | for c in range(1, w.size(0)): 56 | w[c, 0, :, :] = w[0, 0, :, :] 57 | 58 | 59 | class IDAUpV3(nn.Module): 60 | # bilinear upsampling version of IDA 61 | def __init__(self, o, channels, up_f, node_type=(DeformConv, DeformConv)): 62 | super(IDAUpV3, self).__init__() 63 | self.up = nn.UpsamplingBilinear2d(scale_factor=2) # no params 64 | 65 | for i in range(0, len(channels)): 66 | c = channels[i] 67 | if i == 0: 68 | node = node_type[1](c, o) 69 | else: 70 | node = node_type[1](c, c) 71 | setattr(self, 'node_' + str(i), node) 72 | 73 | def forward(self, layers, startp, endp): 74 | for i in range(endp-1, startp, -1): 75 | upsample = self.up 76 | layers[i] = upsample(layers[i]) # ch 256-> 256 77 | node = getattr(self, 'node_' + str(i)) 78 | layers[i-1] = node(layers[i] + layers[i - 1]) 79 | layers[startp] = self.up(layers[startp]) # 256=>256 80 | node = getattr(self, 'node_' + str(startp)) 81 | layers[startp] = node(layers[startp]) 82 | return [layers[startp]] 83 | 84 | 85 | class Interpolate(nn.Module): 86 | def __init__(self, scale, mode): 87 | super(Interpolate, self).__init__() 88 | self.scale = scale 89 | self.mode = mode 90 | 91 | def forward(self, x): 92 | x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False) 93 | return x 94 | -------------------------------------------------------------------------------- /models/d2_p3aformer/transcenter_position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Various positional encodings for the transformer. 
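A small aside on the `fill_up_weights` helper in transcenter_dla.py above: it fills a transposed-convolution kernel with the classic bilinear-interpolation weights, shared across channels, so the layer starts out as a plain 2x bilinear upsampler. A minimal sketch of that effect, assuming only PyTorch is installed; the function is copied here because importing the module itself requires the compiled DCNv2 extension (`from dcn_v2 import DCN`):

import math
import torch
import torch.nn as nn

def fill_up_weights(up):
    # same logic as in transcenter_dla.py: bilinear kernel written into channel 0, then copied
    w = up.weight.data
    f = math.ceil(w.size(2) / 2)
    c = (2 * f - 1 - f % 2) / (2.0 * f)
    for i in range(w.size(2)):
        for j in range(w.size(3)):
            w[0, 0, i, j] = (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
    for ch in range(1, w.size(0)):
        w[ch, 0, :, :] = w[0, 0, :, :]

up = nn.ConvTranspose2d(1, 1, kernel_size=4, stride=2, padding=1, bias=False)
fill_up_weights(up)
print(up.weight.data[0, 0])   # outer product of [0.25, 0.75, 0.75, 0.25]: a 4x4 bilinear kernel
x = torch.arange(16.0).reshape(1, 1, 4, 4)
print(up(x).shape)            # torch.Size([1, 1, 8, 8]): 2x bilinear upsampling
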
14 | """ 15 | import math 16 | import torch 17 | from torch import nn 18 | 19 | from util.misc import NestedTensor 20 | 21 | 22 | class PositionEmbeddingSine(nn.Module): 23 | """ 24 | This is a more standard version of the position embedding, very similar to the one 25 | used by the Attention is all you need paper, generalized to work on images. 26 | """ 27 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 28 | super().__init__() 29 | self.num_pos_feats = num_pos_feats 30 | self.temperature = temperature 31 | self.normalize = normalize 32 | if scale is not None and normalize is False: 33 | raise ValueError("normalize should be True if scale is passed") 34 | if scale is None: 35 | scale = 2 * math.pi 36 | self.scale = scale 37 | 38 | def forward(self, tensor_list: NestedTensor): 39 | x = tensor_list.tensors 40 | mask = tensor_list.mask 41 | assert mask is not None 42 | not_mask = ~mask 43 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 44 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 45 | if self.normalize: 46 | eps = 1e-6 47 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 48 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 49 | 50 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 51 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 52 | 53 | pos_x = x_embed[:, :, :, None] / dim_t 54 | pos_y = y_embed[:, :, :, None] / dim_t 55 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 56 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 57 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 58 | return pos 59 | 60 | 61 | class PositionEmbeddingLearned(nn.Module): 62 | """ 63 | Absolute pos embedding, learned. 
64 | """ 65 | def __init__(self, num_pos_feats=256): 66 | super().__init__() 67 | self.row_embed = nn.Embedding(50, num_pos_feats) 68 | self.col_embed = nn.Embedding(50, num_pos_feats) 69 | self.reset_parameters() 70 | 71 | def reset_parameters(self): 72 | nn.init.uniform_(self.row_embed.weight) 73 | nn.init.uniform_(self.col_embed.weight) 74 | 75 | def forward(self, tensor_list: NestedTensor): 76 | x = tensor_list.tensors 77 | h, w = x.shape[-2:] 78 | i = torch.arange(w, device=x.device) 79 | j = torch.arange(h, device=x.device) 80 | x_emb = self.col_embed(i) 81 | y_emb = self.row_embed(j) 82 | pos = torch.cat([ 83 | x_emb.unsqueeze(0).repeat(h, 1, 1), 84 | y_emb.unsqueeze(1).repeat(1, w, 1), 85 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 86 | return pos 87 | 88 | 89 | def build_position_encoding(cfg): 90 | N_steps = cfg.MODEL.DENSETRACK.HIDDEN_DIM // 2 91 | if cfg.MODEL.DENSETRACK.POSITION_EMBEDDING in ('v2', 'sine'): 92 | # TODO find a better way of exposing other arguments 93 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 94 | elif cfg.MODEL.DENSETRACK.POSITION_EMBEDDING in ('v3', 'learned'): 95 | position_embedding = PositionEmbeddingLearned(N_steps) 96 | else: 97 | raise ValueError(f"not supported {cfg.MODEL.DENSETRACK.POSITION_EMBEDDING}.") 98 | 99 | return position_embedding 100 | -------------------------------------------------------------------------------- /models/d2_p3aformer/transcenter_post_processing/post_process.py: -------------------------------------------------------------------------------- 1 | ## TransCenter: Transformers with Dense Queries for Multiple-Object Tracking 2 | ## Copyright Inria 3 | ## Year 2021 4 | ## Contact : yihong.xu@inria.fr 5 | ## 6 | ## TransCenter is free software: you can redistribute it and/or modify 7 | ## it under the terms of the GNU General Public License as published by 8 | ## the Free Software Foundation, either version 3 of the License, or 9 | ## (at your option) any later version. 10 | 11 | ## TransCenter is distributed in the hope that it will be useful, 12 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ## GNU General Public License for more details. 15 | ## 16 | ## You should have received a copy of the GNU General Public License 17 | ## along with this program, TransCenter. If not, see and the LICENSE file. 18 | ## 19 | ## 20 | ## TransCenter has code derived from 21 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 22 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 23 | ## (3) 2020 Facebook. (Apache License Version 2.0: https://github.com/facebookresearch/detr/) 24 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 25 | ## 26 | ## TransCenter uses packages from 27 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 28 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 29 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 30 | ## (4) 2018 Tak-Wai Hui. 
(Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 31 | from __future__ import absolute_import 32 | from __future__ import division 33 | from __future__ import print_function 34 | 35 | import numpy as np 36 | import cv2 37 | import pdb 38 | from util.image import transform_preds_with_trans, get_affine_transform 39 | 40 | 41 | def get_alpha(rot): 42 | # output: (B, 8) [bin1_cls[0], bin1_cls[1], bin1_sin, bin1_cos, 43 | # bin2_cls[0], bin2_cls[1], bin2_sin, bin2_cos] 44 | # return rot[:, 0] 45 | idx = rot[:, 1] > rot[:, 5] 46 | alpha1 = np.arctan2(rot[:, 2], rot[:, 3]) + (-0.5 * np.pi) 47 | alpha2 = np.arctan2(rot[:, 6], rot[:, 7]) + (0.5 * np.pi) 48 | return alpha1 * idx + alpha2 * (1 - idx) 49 | 50 | 51 | def generic_post_process(dets, c, s, h, w, filter_by_scores=0.3): 52 | if not ("scores" in dets): 53 | return [{}], [{}] 54 | ret = [] 55 | 56 | for i in range(len(dets["scores"])): 57 | preds = [] 58 | trans = get_affine_transform(c[i], s[i], 0, (w, h), inv=1).astype(np.float32) 59 | for j in range(len(dets["scores"][i])): 60 | if dets["scores"][i][j] < filter_by_scores: 61 | break 62 | 63 | item = {} 64 | item["score"] = dets["scores"][i][j] 65 | item["class"] = int(dets["clses"][i][j]) + 1 66 | item["ct"] = transform_preds_with_trans( 67 | (dets["cts"][i][j]).reshape(1, 2), trans 68 | ).reshape(2) 69 | 70 | if "tracking" in dets: 71 | # displacement to original image space 72 | tracking = transform_preds_with_trans( 73 | (dets["tracking"][i][j] + dets["cts"][i][j]).reshape(1, 2), trans 74 | ).reshape(2) 75 | item["tracking"] = ( 76 | tracking - item["ct"] 77 | ) # ct in the ct int in original image plan 78 | item["pre_cts"] = tracking 79 | 80 | if "bboxes" in dets: 81 | bbox = transform_preds_with_trans( 82 | dets["bboxes"][i][j].reshape(2, 2), trans 83 | ).reshape(4) 84 | item["bbox"] = bbox 85 | 86 | preds.append(item) 87 | ret.append(preds) 88 | return ret 89 | -------------------------------------------------------------------------------- /models/d2_p3aformer/transcenter_post_processing/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import torch 6 | import torch.nn as nn 7 | from util.image import gaussian_radius 8 | import math 9 | import numpy as np 10 | 11 | def _sigmoid(x): 12 | y = torch.clamp(x.sigmoid_(), min=1e-4, max=1-1e-4) 13 | return y 14 | 15 | def _sigmoid12(x): 16 | y = torch.clamp(x.sigmoid_(), 1e-12) 17 | return y 18 | 19 | def _gather_feat(feat, ind): 20 | dim = feat.size(2) 21 | ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) 22 | feat = feat.gather(1, ind) 23 | return feat 24 | 25 | def _tranpose_and_gather_feat(feat, ind): 26 | feat = feat.permute(0, 2, 3, 1).contiguous() 27 | feat = feat.view(feat.size(0), -1, feat.size(3)) 28 | feat = _gather_feat(feat, ind) 29 | return feat 30 | 31 | def flip_tensor(x): 32 | return torch.flip(x, [3]) 33 | # tmp = x.detach().cpu().numpy()[..., ::-1].copy() 34 | # return torch.from_numpy(tmp).to(x.device) 35 | 36 | def flip_lr(x, flip_idx): 37 | tmp = x.detach().cpu().numpy()[..., ::-1].copy() 38 | shape = tmp.shape 39 | for e in flip_idx: 40 | tmp[:, e[0], ...], tmp[:, e[1], ...] 
= \ 41 | tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() 42 | return torch.from_numpy(tmp.reshape(shape)).to(x.device) 43 | 44 | def flip_lr_off(x, flip_idx): 45 | tmp = x.detach().cpu().numpy()[..., ::-1].copy() 46 | shape = tmp.shape 47 | tmp = tmp.reshape(tmp.shape[0], 17, 2, 48 | tmp.shape[2], tmp.shape[3]) 49 | tmp[:, :, 0, :, :] *= -1 50 | for e in flip_idx: 51 | tmp[:, e[0], ...], tmp[:, e[1], ...] = \ 52 | tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() 53 | return torch.from_numpy(tmp.reshape(shape)).to(x.device) 54 | 55 | def _nms(heat, kernel=3): 56 | pad = (kernel - 1) // 2 57 | 58 | hmax = nn.functional.max_pool2d( 59 | heat, (kernel, kernel), stride=1, padding=pad) 60 | keep = (hmax == heat).float() 61 | return heat * keep 62 | 63 | def _topk_channel(scores, K=100): 64 | batch, cat, height, width = scores.size() 65 | 66 | topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) 67 | 68 | topk_inds = topk_inds % (height * width) 69 | topk_ys = (topk_inds / width).int().float() 70 | topk_xs = (topk_inds % width).int().float() 71 | 72 | return topk_scores, topk_inds, topk_ys, topk_xs 73 | 74 | def _topk(scores, K=100): 75 | batch, cat, height, width = scores.size() 76 | 77 | topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) 78 | 79 | topk_inds = topk_inds % (height * width) 80 | topk_ys = (topk_inds / width).int().float() 81 | topk_xs = (topk_inds % width).int().float() 82 | 83 | topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) 84 | topk_clses = (topk_ind / K).int() 85 | topk_inds = _gather_feat( 86 | topk_inds.view(batch, -1, 1), topk_ind).view(batch, K) 87 | topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K) 88 | topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K) 89 | 90 | return topk_score, topk_inds, topk_clses, topk_ys, topk_xs 91 | -------------------------------------------------------------------------------- /models/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /models/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
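Since the `_nms` / `_topk` helpers above are the heart of the center-heatmap decoding (a 3x3 max-pool keeps only local maxima, then `torch.topk` over the flattened map returns the K best peaks with their row/column indices), a short self-contained sketch may help. The heatmap size and K are illustrative, and `_nms` is inlined so the snippet runs without the rest of the repository:

import torch
import torch.nn as nn

def _nms(heat, kernel=3):
    # keep only local maxima: a pixel survives iff it equals the 3x3 max around it
    pad = (kernel - 1) // 2
    hmax = nn.functional.max_pool2d(heat, (kernel, kernel), stride=1, padding=pad)
    return heat * (hmax == heat).float()

heat = torch.rand(2, 1, 152, 272)         # (batch, num_classes, H, W) center heatmap, toy values
peaks = _nms(heat)
batch, cat, height, width = peaks.size()
scores, inds = torch.topk(peaks.view(batch, cat, -1), k=100)
ys, xs = inds // width, inds % width      # integer center coordinates, as _topk computes them
print(scores.shape, ys.shape, xs.shape)   # each torch.Size([2, 1, 100])
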
6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | import MultiScaleDeformableAttention as MSDA 22 | 23 | 24 | class MSDeformAttnFunction(Function): 25 | @staticmethod 26 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 27 | ctx.im2col_step = im2col_step 28 | output = MSDA.ms_deform_attn_forward( 29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 31 | return output 32 | 33 | @staticmethod 34 | @once_differentiable 35 | def backward(ctx, grad_output): 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 37 | grad_value, grad_sampling_loc, grad_attn_weight = \ 38 | MSDA.ms_deform_attn_backward( 39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 40 | 41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 42 | 43 | 44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 45 | # for debug and test only, 46 | # need to use cuda version instead 47 | N_, S_, M_, D_ = value.shape 48 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 49 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 50 | sampling_grids = 2 * sampling_locations - 1 51 | sampling_value_list = [] 52 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 53 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 54 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 55 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 56 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 57 | # N_*M_, D_, Lq_, P_ 58 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 59 | mode='bilinear', padding_mode='zeros', align_corners=False) 60 | sampling_value_list.append(sampling_value_l_) 61 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 62 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 63 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 64 | return output.transpose(1, 2).contiguous() 65 | -------------------------------------------------------------------------------- /models/ops/make.sh: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
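The pure-PyTorch fallback `ms_deform_attn_core_pytorch` above is the easiest way to sanity-check tensor shapes, since it needs no compiled extension. A minimal sketch with toy sizes mirroring the test script later in this listing; note that importing this module also executes `import MultiScaleDeformableAttention`, so either build the ops first or copy the function into a standalone script:

import torch
# requires the built ops for the module import; alternatively copy the function out of the file above
from models.ops.functions.ms_deform_attn_func import ms_deform_attn_core_pytorch

N, M, D = 1, 2, 4                                    # batch, heads, channels per head
Lq, L, P = 3, 2, 2                                   # queries, feature levels, points per level
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)
S = int(sum(h * w for h, w in shapes))               # total number of value tokens over all levels

value = torch.rand(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)   # normalized to [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P)
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)

out = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
print(out.shape)                                     # torch.Size([1, 3, 8]) == (N, Lq, M*D)
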
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | python setup.py build install 10 | -------------------------------------------------------------------------------- /models/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /models/ops/server_make.sh: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | rlaunch --cpu 8 --gpu 8 --memory 100000 --charged-group v_detection \ 9 | --priority Medium --preemptible no \ 10 | -- python setup.py build install 11 | -------------------------------------------------------------------------------- /models/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('CUDA is not available') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | 13 | #include <ATen/ATen.h> 14 | #include <ATen/cuda/CUDAContext.h> 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implemented on the CPU"); 27 | } 28 | 29 | std::vector<at::Tensor> 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector<at::Tensor> 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /models/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector<at::Tensor> 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /models/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime.
All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /models/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, 
shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /models/p3aformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/models/p3aformer/__init__.py -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_dla.py: -------------------------------------------------------------------------------- 1 | ## TransCenter has code derived from 2 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 3 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 4 | ## (3) 2020 Facebook. (Apache License Version 2.0: https://github.com/facebookresearch/detr/) 5 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 6 | ## 7 | ## TransCenter uses packages from 8 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 9 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 10 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 11 | ## (4) 2018 Tak-Wai Hui. 
(Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 12 | import torch.nn as nn 13 | from dcn_v2 import DCN 14 | import math 15 | import numpy as np 16 | import torch.nn.functional as F 17 | 18 | BN_MOMENTUM = 0.1 19 | 20 | 21 | class DeformConv(nn.Module): 22 | def __init__(self, chi, cho): 23 | super(DeformConv, self).__init__() 24 | self.actf = nn.Sequential( 25 | nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), 26 | nn.ReLU(inplace=True) 27 | ) 28 | self.conv = DCN(chi, cho, kernel_size=(3,3), stride=1, padding=1, dilation=1, deformable_groups=1) 29 | 30 | def forward(self, x): 31 | x = self.conv(x) 32 | x = self.actf(x) 33 | return x 34 | 35 | DLA_NODE = { 36 | 'dcn': (DeformConv, DeformConv) 37 | } 38 | 39 | 40 | def fill_fc_weights(layers): 41 | for m in layers.modules(): 42 | if isinstance(m, nn.Conv2d): 43 | if m.bias is not None: 44 | nn.init.constant_(m.bias, 0) 45 | 46 | 47 | def fill_up_weights(up): 48 | w = up.weight.data 49 | f = math.ceil(w.size(2) / 2) 50 | c = (2 * f - 1 - f % 2) / (2. * f) 51 | for i in range(w.size(2)): 52 | for j in range(w.size(3)): 53 | w[0, 0, i, j] = \ 54 | (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) 55 | for c in range(1, w.size(0)): 56 | w[c, 0, :, :] = w[0, 0, :, :] 57 | 58 | 59 | class IDAUpV3(nn.Module): 60 | # bilinear upsampling version of IDA 61 | def __init__(self, o, channels, up_f, node_type=(DeformConv, DeformConv)): 62 | super(IDAUpV3, self).__init__() 63 | self.up = nn.UpsamplingBilinear2d(scale_factor=2) # no params 64 | 65 | for i in range(0, len(channels)): 66 | c = channels[i] 67 | if i == 0: 68 | node = node_type[1](c, o) 69 | else: 70 | node = node_type[1](c, c) 71 | setattr(self, 'node_' + str(i), node) 72 | 73 | def forward(self, layers, startp, endp): 74 | for i in range(endp-1, startp, -1): 75 | upsample = self.up 76 | layers[i] = upsample(layers[i]) # ch 256-> 256 77 | node = getattr(self, 'node_' + str(i)) 78 | layers[i-1] = node(layers[i] + layers[i - 1]) 79 | layers[startp] = self.up(layers[startp]) # 256=>256 80 | node = getattr(self, 'node_' + str(startp)) 81 | layers[startp] = node(layers[startp]) 82 | return [layers[startp]] 83 | 84 | 85 | class Interpolate(nn.Module): 86 | def __init__(self, scale, mode): 87 | super(Interpolate, self).__init__() 88 | self.scale = scale 89 | self.mode = mode 90 | 91 | def forward(self, x): 92 | x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False) 93 | return x 94 | -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_liteflownet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/models/p3aformer/p3aformer_liteflownet/__init__.py -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_liteflownet/correlation_package/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/models/p3aformer/p3aformer_liteflownet/correlation_package/__init__.py -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_liteflownet/correlation_package/correlation.py: 
-------------------------------------------------------------------------------- 1 | ## TransCenter: Transformers with Dense Queries for Multiple-Object Tracking 2 | ## Copyright Inria 3 | ## Year 2021 4 | ## Contact : yihong.xu@inria.fr 5 | ## 6 | ## TransCenter is free software: you can redistribute it and/or modify 7 | ## it under the terms of the GNU General Public License as published by 8 | ## the Free Software Foundation, either version 3 of the License, or 9 | ## (at your option) any later version. 10 | 11 | ## TransCenter is distributed in the hope that it will be useful, 12 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ## GNU General Public License for more details. 15 | ## 16 | ## You should have received a copy of the GNU General Public License 17 | ## along with this program, TransCenter. If not, see and the LICENSE file. 18 | ## 19 | ## 20 | ## TransCenter has code derived from 21 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 22 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 23 | ## (3) 2020 Facebook. (Apache License Version 2.0: https://github.com/facebookresearch/detr/) 24 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 25 | ## 26 | ## TransCenter uses packages from 27 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 28 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 29 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 30 | ## (4) 2018 Tak-Wai Hui. 
(Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 31 | import torch 32 | from torch.nn.modules.module import Module 33 | from torch.autograd import Function 34 | import correlation_cuda 35 | 36 | class CorrelationFunction(Function): 37 | 38 | # def __init__(self, pad_size=3, kernel_size=3, max_displacement=20, stride1=1, stride2=2, corr_multiply=1): 39 | # super(CorrelationFunction, self).__init__() 40 | # self.pad_size = pad_size 41 | # self.kernel_size = kernel_size 42 | # self.max_displacement = max_displacement 43 | # self.stride1 = stride1 44 | # self.stride2 = stride2 45 | # self.corr_multiply = corr_multiply 46 | # # self.out_channel = ((max_displacement/stride2)*2 + 1) * ((max_displacement/stride2)*2 + 1) 47 | 48 | @staticmethod 49 | def forward(ctx, input1, input2, pad_size, kernel_size, max_displacement,stride1, stride2, corr_multiply): 50 | ctx.save_for_backward(input1, input2) 51 | ctx.pad_size = pad_size 52 | ctx.kernel_size = kernel_size 53 | ctx.max_displacement = max_displacement 54 | ctx.stride1 = stride1 55 | ctx.stride2 = stride2 56 | ctx.corr_multiply = corr_multiply 57 | 58 | with torch.cuda.device_of(input1): 59 | rbot1 = input1.new() 60 | rbot2 = input2.new() 61 | output = input1.new() 62 | 63 | correlation_cuda.forward(input1, input2, rbot1, rbot2, output, 64 | pad_size, kernel_size, max_displacement, stride1, stride2, corr_multiply) 65 | 66 | return output 67 | 68 | @staticmethod 69 | def backward(ctx, grad_output): 70 | input1, input2 = ctx.saved_tensors 71 | 72 | with torch.cuda.device_of(input1): 73 | rbot1 = input1.new() 74 | rbot2 = input2.new() 75 | 76 | grad_input1 = input1.new() 77 | grad_input2 = input2.new() 78 | 79 | correlation_cuda.backward(input1, input2, rbot1, rbot2, grad_output, grad_input1, grad_input2, 80 | ctx.pad_size, ctx.kernel_size, ctx.max_displacement, ctx.stride1, ctx.stride2, ctx.corr_multiply) 81 | 82 | return grad_input1, grad_input2, None, None, None, None, None, None 83 | 84 | 85 | class Correlation(Module): 86 | def __init__(self, pad_size=0, kernel_size=0, max_displacement=0, stride1=1, stride2=2, corr_multiply=1): 87 | super(Correlation, self).__init__() 88 | self.pad_size = pad_size 89 | self.kernel_size = kernel_size 90 | self.max_displacement = max_displacement 91 | self.stride1 = stride1 92 | self.stride2 = stride2 93 | self.corr_multiply = corr_multiply 94 | 95 | # @staticmethod 96 | def forward(self, input1, input2): 97 | 98 | input1 = input1.contiguous() 99 | input2 = input2.contiguous() 100 | # result = CorrelationFunction(self.pad_size, self.kernel_size, self.max_displacement,self.stride1, self.stride2, self.corr_multiply)(input1, input2) 101 | result = CorrelationFunction.apply(input1, input2, self.pad_size, self.kernel_size, self.max_displacement,self.stride1, self.stride2, self.corr_multiply) 102 | 103 | return result 104 | 105 | -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_liteflownet/correlation_package/correlation_cuda_kernel.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | int correlation_forward_cuda_kernel(at::Tensor& output, 8 | int ob, 9 | int oc, 10 | int oh, 11 | int ow, 12 | int osb, 13 | int osc, 14 | int osh, 15 | int osw, 16 | 17 | at::Tensor& input1, 18 | int ic, 19 | int ih, 20 | int iw, 21 | int isb, 22 | int isc, 23 | int ish, 24 | int isw, 25 | 26 | at::Tensor& input2, 27 | int gc, 28 | int gsb, 29 
| int gsc, 30 | int gsh, 31 | int gsw, 32 | 33 | at::Tensor& rInput1, 34 | at::Tensor& rInput2, 35 | int pad_size, 36 | int kernel_size, 37 | int max_displacement, 38 | int stride1, 39 | int stride2, 40 | int corr_type_multiply, 41 | cudaStream_t stream); 42 | 43 | 44 | int correlation_backward_cuda_kernel( 45 | at::Tensor& gradOutput, 46 | int gob, 47 | int goc, 48 | int goh, 49 | int gow, 50 | int gosb, 51 | int gosc, 52 | int gosh, 53 | int gosw, 54 | 55 | at::Tensor& input1, 56 | int ic, 57 | int ih, 58 | int iw, 59 | int isb, 60 | int isc, 61 | int ish, 62 | int isw, 63 | 64 | at::Tensor& input2, 65 | int gsb, 66 | int gsc, 67 | int gsh, 68 | int gsw, 69 | 70 | at::Tensor& gradInput1, 71 | int gisb, 72 | int gisc, 73 | int gish, 74 | int gisw, 75 | 76 | at::Tensor& gradInput2, 77 | int ggc, 78 | int ggsb, 79 | int ggsc, 80 | int ggsh, 81 | int ggsw, 82 | 83 | at::Tensor& rInput1, 84 | at::Tensor& rInput2, 85 | int pad_size, 86 | int kernel_size, 87 | int max_displacement, 88 | int stride1, 89 | int stride2, 90 | int corr_type_multiply, 91 | cudaStream_t stream); 92 | -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_liteflownet/correlation_package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | # Minimum requirements for the build system to execute. 3 | requires = ["setuptools", "wheel", "numpy", "torch"] # PEP 508 specifications. 4 | -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_liteflownet/correlation_package/setup.py: -------------------------------------------------------------------------------- 1 | ## TransCenter: Transformers with Dense Queries for Multiple-Object Tracking 2 | ## Copyright Inria 3 | ## Year 2021 4 | ## Contact : yihong.xu@inria.fr 5 | ## 6 | ## TransCenter is free software: you can redistribute it and/or modify 7 | ## it under the terms of the GNU General Public License as published by 8 | ## the Free Software Foundation, either version 3 of the License, or 9 | ## (at your option) any later version. 10 | 11 | ## TransCenter is distributed in the hope that it will be useful, 12 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ## GNU General Public License for more details. 15 | ## 16 | ## You should have received a copy of the GNU General Public License 17 | ## along with this program, TransCenter. If not, see and the LICENSE file. 18 | ## 19 | ## 20 | ## TransCenter has code derived from 21 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 22 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 23 | ## (3) 2020 Facebook. (Apache License Version 2.0: https://github.com/facebookresearch/detr/) 24 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 25 | ## 26 | ## TransCenter uses packages from 27 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 28 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 29 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 30 | ## (4) 2018 Tak-Wai Hui. 
(Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 31 | #!/usr/bin/env python3 32 | import os 33 | import torch 34 | 35 | from setuptools import setup, find_packages 36 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 37 | 38 | cxx_args = ['-std=c++14'] 39 | 40 | nvcc_args = [ 41 | '-gencode', 'arch=compute_50,code=sm_50', 42 | '-gencode', 'arch=compute_52,code=sm_52', 43 | '-gencode', 'arch=compute_60,code=sm_60', 44 | '-gencode', 'arch=compute_61,code=sm_61', 45 | '-gencode', 'arch=compute_70,code=sm_70', 46 | '-gencode', 'arch=compute_70,code=compute_70', 47 | '-gencode', 'arch=compute_75,code=compute_75', 48 | '-gencode', 'arch=compute_80,code=compute_80', 49 | '-gencode', 'arch=compute_86,code=compute_86', 50 | 51 | ] 52 | 53 | setup( 54 | name='correlation_cuda', 55 | ext_modules=[ 56 | CUDAExtension('correlation_cuda', [ 57 | 'correlation_cuda.cc', 58 | 'correlation_cuda_kernel.cu' 59 | ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args}) 60 | ], 61 | cmdclass={ 62 | 'build_ext': BuildExtension 63 | }) 64 | -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_post_processing/post_process.py: -------------------------------------------------------------------------------- 1 | ## TransCenter: Transformers with Dense Queries for Multiple-Object Tracking 2 | ## Copyright Inria 3 | ## Year 2021 4 | ## Contact : yihong.xu@inria.fr 5 | ## 6 | ## TransCenter is free software: you can redistribute it and/or modify 7 | ## it under the terms of the GNU General Public License as published by 8 | ## the Free Software Foundation, either version 3 of the License, or 9 | ## (at your option) any later version. 10 | 11 | ## TransCenter is distributed in the hope that it will be useful, 12 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ## GNU General Public License for more details. 15 | ## 16 | ## You should have received a copy of the GNU General Public License 17 | ## along with this program, TransCenter. If not, see and the LICENSE file. 18 | ## 19 | ## 20 | ## TransCenter has code derived from 21 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 22 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 23 | ## (3) 2020 Facebook. (Apache License Version 2.0: https://github.com/facebookresearch/detr/) 24 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 25 | ## 26 | ## TransCenter uses packages from 27 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 28 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 29 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 30 | ## (4) 2018 Tak-Wai Hui. 
(Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 31 | from __future__ import absolute_import 32 | from __future__ import division 33 | from __future__ import print_function 34 | 35 | import numpy as np 36 | import cv2 37 | from util.image import transform_preds_with_trans, get_affine_transform 38 | 39 | 40 | def get_alpha(rot): 41 | # output: (B, 8) [bin1_cls[0], bin1_cls[1], bin1_sin, bin1_cos, 42 | # bin2_cls[0], bin2_cls[1], bin2_sin, bin2_cos] 43 | # return rot[:, 0] 44 | idx = rot[:, 1] > rot[:, 5] 45 | alpha1 = np.arctan2(rot[:, 2], rot[:, 3]) + (-0.5 * np.pi) 46 | alpha2 = np.arctan2(rot[:, 6], rot[:, 7]) + ( 0.5 * np.pi) 47 | return alpha1 * idx + alpha2 * (1 - idx) 48 | 49 | 50 | def generic_post_process(opt, dets, c, s, h, w, filter_by_scores=0.3): 51 | if not ('scores' in dets): 52 | return [{}], [{}] 53 | ret = [] 54 | 55 | for i in range(len(dets['scores'])): 56 | preds = [] 57 | trans = get_affine_transform( 58 | c[i], s[i], 0, (w, h), inv=1).astype(np.float32) 59 | for j in range(len(dets['scores'][i])): 60 | if dets['scores'][i][j] < filter_by_scores: 61 | break 62 | 63 | item = {} 64 | item['score'] = dets['scores'][i][j] 65 | item['class'] = int(dets['clses'][i][j]) + 1 66 | item['ct'] = transform_preds_with_trans( 67 | (dets['cts'][i][j]).reshape(1, 2), trans).reshape(2) 68 | 69 | if 'tracking' in dets: 70 | # displacement to original image space 71 | tracking = transform_preds_with_trans( 72 | (dets['tracking'][i][j] + dets['cts'][i][j]).reshape(1, 2), trans).reshape(2) 73 | item['tracking'] = tracking - item['ct'] # ct in the ct int in original image plan 74 | item['pre_cts'] = tracking 75 | 76 | if 'bboxes' in dets: 77 | bbox = transform_preds_with_trans( 78 | dets['bboxes'][i][j].reshape(2, 2), trans).reshape(4) 79 | item['bbox'] = bbox 80 | 81 | preds.append(item) 82 | 83 | ret.append(preds) 84 | 85 | return ret -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_post_processing/utils.py: -------------------------------------------------------------------------------- 1 | ## TransCenter: Transformers with Dense Queries for Multiple-Object Tracking 2 | ## Copyright Inria 3 | ## Year 2021 4 | ## Contact : yihong.xu@inria.fr 5 | ## 6 | ## TransCenter is free software: you can redistribute it and/or modify 7 | ## it under the terms of the GNU General Public License as published by 8 | ## the Free Software Foundation, either version 3 of the License, or 9 | ## (at your option) any later version. 10 | 11 | ## TransCenter is distributed in the hope that it will be useful, 12 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ## GNU General Public License for more details. 15 | ## 16 | ## You should have received a copy of the GNU General Public License 17 | ## along with this program, TransCenter. If not, see and the LICENSE file. 18 | ## 19 | ## 20 | ## TransCenter has code derived from 21 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 22 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 23 | ## (3) 2020 Facebook. 
(Apache License Version 2.0: https://github.com/facebookresearch/detr/) 24 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 25 | ## 26 | ## TransCenter uses packages from 27 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 28 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 29 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 30 | ## (4) 2018 Tak-Wai Hui. (Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 31 | from __future__ import absolute_import 32 | from __future__ import division 33 | from __future__ import print_function 34 | 35 | import torch 36 | import torch.nn as nn 37 | from util.image import gaussian_radius 38 | import math 39 | import numpy as np 40 | 41 | def _sigmoid(x): 42 | y = torch.clamp(x.sigmoid_(), min=1e-4, max=1-1e-4) 43 | return y 44 | 45 | def _sigmoid12(x): 46 | y = torch.clamp(x.sigmoid_(), 1e-12) 47 | return y 48 | 49 | def _gather_feat(feat, ind): 50 | dim = feat.size(2) 51 | ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) 52 | feat = feat.gather(1, ind) 53 | return feat 54 | 55 | def _tranpose_and_gather_feat(feat, ind): 56 | feat = feat.permute(0, 2, 3, 1).contiguous() 57 | feat = feat.view(feat.size(0), -1, feat.size(3)) 58 | feat = _gather_feat(feat, ind) 59 | return feat 60 | 61 | def flip_tensor(x): 62 | return torch.flip(x, [3]) 63 | # tmp = x.detach().cpu().numpy()[..., ::-1].copy() 64 | # return torch.from_numpy(tmp).to(x.device) 65 | 66 | def flip_lr(x, flip_idx): 67 | tmp = x.detach().cpu().numpy()[..., ::-1].copy() 68 | shape = tmp.shape 69 | for e in flip_idx: 70 | tmp[:, e[0], ...], tmp[:, e[1], ...] = \ 71 | tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() 72 | return torch.from_numpy(tmp.reshape(shape)).to(x.device) 73 | 74 | def flip_lr_off(x, flip_idx): 75 | tmp = x.detach().cpu().numpy()[..., ::-1].copy() 76 | shape = tmp.shape 77 | tmp = tmp.reshape(tmp.shape[0], 17, 2, 78 | tmp.shape[2], tmp.shape[3]) 79 | tmp[:, :, 0, :, :] *= -1 80 | for e in flip_idx: 81 | tmp[:, e[0], ...], tmp[:, e[1], ...] 
= \ 82 | tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() 83 | return torch.from_numpy(tmp.reshape(shape)).to(x.device) 84 | 85 | def _nms(heat, kernel=3): 86 | pad = (kernel - 1) // 2 87 | 88 | hmax = nn.functional.max_pool2d( 89 | heat, (kernel, kernel), stride=1, padding=pad) 90 | keep = (hmax == heat).float() 91 | return heat * keep 92 | 93 | def _topk_channel(scores, K=100): 94 | batch, cat, height, width = scores.size() 95 | 96 | topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) 97 | 98 | topk_inds = topk_inds % (height * width) 99 | topk_ys = (topk_inds / width).int().float() 100 | topk_xs = (topk_inds % width).int().float() 101 | 102 | return topk_scores, topk_inds, topk_ys, topk_xs 103 | 104 | def _topk(scores, K=100): 105 | batch, cat, height, width = scores.size() 106 | 107 | topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) 108 | 109 | topk_inds = topk_inds % (height * width) 110 | topk_ys = (topk_inds / width).int().float() 111 | topk_xs = (topk_inds % width).int().float() 112 | 113 | topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) 114 | topk_clses = (topk_ind / K).int() 115 | topk_inds = _gather_feat( 116 | topk_inds.view(batch, -1, 1), topk_ind).view(batch, K) 117 | topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K) 118 | topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K) 119 | 120 | return topk_score, topk_inds, topk_clses, topk_ys, topk_xs 121 | -------------------------------------------------------------------------------- /models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Various positional encodings for the transformer. 14 | """ 15 | import math 16 | import torch 17 | from torch import nn 18 | 19 | from util.misc import NestedTensor 20 | 21 | 22 | class PositionEmbeddingSine(nn.Module): 23 | """ 24 | This is a more standard version of the position embedding, very similar to the one 25 | used by the Attention is all you need paper, generalized to work on images. 
26 | """ 27 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 28 | super().__init__() 29 | self.num_pos_feats = num_pos_feats 30 | self.temperature = temperature 31 | self.normalize = normalize 32 | if scale is not None and normalize is False: 33 | raise ValueError("normalize should be True if scale is passed") 34 | if scale is None: 35 | scale = 2 * math.pi 36 | self.scale = scale 37 | 38 | def forward(self, tensor_list: NestedTensor): 39 | x = tensor_list.tensors 40 | mask = tensor_list.mask 41 | assert mask is not None 42 | not_mask = ~mask 43 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 44 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 45 | if self.normalize: 46 | eps = 1e-6 47 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 48 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 49 | 50 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 51 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 52 | 53 | pos_x = x_embed[:, :, :, None] / dim_t 54 | pos_y = y_embed[:, :, :, None] / dim_t 55 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 56 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 57 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 58 | return pos 59 | 60 | 61 | class PositionEmbeddingLearned(nn.Module): 62 | """ 63 | Absolute pos embedding, learned. 64 | """ 65 | def __init__(self, num_pos_feats=256): 66 | super().__init__() 67 | self.row_embed = nn.Embedding(50, num_pos_feats) 68 | self.col_embed = nn.Embedding(50, num_pos_feats) 69 | self.reset_parameters() 70 | 71 | def reset_parameters(self): 72 | nn.init.uniform_(self.row_embed.weight) 73 | nn.init.uniform_(self.col_embed.weight) 74 | 75 | def forward(self, tensor_list: NestedTensor): 76 | x = tensor_list.tensors 77 | h, w = x.shape[-2:] 78 | i = torch.arange(w, device=x.device) 79 | j = torch.arange(h, device=x.device) 80 | x_emb = self.col_embed(i) 81 | y_emb = self.row_embed(j) 82 | pos = torch.cat([ 83 | x_emb.unsqueeze(0).repeat(h, 1, 1), 84 | y_emb.unsqueeze(1).repeat(1, w, 1), 85 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 86 | return pos 87 | 88 | 89 | def build_position_encoding(args): 90 | N_steps = args.hidden_dim // 2 91 | if args.position_embedding in ('v2', 'sine'): 92 | # TODO find a better way of exposing other arguments 93 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 94 | elif args.position_embedding in ('v3', 'learned'): 95 | position_embedding = PositionEmbeddingLearned(N_steps) 96 | else: 97 | raise ValueError(f"not supported {args.position_embedding}") 98 | 99 | return position_embedding 100 | -------------------------------------------------------------------------------- /models/structures/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Detectron2 (https://github.com/facebookresearch/detectron2) 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 4 | # ------------------------------------------------------------------------ 5 | from .boxes import Boxes, BoxMode, pairwise_iou, pairwise_ioa, matched_boxlist_iou 6 | from .instances import Instances 7 | 8 | __all__ = [k for k in globals().keys() if not k.startswith("_")] -------------------------------------------------------------------------------- /preprocess/convert_cityperson_to_coco.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | from PIL import Image 5 | 6 | DATA_PATH = "data/Cityscapes/" 7 | DATA_FILE_PATH = "data/data_path/citypersons.train" 8 | OUT_PATH = DATA_PATH + "annotations/" 9 | 10 | 11 | def load_paths(data_path): 12 | with open(data_path, "r") as file: 13 | img_files = file.readlines() 14 | img_files = [x.replace("\n", "") for x in img_files] 15 | img_files = list(filter(lambda x: len(x) > 0, img_files)) 16 | label_files = [ 17 | x.replace("images", "labels_with_ids") 18 | .replace(".png", ".txt") 19 | .replace(".jpg", ".txt") 20 | for x in img_files 21 | ] 22 | return img_files, label_files 23 | 24 | 25 | if __name__ == "__main__": 26 | if not os.path.exists(OUT_PATH): 27 | os.mkdir(OUT_PATH) 28 | 29 | out_path = OUT_PATH + "train.json" 30 | out = {"images": [], "annotations": [], "categories": [{"id": 1, "name": "person"}]} 31 | img_paths, label_paths = load_paths(DATA_FILE_PATH) 32 | image_cnt = 0 33 | ann_cnt = 0 34 | video_cnt = 0 35 | for img_path, label_path in zip(img_paths, label_paths): 36 | image_cnt += 1 37 | im = Image.open(os.path.join("data", img_path)) 38 | image_info = { 39 | "file_name": img_path, 40 | "id": image_cnt, 41 | "height": im.size[1], 42 | "width": im.size[0], 43 | } 44 | out["images"].append(image_info) 45 | # Load labels 46 | if os.path.isfile(os.path.join("data", label_path)): 47 | labels0 = np.loadtxt( 48 | os.path.join("data", label_path), dtype=np.float32 49 | ).reshape(-1, 6) 50 | # Normalized xywh to pixel xyxy format 51 | labels = labels0.copy() 52 | labels[:, 2] = image_info["width"] * (labels0[:, 2] - labels0[:, 4] / 2) 53 | labels[:, 3] = image_info["height"] * (labels0[:, 3] - labels0[:, 5] / 2) 54 | labels[:, 4] = image_info["width"] * labels0[:, 4] 55 | labels[:, 5] = image_info["height"] * labels0[:, 5] 56 | else: 57 | labels = np.array([]) 58 | for i in range(len(labels)): 59 | ann_cnt += 1 60 | fbox = labels[i, 2:6].tolist() 61 | ann = { 62 | "id": ann_cnt, 63 | "category_id": 1, 64 | "image_id": image_cnt, 65 | "track_id": -1, 66 | "bbox": fbox, 67 | "area": fbox[2] * fbox[3], 68 | "iscrowd": 0, 69 | } 70 | out["annotations"].append(ann) 71 | print( 72 | "loaded train for {} images and {} samples".format( 73 | len(out["images"]), len(out["annotations"]) 74 | ) 75 | ) 76 | json.dump(out, open(out_path, "w")) 77 | -------------------------------------------------------------------------------- /preprocess/convert_crowdhuman_to_coco.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | from PIL import Image 5 | 6 | DATA_PATH = "data/crowdhuman/" 7 | OUT_PATH = DATA_PATH + "annotations/" 8 | SPLITS = ["val", "train"] 9 | DEBUG = False 10 | 11 | 12 | def load_func(fpath): 13 | print("fpath", fpath) 14 | assert os.path.exists(fpath) 15 | with open(fpath, "r") as fid: 16 | lines = fid.readlines() 17 | records = [json.loads(line.strip("\n")) for line in lines] 18 | return records 19 | 20 | 21 | if __name__ == "__main__": 22 | if 
not os.path.exists(OUT_PATH): 23 | os.mkdir(OUT_PATH) 24 | for split in SPLITS: 25 | data_path = DATA_PATH + split 26 | out_path = OUT_PATH + "{}.json".format(split) 27 | out = { 28 | "images": [], 29 | "annotations": [], 30 | "categories": [{"id": 1, "name": "person"}], 31 | } 32 | ann_path = DATA_PATH + "annotation_{}.odgt".format(split) 33 | anns_data = load_func(ann_path) 34 | image_cnt = 0 35 | ann_cnt = 0 36 | video_cnt = 0 37 | for ann_data in anns_data: 38 | image_cnt += 1 39 | file_path = ( 40 | DATA_PATH 41 | + "CrowdHuman_{}/".format(split) 42 | + "{}.jpg".format(ann_data["ID"]) 43 | ) 44 | im = Image.open(file_path) 45 | image_info = { 46 | "file_name": "{}.jpg".format(ann_data["ID"]), 47 | "id": image_cnt, 48 | "height": im.size[1], 49 | "width": im.size[0], 50 | } 51 | out["images"].append(image_info) 52 | if split != "test": 53 | anns = ann_data["gtboxes"] 54 | for i in range(len(anns)): 55 | ann_cnt += 1 56 | fbox = anns[i]["fbox"] 57 | ann = { 58 | "id": ann_cnt, 59 | "category_id": 1, 60 | "image_id": image_cnt, 61 | "track_id": -1, 62 | "bbox_vis": anns[i]["vbox"], 63 | "bbox": fbox, 64 | "area": fbox[2] * fbox[3], 65 | "iscrowd": 1 66 | if "extra" in anns[i] 67 | and "ignore" in anns[i]["extra"] 68 | and anns[i]["extra"]["ignore"] == 1 69 | else 0, 70 | } 71 | out["annotations"].append(ann) 72 | print( 73 | "loaded {} for {} images and {} samples".format( 74 | split, len(out["images"]), len(out["annotations"]) 75 | ) 76 | ) 77 | json.dump(out, open(out_path, "w")) 78 | -------------------------------------------------------------------------------- /preprocess/convert_ethz_to_coco.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | from PIL import Image 5 | 6 | DATA_PATH = "data/ETHZ/" 7 | DATA_FILE_PATH = "data/data_path/eth.train" 8 | OUT_PATH = DATA_PATH + "annotations/" 9 | 10 | 11 | def load_paths(data_path): 12 | with open(data_path, "r") as file: 13 | img_files = file.readlines() 14 | img_files = [x.replace("\n", "") for x in img_files] 15 | img_files = list(filter(lambda x: len(x) > 0, img_files)) 16 | label_files = [ 17 | x.replace("images", "labels_with_ids") 18 | .replace(".png", ".txt") 19 | .replace(".jpg", ".txt") 20 | for x in img_files 21 | ] 22 | return img_files, label_files 23 | 24 | 25 | if __name__ == "__main__": 26 | if not os.path.exists(OUT_PATH): 27 | os.mkdir(OUT_PATH) 28 | 29 | out_path = OUT_PATH + "train.json" 30 | out = {"images": [], "annotations": [], "categories": [{"id": 1, "name": "person"}]} 31 | img_paths, label_paths = load_paths(DATA_FILE_PATH) 32 | image_cnt = 0 33 | ann_cnt = 0 34 | video_cnt = 0 35 | for img_path, label_path in zip(img_paths, label_paths): 36 | image_cnt += 1 37 | im = Image.open(os.path.join("data", img_path)) 38 | image_info = { 39 | "file_name": img_path, 40 | "id": image_cnt, 41 | "height": im.size[1], 42 | "width": im.size[0], 43 | } 44 | out["images"].append(image_info) 45 | # Load labels 46 | if os.path.isfile(os.path.join("data", label_path)): 47 | labels0 = np.loadtxt( 48 | os.path.join("data", label_path), dtype=np.float32 49 | ).reshape(-1, 6) 50 | # Normalized xywh to pixel xyxy format 51 | labels = labels0.copy() 52 | labels[:, 2] = image_info["width"] * (labels0[:, 2] - labels0[:, 4] / 2) 53 | labels[:, 3] = image_info["height"] * (labels0[:, 3] - labels0[:, 5] / 2) 54 | labels[:, 4] = image_info["width"] * labels0[:, 4] 55 | labels[:, 5] = image_info["height"] * labels0[:, 5] 56 | else: 57 | labels = 
np.array([]) 58 | for i in range(len(labels)): 59 | ann_cnt += 1 60 | fbox = labels[i, 2:6].tolist() 61 | ann = { 62 | "id": ann_cnt, 63 | "category_id": 1, 64 | "image_id": image_cnt, 65 | "track_id": -1, 66 | "bbox": fbox, 67 | "area": fbox[2] * fbox[3], 68 | "iscrowd": 0, 69 | } 70 | out["annotations"].append(ann) 71 | print( 72 | "loaded train for {} images and {} samples".format( 73 | len(out["images"]), len(out["annotations"]) 74 | ) 75 | ) 76 | json.dump(out, open(out_path, "w")) 77 | -------------------------------------------------------------------------------- /preprocess/data_preprocess.sh: -------------------------------------------------------------------------------- 1 | python3 preprocess/convert_mot17_to_coco.py 2 | python3 preprocess/convert_mot20_to_coco.py 3 | python3 preprocess/convert_crowdhuman_to_coco.py 4 | python3 preprocess/convert_cityperson_to_coco.py 5 | python3 preprocess/convert_ethz_to_coco.py 6 | 7 | bash preprocess/make_mixed_dirs.sh 8 | python3 preprocess/mix_data_ablation.py 9 | python3 preprocess/mix_data_test_mot17.py 10 | python3 preprocess/mix_data_test_mot20.py -------------------------------------------------------------------------------- /preprocess/make_mixed_dirs.sh: -------------------------------------------------------------------------------- 1 | cd data 2 | mkdir -p mix_mot_ch/annotations 3 | cp mot/annotations/val_half.json mix_mot_ch/annotations/val_half.json 4 | cp mot/annotations/test.json mix_mot_ch/annotations/test.json 5 | cd mix_mot_ch 6 | ln -s ../mot/train mot_train 7 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 8 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 9 | cd .. 10 | 11 | mkdir -p mix_det/annotations 12 | cp mot/annotations/val_half.json mix_det/annotations/val_half.json 13 | cp mot/annotations/test.json mix_det/annotations/test.json 14 | cd mix_det 15 | ln -s ../mot/train mot_train 16 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 17 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 18 | ln -s ../Cityscapes cp_train 19 | ln -s ../ETHZ ethz_train 20 | cd .. 21 | 22 | mkdir -p mix_mot20_ch/annotations 23 | cp MOT20/annotations/val_half.json mix_mot20_ch/annotations/val_half.json 24 | cp MOT20/annotations/test.json mix_mot20_ch/annotations/test.json 25 | cd mix_mot20_ch 26 | ln -s ../MOT20/train mot20_train 27 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 28 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 29 | cd .. -------------------------------------------------------------------------------- /preprocess/mix_data_ablation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | """ 6 | cd data 7 | mkdir -p mix_mot_ch/annotations 8 | cp mot/annotations/val_half.json mix_mot_ch/annotations/val_half.json 9 | cp mot/annotations/test.json mix_mot_ch/annotations/test.json 10 | cd mix_mot_ch 11 | ln -s ../mot/train mot_train 12 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 13 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 14 | cd .. 
15 | """ 16 | 17 | mot_json = json.load(open("data/mot/annotations/train_half.json", "r")) 18 | 19 | img_list = list() 20 | for img in mot_json["images"]: 21 | img["file_name"] = "mot_train/" + img["file_name"] 22 | img_list.append(img) 23 | 24 | ann_list = list() 25 | for ann in mot_json["annotations"]: 26 | ann_list.append(ann) 27 | 28 | video_list = mot_json["videos"] 29 | category_list = mot_json["categories"] 30 | 31 | print("mot17") 32 | 33 | max_img = 10000 34 | max_ann = 2000000 35 | max_video = 10 36 | 37 | crowdhuman_json = json.load(open("data/crowdhuman/annotations/train.json", "r")) 38 | img_id_count = 0 39 | for img in crowdhuman_json["images"]: 40 | img_id_count += 1 41 | img["file_name"] = "crowdhuman_train/" + img["file_name"] 42 | img["frame_id"] = img_id_count 43 | img["prev_image_id"] = img["id"] + max_img 44 | img["next_image_id"] = img["id"] + max_img 45 | img["id"] = img["id"] + max_img 46 | img["video_id"] = max_video 47 | img_list.append(img) 48 | 49 | for ann in crowdhuman_json["annotations"]: 50 | ann["id"] = ann["id"] + max_ann 51 | ann["image_id"] = ann["image_id"] + max_img 52 | ann_list.append(ann) 53 | 54 | video_list.append({"id": max_video, "file_name": "crowdhuman_train"}) 55 | 56 | print("crowdhuman_train") 57 | 58 | max_img = 30000 59 | max_ann = 10000000 60 | 61 | crowdhuman_val_json = json.load(open("data/crowdhuman/annotations/val.json", "r")) 62 | img_id_count = 0 63 | for img in crowdhuman_val_json["images"]: 64 | img_id_count += 1 65 | img["file_name"] = "crowdhuman_val/" + img["file_name"] 66 | img["frame_id"] = img_id_count 67 | img["prev_image_id"] = img["id"] + max_img 68 | img["next_image_id"] = img["id"] + max_img 69 | img["id"] = img["id"] + max_img 70 | img["video_id"] = max_video 71 | img_list.append(img) 72 | 73 | for ann in crowdhuman_val_json["annotations"]: 74 | ann["id"] = ann["id"] + max_ann 75 | ann["image_id"] = ann["image_id"] + max_img 76 | ann_list.append(ann) 77 | 78 | video_list.append({"id": max_video, "file_name": "crowdhuman_val"}) 79 | 80 | print("crowdhuman_val") 81 | 82 | mix_json = dict() 83 | mix_json["images"] = img_list 84 | mix_json["annotations"] = ann_list 85 | mix_json["videos"] = video_list 86 | mix_json["categories"] = category_list 87 | json.dump(mix_json, open("data/mix_mot_ch/annotations/train.json", "w")) 88 | -------------------------------------------------------------------------------- /preprocess/mix_data_test_mot17.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | mot_json = json.load(open("data/mot/annotations/train_half.json", "r")) 5 | 6 | img_list = list() 7 | for img in mot_json["images"]: 8 | img["file_name"] = "mot_train/" + img["file_name"] 9 | img_list.append(img) 10 | 11 | ann_list = list() 12 | for ann in mot_json["annotations"]: 13 | ann_list.append(ann) 14 | 15 | video_list = mot_json["videos"] 16 | category_list = mot_json["categories"] 17 | 18 | 19 | print("mot17") 20 | 21 | max_img = 10000 22 | max_ann = 2000000 23 | max_video = 10 24 | 25 | crowdhuman_json = json.load(open("data/crowdhuman/annotations/train.json", "r")) 26 | img_id_count = 0 27 | for img in crowdhuman_json["images"]: 28 | img_id_count += 1 29 | img["file_name"] = "crowdhuman_train/" + img["file_name"] 30 | img["frame_id"] = img_id_count 31 | img["prev_image_id"] = img["id"] + max_img 32 | img["next_image_id"] = img["id"] + max_img 33 | img["id"] = img["id"] + max_img 34 | img["video_id"] = max_video 35 | img_list.append(img) 36 | 37 | for ann in 
crowdhuman_json["annotations"]: 38 | ann["id"] = ann["id"] + max_ann 39 | ann["image_id"] = ann["image_id"] + max_img 40 | ann_list.append(ann) 41 | 42 | print("crowdhuman_train") 43 | 44 | video_list.append({"id": max_video, "file_name": "crowdhuman_train"}) 45 | 46 | 47 | max_img = 30000 48 | max_ann = 10000000 49 | 50 | crowdhuman_val_json = json.load(open("data/crowdhuman/annotations/val.json", "r")) 51 | img_id_count = 0 52 | for img in crowdhuman_val_json["images"]: 53 | img_id_count += 1 54 | img["file_name"] = "crowdhuman_val/" + img["file_name"] 55 | img["frame_id"] = img_id_count 56 | img["prev_image_id"] = img["id"] + max_img 57 | img["next_image_id"] = img["id"] + max_img 58 | img["id"] = img["id"] + max_img 59 | img["video_id"] = max_video 60 | img_list.append(img) 61 | 62 | for ann in crowdhuman_val_json["annotations"]: 63 | ann["id"] = ann["id"] + max_ann 64 | ann["image_id"] = ann["image_id"] + max_img 65 | ann_list.append(ann) 66 | 67 | print("crowdhuman_val") 68 | 69 | video_list.append({"id": max_video, "file_name": "crowdhuman_val"}) 70 | 71 | max_img = 40000 72 | max_ann = 20000000 73 | 74 | ethz_json = json.load(open("data/ETHZ/annotations/train.json", "r")) 75 | img_id_count = 0 76 | for img in ethz_json["images"]: 77 | img_id_count += 1 78 | img["file_name"] = "ethz_train/" + img["file_name"][5:] 79 | img["frame_id"] = img_id_count 80 | img["prev_image_id"] = img["id"] + max_img 81 | img["next_image_id"] = img["id"] + max_img 82 | img["id"] = img["id"] + max_img 83 | img["video_id"] = max_video 84 | img_list.append(img) 85 | 86 | for ann in ethz_json["annotations"]: 87 | ann["id"] = ann["id"] + max_ann 88 | ann["image_id"] = ann["image_id"] + max_img 89 | ann_list.append(ann) 90 | 91 | print("ETHZ") 92 | 93 | video_list.append({"id": max_video, "file_name": "ethz"}) 94 | 95 | max_img = 50000 96 | max_ann = 25000000 97 | 98 | cp_json = json.load(open("data/Cityscapes/annotations/train.json", "r")) 99 | img_id_count = 0 100 | for img in cp_json["images"]: 101 | img_id_count += 1 102 | img["file_name"] = "cp_train/" + img["file_name"][11:] 103 | img["frame_id"] = img_id_count 104 | img["prev_image_id"] = img["id"] + max_img 105 | img["next_image_id"] = img["id"] + max_img 106 | img["id"] = img["id"] + max_img 107 | img["video_id"] = max_video 108 | img_list.append(img) 109 | 110 | for ann in cp_json["annotations"]: 111 | ann["id"] = ann["id"] + max_ann 112 | ann["image_id"] = ann["image_id"] + max_img 113 | ann_list.append(ann) 114 | 115 | print("Cityscapes") 116 | 117 | video_list.append({"id": max_video, "file_name": "cityperson"}) 118 | 119 | mix_json = dict() 120 | mix_json["images"] = img_list 121 | mix_json["annotations"] = ann_list 122 | mix_json["videos"] = video_list 123 | mix_json["categories"] = category_list 124 | json.dump(mix_json, open("data/mix_det/annotations/train.json", "w")) 125 | -------------------------------------------------------------------------------- /preprocess/mix_data_test_mot20.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | """ 6 | cd data 7 | mkdir -p mix_mot20_ch/annotations 8 | cp MOT20/annotations/val_half.json mix_mot20_ch/annotations/val_half.json 9 | cp MOT20/annotations/test.json mix_mot20_ch/annotations/test.json 10 | cd mix_mot20_ch 11 | ln -s ../MOT20/train mot20_train 12 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 13 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 14 | cd .. 
15 | """ 16 | 17 | mot_json = json.load(open("data/MOT20/annotations/train.json", "r")) 18 | 19 | img_list = list() 20 | for img in mot_json["images"]: 21 | img["file_name"] = "mot20_train/" + img["file_name"] 22 | img_list.append(img) 23 | 24 | ann_list = list() 25 | for ann in mot_json["annotations"]: 26 | ann_list.append(ann) 27 | 28 | video_list = mot_json["videos"] 29 | category_list = mot_json["categories"] 30 | 31 | 32 | max_img = 10000 33 | max_ann = 2000000 34 | max_video = 10 35 | 36 | crowdhuman_json = json.load(open("data/crowdhuman/annotations/train.json", "r")) 37 | img_id_count = 0 38 | for img in crowdhuman_json["images"]: 39 | img_id_count += 1 40 | img["file_name"] = "crowdhuman_train/" + img["file_name"] 41 | img["frame_id"] = img_id_count 42 | img["prev_image_id"] = img["id"] + max_img 43 | img["next_image_id"] = img["id"] + max_img 44 | img["id"] = img["id"] + max_img 45 | img["video_id"] = max_video 46 | img_list.append(img) 47 | 48 | for ann in crowdhuman_json["annotations"]: 49 | ann["id"] = ann["id"] + max_ann 50 | ann["image_id"] = ann["image_id"] + max_img 51 | ann_list.append(ann) 52 | 53 | video_list.append({"id": max_video, "file_name": "crowdhuman_train"}) 54 | 55 | 56 | max_img = 30000 57 | max_ann = 10000000 58 | 59 | crowdhuman_val_json = json.load(open("data/crowdhuman/annotations/val.json", "r")) 60 | img_id_count = 0 61 | for img in crowdhuman_val_json["images"]: 62 | img_id_count += 1 63 | img["file_name"] = "crowdhuman_val/" + img["file_name"] 64 | img["frame_id"] = img_id_count 65 | img["prev_image_id"] = img["id"] + max_img 66 | img["next_image_id"] = img["id"] + max_img 67 | img["id"] = img["id"] + max_img 68 | img["video_id"] = max_video 69 | img_list.append(img) 70 | 71 | for ann in crowdhuman_val_json["annotations"]: 72 | ann["id"] = ann["id"] + max_ann 73 | ann["image_id"] = ann["image_id"] + max_img 74 | ann_list.append(ann) 75 | 76 | video_list.append({"id": max_video, "file_name": "crowdhuman_val"}) 77 | 78 | mix_json = dict() 79 | mix_json["images"] = img_list 80 | mix_json["annotations"] = ann_list 81 | mix_json["videos"] = video_list 82 | mix_json["categories"] = category_list 83 | json.dump(mix_json, open("data/mix_mot20_ch/annotations/train.json", "w")) 84 | -------------------------------------------------------------------------------- /pretrained: -------------------------------------------------------------------------------- 1 | /data/P3AFormer/pretrained/ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pycocotools 2 | tqdm 3 | cython 4 | scipy 5 | lap 6 | motmetrics -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | -------------------------------------------------------------------------------- /tools/add_train_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | mot_path = "/data/dataset/mot" 5 | sub_dir = 'train' 6 | seq_nums = os.listdir('/data/dataset/mot/train') 7 | accs = [] 8 | seqs = [] 9 | predict_path = "/data/dataset/mot/train_result" 10 | for seq_num in seq_nums: 11 | shutil.copyfile(os.path.join(mot_path, sub_dir, f'{seq_num}/gt/gt.txt'), 12 | os.path.join(predict_path, f'{seq_num}.txt')) 13 | -------------------------------------------------------------------------------- /tools/combine_labels_mot.py: -------------------------------------------------------------------------------- 1 | all_path2labels = {} 2 | print("Trying loading all files ...") 3 | for label_path in dataset_train.label_files: 4 | if osp.isfile(label_path): 5 | labels0 = np.loadtxt(label_path, dtype=np.float32).reshape(-1, 6).tolist() 6 | all_path2labels[label_path] = labels0 7 | else: 8 | raise ValueError('invalid label path: {}'.format(label_path)) 9 | for label_path in dataset_val.label_files: 10 | if osp.isfile(label_path): 11 | labels0 = np.loadtxt(label_path, dtype=np.float32).reshape(-1, 6).tolist() 12 | all_path2labels[label_path] = labels0 13 | else: 14 | raise ValueError('invalid label path: {}'.format(label_path)) 15 | import json 16 | 17 | json.dump(all_path2labels, open("datasets/data_path/mot.json", 'w')) -------------------------------------------------------------------------------- /tools/gen_labels_MOT17.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os.path as osp 16 | import os 17 | import numpy as np 18 | 19 | MOT_data = '/data/dataset/mot' 20 | 21 | # choose a data in ['MOT15', 'MOT16', 'MOT17', 'MOT20'] 22 | # or your custom data (prepare it following the 'docs/tutorials/PrepareMOTDataSet.md') 23 | 24 | 25 | def mkdirs(d): 26 | if not osp.exists(d): 27 | os.makedirs(d) 28 | 29 | 30 | seq_root = '{}/train'.format(MOT_data) 31 | label_root = '{}/labels_with_ids/train'.format(MOT_data) 32 | mkdirs(label_root) 33 | seqs = [s for s in os.listdir(seq_root)] 34 | 35 | tid_curr = 0 36 | tid_last = -1 37 | for seq in seqs: 38 | seq_info = open(osp.join(seq_root, seq, 'seqinfo.ini')).read() 39 | seq_width = int(seq_info[seq_info.find('imWidth=') + 8:seq_info.find( 40 | '\nimHeight')]) 41 | seq_height = int(seq_info[seq_info.find('imHeight=') + 9:seq_info.find( 42 | '\nimExt')]) 43 | 44 | gt_txt = osp.join(seq_root, seq, 'gt', 'gt.txt') 45 | gt = np.loadtxt(gt_txt, dtype=np.float64, delimiter=',') 46 | print(len(gt)) 47 | seq_label_root = osp.join(label_root, seq, 'img1') 48 | mkdirs(seq_label_root) 49 | 50 | for fid, tid, x, y, w, h, mark, label, _ in gt: 51 | if mark == 0 or not label == 1: 52 | continue 53 | fid = int(fid) 54 | tid = int(tid) 55 | if not tid == tid_last: 56 | tid_curr += 1 57 | tid_last = tid 58 | x += w / 2 59 | y += h / 2 60 | label_fpath = osp.join(seq_label_root, '{:06d}.txt'.format(fid)) 61 | label_str = '0 {:d} {:.6f} {:.6f} {:.6f} {:.6f}\n'.format( 62 | tid_curr, x / seq_width, y / seq_height, w / seq_width, 63 | h / seq_height) 64 | with open(label_fpath, 'a') as f: 65 | f.write(label_str) 66 | -------------------------------------------------------------------------------- /tools/gen_labels_mot15.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import os 3 | import numpy as np 4 | 5 | 6 | def mkdirs(d): 7 | if not osp.exists(d): 8 | os.makedirs(d) 9 | 10 | 11 | seq_root = '/data/dataset/MOT15/images/train' 12 | label_root = '/data/dataset/MOT15/labels_with_ids/train' 13 | mkdirs(label_root) 14 | #seqs = [s for s in os.listdir(seq_root)] 15 | seqs = ['ADL-Rundle-6', 'ETH-Bahnhof', 'KITTI-13', 'PETS09-S2L1', 'TUD-Stadtmitte', 'ADL-Rundle-8', 'KITTI-17', 16 | 'ETH-Pedcross2', 'ETH-Sunnyday', 'TUD-Campus', 'Venice-2'] 17 | 18 | tid_curr = 0 19 | tid_last = -1 20 | for seq in seqs: 21 | seq_info = open(osp.join(seq_root, seq, 'seqinfo.ini')).read() 22 | seq_width = int(seq_info[seq_info.find('imWidth=') + 8:seq_info.find('\nimHeight')]) 23 | seq_height = int(seq_info[seq_info.find('imHeight=') + 9:seq_info.find('\nimExt')]) 24 | 25 | gt_txt = osp.join(seq_root, seq, 'gt', 'gt.txt') 26 | gt = np.loadtxt(gt_txt, dtype=np.float64, delimiter=',') 27 | idx = np.lexsort(gt.T[:2, :]) 28 | gt = gt[idx, :] 29 | 30 | seq_label_root = osp.join(label_root, seq, 'img1') 31 | mkdirs(seq_label_root) 32 | 33 | for fid, tid, x, y, w, h, mark, _, _, _ in gt: 34 | if mark == 0: 35 | continue 36 | fid = int(fid) 37 | tid = int(tid) 38 | if not tid == tid_last: 39 | tid_curr += 1 40 | tid_last = tid 41 | x += w / 2 42 | y += h / 2 43 | label_fpath = osp.join(seq_label_root, '{:06d}.txt'.format(fid)) 44 | label_str = '0 {:d} {:.6f} {:.6f} {:.6f} {:.6f}\n'.format( 45 | tid_curr, x / seq_width, y / seq_height, w / seq_width, h / seq_height) 46 | with open(label_fpath, 'a') as f: 47 | f.write(label_str) -------------------------------------------------------------------------------- /tools/run_dist_launch.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 6 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | # ------------------------------------------------------------------------ 8 | # Modified from DETR (https://github.com/facebookresearch/detr) 9 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 10 | # ------------------------------------------------------------------------ 11 | 12 | 13 | set -x 14 | 15 | GPUS=$1 16 | RUN_COMMAND=${@:2} 17 | if [ $GPUS -lt 8 ]; then 18 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 19 | else 20 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 21 | fi 22 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 23 | MASTER_PORT=${MASTER_PORT:-"29500"} 24 | NODE_RANK=${NODE_RANK:-0} 25 | 26 | let "NNODES=GPUS/GPUS_PER_NODE" 27 | 28 | python3 ./tools/launch.py \ 29 | --nnodes ${NNODES} \ 30 | --node_rank ${NODE_RANK} \ 31 | --master_addr ${MASTER_ADDR} \ 32 | --master_port ${MASTER_PORT} \ 33 | --nproc_per_node ${GPUS_PER_NODE} \ 34 | ${RUN_COMMAND} -------------------------------------------------------------------------------- /tools/run_dist_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 6 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | # ------------------------------------------------------------------------ 8 | # Modified from DETR (https://github.com/facebookresearch/detr) 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 10 | # ------------------------------------------------------------------------ 11 | 12 | 13 | set -x 14 | 15 | PARTITION=$1 16 | JOB_NAME=$2 17 | GPUS=$3 18 | RUN_COMMAND=${@:4} 19 | if [ $GPUS -lt 8 ]; then 20 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 21 | else 22 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 23 | fi 24 | CPUS_PER_TASK=${CPUS_PER_TASK:-4} 25 | SRUN_ARGS=${SRUN_ARGS:-""} 26 | 27 | srun -p ${PARTITION} \ 28 | --job-name=${JOB_NAME} \ 29 | --gres=gpu:${GPUS_PER_NODE} \ 30 | --ntasks=${GPUS} \ 31 | --ntasks-per-node=${GPUS_PER_NODE} \ 32 | --cpus-per-task=${CPUS_PER_TASK} \ 33 | --kill-on-bad-exit=1 \ 34 | ${SRUN_ARGS} \ 35 | ${RUN_COMMAND} 36 | 37 | -------------------------------------------------------------------------------- /tools/visualize_validation_gt_mot17.py: -------------------------------------------------------------------------------- 1 | import os 2 | from visualization_tool import Visualizer 3 | import cv2 4 | import pycocotools.coco as coco 5 | import pdb 6 | from collections import defaultdict 7 | 8 | dataset_root = "/data/dataset/mot" 9 | ann_path = os.path.join(dataset_root, "annotations", "val_half.json") 10 | img_dir = os.path.join(dataset_root, "train") 11 | output_path = "/data/cache" 12 | os.makedirs(output_path, exist_ok=True) 13 | 14 | coco_obj = coco.COCO(ann_path) 15 | video_info = coco_obj.dataset["videos"] 16 | VidtoVname = {} 17 | for v_info in video_info: 18 | VidtoVname[v_info["id"]] = v_info["file_name"] 19 | video_to_images = defaultdict(list) 20 | for image in coco_obj.dataset["images"]: 21 | if image["video_id"] not in VidtoVname.keys(): 22 | continue 23 | video_to_images[VidtoVname[image["video_id"]]].append(image) 24 | image_id_to_filename = {} 25 | for one_image in coco_obj.dataset["images"]: 26 | image_id_to_filename[one_image["id"]] = one_image["file_name"] 27 | 28 | image_file_name_to_anns = defaultdict(list) 29 | for anns in coco_obj.dataset["annotations"]: 30 | image_file_name = image_id_to_filename[anns["image_id"]] 31 | image_file_name_to_anns[image_file_name].append(anns) 32 | for video_id in video_to_images: 33 | print(f"Visualizing video: {video_id} ...") 34 | visualizer = Visualizer() 35 | for idx, image_d in enumerate(video_to_images[video_id]): 36 | print(f"Stepping frame {idx} / {len(video_to_images[video_id])} ...") 37 | img_path = os.path.join(img_dir, image_d["file_name"]) 38 | assert os.path.exists(img_path), f"{img_path} does not exist!" 
39 | img = cv2.imread(img_path) 40 | visualizer.add_img(img, img_id=idx) 41 | anns = image_file_name_to_anns[image_d["file_name"]] 42 | for jdx, cur_anns in enumerate(anns): 43 | track_id = cur_anns["track_id"] 44 | bbox = [ 45 | cur_anns["bbox"][0], 46 | cur_anns["bbox"][1], 47 | cur_anns["bbox"][0] + cur_anns["bbox"][2], 48 | cur_anns["bbox"][1] + cur_anns["bbox"][3], 49 | ] 50 | if track_id > 100000: 51 | track_id -= 100000 52 | visualizer.add_coco_bbox(bbox, 0, conf=track_id, add_txt="", img_id=idx) 53 | visualizer.save_video(path=output_path, name=video_id) 54 | 55 | -------------------------------------------------------------------------------- /tracker/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/tracker/.DS_Store -------------------------------------------------------------------------------- /tracker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/tracker/__init__.py -------------------------------------------------------------------------------- /tracker/byte_tracker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/tracker/byte_tracker/__init__.py -------------------------------------------------------------------------------- /tracker/byte_tracker/mot_online/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/tracker/byte_tracker/mot_online/__init__.py -------------------------------------------------------------------------------- /tracker/byte_tracker/mot_online/basetrack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import OrderedDict 3 | 4 | 5 | class TrackState(object): 6 | New = 0 7 | Tracked = 1 8 | Lost = 2 9 | Removed = 3 10 | 11 | 12 | class BaseTrack(object): 13 | _count = 0 14 | 15 | track_id = 0 16 | is_activated = False 17 | state = TrackState.New 18 | 19 | history = OrderedDict() 20 | features = [] 21 | curr_feature = None 22 | score = 0 23 | start_frame = 0 24 | frame_id = 0 25 | time_since_update = 0 26 | 27 | # multi-camera 28 | location = (np.inf, np.inf) 29 | 30 | @property 31 | def end_frame(self): 32 | return self.frame_id 33 | 34 | @staticmethod 35 | def next_id(): 36 | BaseTrack._count += 1 37 | return BaseTrack._count 38 | 39 | def activate(self, *args): 40 | raise NotImplementedError 41 | 42 | def predict(self): 43 | raise NotImplementedError 44 | 45 | def update(self, *args, **kwargs): 46 | raise NotImplementedError 47 | 48 | def mark_lost(self): 49 | self.state = TrackState.Lost 50 | 51 | def mark_removed(self): 52 | self.state = TrackState.Removed 53 | -------------------------------------------------------------------------------- /tracker/common/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/tracker/common/__init__.py -------------------------------------------------------------------------------- /tracker/common/track_structure_transfer.py: -------------------------------------------------------------------------------- 1 | from os import curdir 2 | import pdb 3 | import numpy as np 4 | 5 | def frame_first_to_id_first(frame_first): 6 | """ 7 | Frame first result: {Frame ID: a list of [x1, y1, x2, y2, score, ...]} 8 | Track first result: {Track ID: {Frame ID: [x1, y1, x2, y2, score, ...]}} 9 | """ 10 | results = {} 11 | for frameid, bbs in frame_first.items(): 12 | for one_bb in bbs: 13 | x1, y1, x2, y2, score, cur_id = one_bb[0], one_bb[1], one_bb[2], one_bb[3], one_bb[4], one_bb[5] 14 | if cur_id not in results: 15 | results[cur_id] = {} 16 | results[cur_id][frameid] = np.array([x1, y1, x2, y2, score]) 17 | return results 18 | 19 | 20 | def id_first_to_frame_first(id_first): 21 | """ 22 | Frame first result: {Frame ID: a list of [x1, y1, x2, y2, score, ...]} 23 | Track first result: {Track ID: {Frame ID: [x1, y1, x2, y2, score, ...]}} 24 | """ 25 | results = {} 26 | for i, track in id_first.items(): 27 | for frame, bb in track.items(): 28 | if frame not in results: 29 | results[frame] = [] 30 | x1 = bb[0] 31 | y1 = bb[1] 32 | x2 = bb[2] 33 | y2 = bb[3] 34 | score = bb[4] 35 | results[frame].append([x1, y1, x2, y2, score, i+1]) 36 | return results -------------------------------------------------------------------------------- /tracker/d2_p3aformer/write_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import csv 4 | from shutil import copyfile 5 | 6 | 7 | def write_results(all_tracks, out_dir, seq_name=None, frame_offset=0, verbose=False): 8 | output_dir = out_dir + "/txt/" 9 | """Write the tracks in the format for MOT16/MOT17 submission 10 | all_tracks: dictionary with 1 dictionary for every track with {..., i:np.array([x1,y1,x2,y2]), ...} at key track_num if frame_first=False, 11 | Each file contains these lines: 12 | , , , , , , , , , 13 | """ 14 | # format_str = "{}, -1, {}, {}, {}, {}, {}, -1, -1, -1" 15 | assert seq_name is not None, "[!] No seq_name, probably using combined database" 16 | if not os.path.exists(output_dir): 17 | os.makedirs(output_dir) 18 | save_path = osp.join(output_dir, seq_name + ".txt") 19 | with open(save_path, "w") as of: 20 | writer = csv.writer(of, delimiter=",") 21 | for i in sorted(all_tracks): 22 | track = all_tracks[i] 23 | for frame, bb in track.items(): 24 | x1 = bb[0] 25 | y1 = bb[1] 26 | x2 = bb[2] 27 | y2 = bb[3] 28 | writer.writerow( 29 | [ 30 | frame + frame_offset, 31 | i + 1, 32 | x1 + 1, 33 | y1 + 1, 34 | x2 - x1 + 1, 35 | y2 - y1 + 1, 36 | -1, 37 | -1, 38 | -1, 39 | -1, 40 | ] 41 | ) 42 | # TODO: validate this in MOT15 43 | # copy to FRCNN, DPM.txt, private setting 44 | copyfile(save_path, save_path[:-7] + "FRCNN.txt") 45 | copyfile(save_path, save_path[:-7] + "DPM.txt") 46 | if verbose: 47 | print("Write txt results at: ", save_path, ".") 48 | return save_path 49 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Utilities for bounding box manipulation and GIoU. 14 | """ 15 | import torch 16 | from torchvision.ops.boxes import box_area 17 | 18 | 19 | def box_cxcywh_to_xyxy(x): 20 | x_c, y_c, w, h = x.unbind(-1) 21 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 22 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 23 | return torch.stack(b, dim=-1) 24 | 25 | 26 | def box_xyxy_to_cxcywh(x): 27 | x0, y0, x1, y1 = x.unbind(-1) 28 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 29 | (x1 - x0), (y1 - y0)] 30 | return torch.stack(b, dim=-1) 31 | 32 | 33 | # modified from torchvision to also return the union 34 | def box_iou(boxes1, boxes2): 35 | area1 = box_area(boxes1) 36 | area2 = box_area(boxes2) 37 | 38 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 39 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 40 | 41 | wh = (rb - lt).clamp(min=0) # [N,M,2] 42 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 43 | 44 | union = area1[:, None] + area2 - inter 45 | 46 | iou = inter / union 47 | return iou, union 48 | 49 | 50 | def generalized_box_iou(boxes1, boxes2): 51 | """ 52 | Generalized IoU from https://giou.stanford.edu/ 53 | 54 | The boxes should be in [x0, y0, x1, y1] format 55 | 56 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 57 | and M = len(boxes2) 58 | """ 59 | # degenerate boxes gives inf / nan results 60 | # so do an early check 61 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 62 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 63 | iou, union = box_iou(boxes1, boxes2) 64 | 65 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 66 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 67 | 68 | wh = (rb - lt).clamp(min=0) # [N,M,2] 69 | area = wh[:, :, 0] * wh[:, :, 1] 70 | 71 | return iou - (area - union) / area 72 | 73 | 74 | def masks_to_boxes(masks): 75 | """Compute the bounding boxes around the provided masks 76 | 77 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
78 | 79 | Returns a [N, 4] tensors, with the boxes in xyxy format 80 | """ 81 | if masks.numel() == 0: 82 | return torch.zeros((0, 4), device=masks.device) 83 | 84 | h, w = masks.shape[-2:] 85 | 86 | y = torch.arange(0, h, dtype=torch.float) 87 | x = torch.arange(0, w, dtype=torch.float) 88 | y, x = torch.meshgrid(y, x) 89 | 90 | x_mask = (masks * x.unsqueeze(0)) 91 | x_max = x_mask.flatten(1).max(-1)[0] 92 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 93 | 94 | y_mask = (masks * y.unsqueeze(0)) 95 | y_max = y_mask.flatten(1).max(-1)[0] 96 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 97 | 98 | return torch.stack([x_min, y_min, x_max, y_max], 1) 99 | -------------------------------------------------------------------------------- /util/p3aformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/util/p3aformer/__init__.py -------------------------------------------------------------------------------- /util/system.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import glob 4 | 5 | def remove_files_under_folder(folder, select_str): 6 | files = glob.glob(os.path.join(folder, '*')) 7 | for f in files: 8 | if os.path.isdir(f): 9 | continue 10 | if select_str is not None and select_str in f: 11 | os.remove(f) 12 | return -------------------------------------------------------------------------------- /util/tool.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | import torch 12 | import numpy as np 13 | 14 | 15 | def load_model(model, model_path, optimizer=None, resume=False, 16 | lr=None, lr_step=None): 17 | start_epoch = 0 18 | checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) 19 | print(f'loaded {model_path}') 20 | state_dict = checkpoint['model'] 21 | model_state_dict = model.state_dict() 22 | 23 | # check loaded parameters and created model parameters 24 | msg = 'If you see this, your model does not fully load the ' + \ 25 | 'pre-trained weight. Please make sure ' + \ 26 | 'you set the correct --num_classes for your own dataset.' 27 | for k in state_dict: 28 | if k in model_state_dict: 29 | if state_dict[k].shape != model_state_dict[k].shape: 30 | print('Skip loading parameter {}, required shape{}, ' \ 31 | 'loaded shape{}. 
{}'.format( 32 | k, model_state_dict[k].shape, state_dict[k].shape, msg)) 33 | if 'class_embed' in k: 34 | print("load class_embed: {} shape={}".format(k, state_dict[k].shape)) 35 | if model_state_dict[k].shape[0] == 1: 36 | state_dict[k] = state_dict[k][1:2] 37 | elif model_state_dict[k].shape[0] == 2: 38 | state_dict[k] = state_dict[k][1:3] 39 | elif model_state_dict[k].shape[0] == 3: 40 | state_dict[k] = state_dict[k][1:4] 41 | else: 42 | raise NotImplementedError('invalid shape: {}'.format(model_state_dict[k].shape)) 43 | continue 44 | state_dict[k] = model_state_dict[k] 45 | else: 46 | print('Drop parameter {}.'.format(k) + msg) 47 | for k in model_state_dict: 48 | if not (k in state_dict): 49 | print('No param {}.'.format(k) + msg) 50 | state_dict[k] = model_state_dict[k] 51 | model.load_state_dict(state_dict, strict=False) 52 | 53 | # resume optimizer parameters 54 | if optimizer is not None and resume: 55 | if 'optimizer' in checkpoint: 56 | optimizer.load_state_dict(checkpoint['optimizer']) 57 | start_epoch = checkpoint['epoch'] 58 | start_lr = lr 59 | for step in lr_step: 60 | if start_epoch >= step: 61 | start_lr *= 0.1 62 | for param_group in optimizer.param_groups: 63 | param_group['lr'] = start_lr 64 | print('Resumed optimizer with start lr', start_lr) 65 | else: 66 | print('No optimizer parameters in checkpoint.') 67 | if optimizer is not None: 68 | return model, optimizer, start_epoch 69 | else: 70 | return model 71 | 72 | 73 | 74 | --------------------------------------------------------------------------------
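A minimal sketch of how the heatmap helpers in models/p3aformer/p3aformer_post_processing/utils.py compose into a CenterNet-style decode, assuming dummy tensors in place of real network outputs and that the repository root is on PYTHONPATH:

import torch
from models.p3aformer.p3aformer_post_processing.utils import (
    _sigmoid,
    _nms,
    _topk,
    _tranpose_and_gather_feat,
)

# dummy single-class center heatmap and 2-channel offset map, shape (B, C, H, W)
hm = torch.randn(1, 1, 152, 272)
reg = torch.randn(1, 2, 152, 272)

heat = _sigmoid(hm)                               # clamp logits into (1e-4, 1 - 1e-4)
heat = _nms(heat, kernel=3)                       # keep only local maxima (3x3 max-pool trick)
scores, inds, clses, ys, xs = _topk(heat, K=100)  # top-K peak scores, indices and coordinates
offsets = _tranpose_and_gather_feat(reg, inds)    # (1, 100, 2) regression values gathered at the peaks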
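Similarly, a minimal sketch of how tracker/common/track_structure_transfer.py and tracker/d2_p3aformer/write_results.py fit together, with made-up boxes and placeholder output paths (the sequence name below is only an example):

from tracker.common.track_structure_transfer import frame_first_to_id_first
from tracker.d2_p3aformer.write_results import write_results

# frame-first layout: {frame_id: [[x1, y1, x2, y2, score, track_id], ...]}
frame_first = {
    0: [[100.0, 200.0, 150.0, 320.0, 0.9, 1]],
    1: [[104.0, 202.0, 154.0, 322.0, 0.8, 1]],
}

# track-first layout expected by write_results:
# {track_id: {frame_id: np.array([x1, y1, x2, y2, score])}}
all_tracks = frame_first_to_id_first(frame_first)

# writes <out_dir>/txt/MOT17-02-SDP.txt (plus the FRCNN/DPM copies) in the
# MOTChallenge format: frame, id, bb_left, bb_top, bb_width, bb_height, ...
write_results(all_tracks, out_dir="/tmp/p3aformer_demo", seq_name="MOT17-02-SDP", verbose=True)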