├── .gitignore ├── LICENSE ├── README.md ├── benchmark.py ├── configs ├── detracker_reidV3.yaml ├── mot_detectron2 │ ├── d2_2080ti_p3aformer_train.sh │ ├── d2_2080ti_p3aformer_val.sh │ ├── d2_debug_p3aformer_train.sh │ ├── d2_v100_p3aformer_train.sh │ ├── d2_v100_p3aformer_val.sh │ ├── p3aformer_base.yaml │ ├── p3aformer_big.yaml │ ├── p3aformer_config_init.py │ └── p3aformer_small.yaml ├── reid.yaml └── standard │ ├── 2080ti_debug_train_coco.sh │ ├── 2080ti_debug_train_crowdhuman.sh │ ├── 2080ti_debug_train_mot17.sh │ ├── interpolation.sh │ ├── v100_mot17_coco.sh │ ├── v100_mot17_crowdhuman.sh │ ├── v100_mot17_fine_tune_mot17.sh │ ├── v100_submit_mot17.sh │ ├── v100_test_mot15.sh │ └── v100_test_mot17.sh ├── d2_main.py ├── datasets ├── __init__.py ├── byte_mot_half.py ├── coco.py ├── coco_eval.py ├── coco_panoptic.py ├── d2_p3aformer_dataset │ ├── d2_generic_dataset_val.py │ ├── d2_mot15_val_dataset.py │ ├── d2_mot17_mixed_dataset.py │ └── d2_mot17_val_dataset.py ├── data_path │ ├── bdd100k.train │ ├── bdd100k.val │ ├── crowdhuman.train │ ├── crowdhuman.val │ ├── detmot16.train │ ├── detmot17.train │ ├── gen_bdd100k_mot.py │ ├── gen_labels_15.py │ ├── gen_labels_16.py │ ├── joint.train │ ├── mot16.train │ ├── mot17.train │ └── prepare.py ├── data_prefetcher.py ├── detmot.py ├── joint.py ├── p3aformer_dataset │ ├── __init__.py │ ├── coco.py │ ├── crowdhuman.py │ ├── generic_dataset_test_save_mem.py │ ├── generic_dataset_train.py │ ├── mot15_val_save_mem.py │ ├── mot17_train.py │ ├── mot17_val_save_mem.py │ └── mot20_val_save_mem.py ├── p3aformer_eval.py ├── panoptic_eval.py ├── samplers.py ├── static_detmot.py ├── torchvision_datasets │ ├── __init__.py │ └── coco.py └── transforms.py ├── engine.py ├── eval.py ├── exps ├── figs ├── P3AFormerModel_v12.png ├── model_mind_flow.png └── pixelwise_association_v8.png ├── interpolation.py ├── main.py ├── models ├── __init__.py ├── backbone.py ├── d2_p3aformer │ ├── __init__.py │ ├── d2_p3aformer_model.py │ ├── d2_postprocess.py │ ├── mask2former_modeling │ │ ├── __init__.py │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ └── swin.py │ │ ├── criterion.py │ │ ├── matcher.py │ │ ├── meta_arch │ │ │ ├── __init__.py │ │ │ ├── mask_former_head.py │ │ │ └── per_pixel_baseline.py │ │ ├── pixel_decoder │ │ │ ├── __init__.py │ │ │ ├── fpn.py │ │ │ ├── msdeformattn.py │ │ │ └── ops │ │ │ │ ├── functions │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── make.sh │ │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── setup.py │ │ │ │ ├── src │ │ │ │ ├── cpu │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ ├── cuda │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ ├── ms_deform_attn.h │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ └── transformer_decoder │ │ │ ├── __init__.py │ │ │ ├── mask2former_transformer_decoder.py │ │ │ ├── maskformer_transformer_decoder.py │ │ │ ├── position_encoding.py │ │ │ └── transformer.py │ ├── p3aformer_deformable_transformer.py │ ├── transcenter_backbone.py │ ├── transcenter_dla.py │ ├── transcenter_losses │ │ ├── losses.py │ │ └── utils.py │ ├── transcenter_position_encoding.py │ └── transcenter_post_processing │ │ ├── decode.py │ │ ├── post_process.py │ │ └── utils.py ├── deformable_detr.py ├── deformable_transformer.py ├── deformable_transformer_plus.py ├── matcher.py ├── memory_bank.py ├── motr.py ├── ops │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ ├── 
make.sh │ ├── modules │ │ ├── __init__.py │ │ └── ms_deform_attn.py │ ├── server_make.sh │ ├── setup.py │ ├── src │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn_cpu.h │ │ ├── cuda │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ ├── ms_deform_attn_cuda.h │ │ │ └── ms_deform_im2col_cuda.cuh │ │ ├── ms_deform_attn.h │ │ └── vision.cpp │ └── test.py ├── p3aformer │ ├── __init__.py │ ├── p3aformer.py │ ├── p3aformer_backbone.py │ ├── p3aformer_deformable_transformer.py │ ├── p3aformer_dla.py │ ├── p3aformer_liteflownet │ │ ├── __init__.py │ │ ├── correlation_package │ │ │ ├── __init__.py │ │ │ ├── correlation.py │ │ │ ├── correlation_cuda.cc │ │ │ ├── correlation_cuda_kernel.cu │ │ │ ├── correlation_cuda_kernel.cuh │ │ │ ├── pyproject.toml │ │ │ └── setup.py │ │ └── light_flownet.py │ ├── p3aformer_losses │ │ ├── losses.py │ │ └── utils.py │ ├── p3aformer_post_processing │ │ ├── decode.py │ │ ├── post_process.py │ │ └── utils.py │ └── p3aformer_reid │ │ ├── resnet.py │ │ ├── slover.py │ │ └── triplet_loss.py ├── position_encoding.py ├── qim.py ├── segmentation.py └── structures │ ├── __init__.py │ ├── boxes.py │ └── instances.py ├── motr_demo.py ├── preprocess ├── convert_cityperson_to_coco.py ├── convert_crowdhuman_to_coco.py ├── convert_ethz_to_coco.py ├── convert_mot17_to_coco.py ├── convert_mot20_to_coco.py ├── data_preprocess.sh ├── make_mixed_dirs.sh ├── mix_data_ablation.py ├── mix_data_test_mot17.py └── mix_data_test_mot20.py ├── pretrained ├── requirements.txt ├── submit.py ├── tools ├── __init__.py ├── add_train_for_submission.py ├── combine_labels_mot.py ├── gen_labels_MOT17.py ├── gen_labels_mot15.py ├── launch.py ├── run_dist_launch.sh ├── run_dist_slurm.sh ├── transcenter_mot15_to_coco.py ├── visualization_tool.py └── visualize_validation_gt_mot17.py ├── tracker ├── .DS_Store ├── __init__.py ├── byte_tracker │ ├── __init__.py │ ├── byte_tracker.py │ └── mot_online │ │ ├── __init__.py │ │ ├── basetrack.py │ │ ├── kalman_filter.py │ │ └── matching.py ├── common │ ├── __init__.py │ └── track_structure_transfer.py ├── d2_p3aformer │ ├── d2_p3aformer_tracker.py │ └── write_results.py └── dense_tracker │ └── dense_tracker.py └── util ├── __init__.py ├── box_ops.py ├── evaluation.py ├── image.py ├── misc.py ├── motdet_eval.py ├── p3aformer ├── __init__.py ├── p3aformer_misc.py └── tracker_util.py ├── plot_utils.py ├── system.py ├── tool.py └── vis_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Benchmark inference speed of Deformable DETR. 
3 | """ 4 | import os 5 | import time 6 | import argparse 7 | 8 | import torch 9 | 10 | from main import get_args_parser as get_main_args_parser 11 | from models import build_model 12 | from datasets import build_dataset 13 | from util.misc import nested_tensor_from_tensor_list 14 | 15 | 16 | def get_benckmark_arg_parser(): 17 | parser = argparse.ArgumentParser('Benchmark inference speed of Deformable DETR.') 18 | parser.add_argument('--num_iters', type=int, default=300, help='total iters to benchmark speed') 19 | parser.add_argument('--warm_iters', type=int, default=5, help='ignore first several iters that are very slow') 20 | parser.add_argument('--batch_size', type=int, default=1, help='batch size in inference') 21 | parser.add_argument('--resume', type=str, help='load the pre-trained checkpoint') 22 | return parser 23 | 24 | 25 | @torch.no_grad() 26 | def measure_average_inference_time(model, inputs, num_iters=100, warm_iters=5): 27 | ts = [] 28 | for iter_ in range(num_iters): 29 | torch.cuda.synchronize() 30 | t_ = time.perf_counter() 31 | model(inputs) 32 | torch.cuda.synchronize() 33 | t = time.perf_counter() - t_ 34 | if iter_ >= warm_iters: 35 | ts.append(t) 36 | print(ts) 37 | return sum(ts) / len(ts) 38 | 39 | 40 | def benchmark(): 41 | args, _ = get_benckmark_arg_parser().parse_known_args() 42 | main_args = get_main_args_parser().parse_args(_) 43 | assert args.warm_iters < args.num_iters and args.num_iters > 0 and args.warm_iters >= 0 44 | assert args.batch_size > 0 45 | assert args.resume is None or os.path.exists(args.resume) 46 | dataset = build_dataset('val', main_args) 47 | model, _, _ = build_model(main_args) 48 | model.cuda() 49 | model.eval() 50 | if args.resume is not None: 51 | ckpt = torch.load(args.resume, map_location=lambda storage, loc: storage) 52 | model.load_state_dict(ckpt['model']) 53 | inputs = nested_tensor_from_tensor_list([dataset.__getitem__(0)[0].cuda() for _ in range(args.batch_size)]) 54 | t = measure_average_inference_time(model, inputs, args.num_iters, args.warm_iters) 55 | return 1.0 / t * args.batch_size 56 | 57 | 58 | if __name__ == '__main__': 59 | fps = benchmark() 60 | print(f'Inference Speed: {fps:.1f} FPS') 61 | 62 | -------------------------------------------------------------------------------- /configs/detracker_reidV3.yaml: -------------------------------------------------------------------------------- 1 | tracktor: 2 | name: P3AFormer 3 | module_name: MOT 4 | desription: transformer 5 | seed: 12345 6 | network: transformer 7 | 8 | output_dir: . 9 | obj_detect_model: . 10 | 11 | reid_weights: ResNet_iter_25245.pth 12 | 13 | frame_split: [0.0, 1.0] 14 | 15 | tracker: 16 | track_thresh: 0.5 17 | out_thresh: 0.3 18 | pre_thresh: 0.5 19 | new_thresh: 0.3 20 | detection_nms_thresh: 0.5 21 | motion_model: 22 | enabled: False 23 | # average velocity over last n_steps steps 24 | n_steps: 1 25 | # if true, only model the movement of the bounding box center. If false, width and height are also modeled. 
-------------------------------------------------------------------------------- /configs/mot_detectron2/d2_2080ti_p3aformer_train.sh: --------------------------------------------------------------------------------
export OUTPUT_DIR="output/jun3_2080ti"
python d2_main.py \
    --config-file configs/mot_detectron2/p3aformer_small.yaml \
    --num-gpus 8 SOLVER.IMS_PER_BATCH 16 SOLVER.MAX_ITER 83100 OUTPUT_DIR ${OUTPUT_DIR} INPUT.VAL_DATA_DIR "/data/dataset/mot" MODEL.DENSETRACK.ENC_LAYERS 2 MODEL.DENSETRACK.DEC_LAYERS 3
-------------------------------------------------------------------------------- /configs/mot_detectron2/d2_2080ti_p3aformer_val.sh: --------------------------------------------------------------------------------
# validation mot 15
MODEL_DIR="output/jun3_2080ti"
python d2_main.py \
    --config-file configs/mot_detectron2/p3aformer_small.yaml --eval-only \
    --num-gpus 1 DATALOADER.NUM_WORKERS 0 SOLVER.IMS_PER_BATCH 2 SOLVER.MAX_ITER 83100 MODEL.WEIGHTS ${MODEL_DIR}"/model_final.pth" INPUT.VAL_DATA_DIR "/data/dataset/MOT15" MODEL.DENSETRACK.ENC_LAYERS 2 MODEL.DENSETRACK.DEC_LAYERS 3 OUTPUT_DIR ${MODEL_DIR} TRACK.DENSETRACK.TRACK_THRE 0.2

# validation mot17
MODEL_DIR="output/jun3_2080ti"
SPLIT="val_half"
MODEL_NAME=${MODEL_DIR}"/model_final.pth"
python d2_main.py \
    --config-file configs/mot_detectron2/p3aformer_small.yaml --eval-only \
    --num-gpus 1 DATALOADER.NUM_WORKERS 0 SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHTS ${MODEL_NAME} INPUT.VAL_DATA_DIR "/data/dataset/mot" DATASETS.TEST '("MOT17",)' MODEL.DENSETRACK.ENC_LAYERS 2 MODEL.DENSETRACK.DEC_LAYERS 3 OUTPUT_DIR ${MODEL_DIR} TRACK.DENSETRACK.TRACK_THRE 0.5 TRACK.DENSETRACK.PRE_THRE 0.0 TRACK.DENSETRACK.OUT_THRE 0.0

# TRACK.VIS True

# debug only
MODEL_DIR="output/april17_2080ti"
SPLIT="val_half"
MODEL_NAME=${MODEL_DIR}"/model_0039999.pth"
python d2_main.py \
    --config-file configs/mot_detectron2/p3aformer_small.yaml --eval-only \
    --num-gpus 1 DATALOADER.NUM_WORKERS 0 SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHTS ${MODEL_NAME} INPUT.VAL_DATA_DIR "/data/dataset/mot" DATASETS.TEST '("MOT17",)' MODEL.DENSETRACK.ENC_LAYERS 2 MODEL.DENSETRACK.DEC_LAYERS 3 OUTPUT_DIR ${MODEL_DIR} TRACK.DENSETRACK.TRACK_THRE 0.0 TRACK.DENSETRACK.PRE_THRE 0.0 TRACK.DENSETRACK.OUT_THRE 0.0 TRACK.VIS True
-------------------------------------------------------------------------------- /configs/mot_detectron2/d2_debug_p3aformer_train.sh: --------------------------------------------------------------------------------
python d2_main.py \
    --config-file configs/mot_detectron2/p3aformer_small.yaml \
    --num-gpus 1 DATALOADER.NUM_WORKERS 0 DATASETS.TEST '("MOT17",)' INPUT.VAL_DATA_DIR "/data/dataset/mot" TRACK.VIS True
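The d2_*.sh scripts above drive d2_main.py with a --config-file plus trailing KEY VALUE overrides, which is the standard detectron2 launch pattern. A minimal sketch of such an entry point follows; it is an assumption about how d2_main.py is wired, not a copy of it, and add_p3aformer_config is a hypothetical stand-in for whatever configs/mot_detectron2/p3aformer_config_init.py registers (the custom MODEL.DENSETRACK and TRACK keys must exist in the config before merging):

from detectron2.config import get_cfg
from detectron2.engine import default_argument_parser, default_setup, launch


def setup(args):
    cfg = get_cfg()
    # add_p3aformer_config(cfg)  # hypothetical: register MODEL.DENSETRACK / TRACK keys first
    cfg.merge_from_file(args.config_file)  # e.g. p3aformer_small.yaml, which pulls in p3aformer_base.yaml via _BASE_
    cfg.merge_from_list(args.opts)         # the trailing KEY VALUE pairs from the shell scripts
    cfg.freeze()
    default_setup(cfg, args)
    return cfg


def main(args):
    cfg = setup(args)
    # build the model / trainer / evaluator from cfg here
    return cfg


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    launch(main, args.num_gpus, num_machines=args.num_machines,
           machine_rank=args.machine_rank, dist_url=args.dist_url, args=(args,))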
/configs/mot_detectron2/d2_v100_p3aformer_train.sh: -------------------------------------------------------------------------------- 1 | export OUTPUT_DIR="output/June2Mixed" 2 | python d2_main.py \ 3 | --config-file configs/mot_detectron2/p3aformer_big.yaml \ 4 | --num-gpus 4 SOLVER.IMS_PER_BATCH 16 SOLVER.MAX_ITER 83100 OUTPUT_DIR ${OUTPUT_DIR} MODEL.DENSETRACK.ENC_LAYERS 5 MODEL.DENSETRACK.DEC_LAYERS 5 -------------------------------------------------------------------------------- /configs/mot_detectron2/d2_v100_p3aformer_val.sh: -------------------------------------------------------------------------------- 1 | # # validation MOT15 2 | MODEL_NAME="output/feb15_v100/model_final.pth" 3 | python d2_main.py \ 4 | --config-file configs/mot_detectron2/p3aformer_small.yaml --eval-only \ 5 | --num-gpus 1 DATALOADER.NUM_WORKERS 0 SOLVER.IMS_PER_BATCH 1 SOLVER.MAX_ITER 83100 MODEL.WEIGHTS ${MODEL_NAME} INPUT.VAL_DATA_DIR "/data/dataset/MOT15" MODEL.DENSETRACK.ENC_LAYERS 6 MODEL.DENSETRACK.DEC_LAYERS 6 6 | 7 | # open visualization on MOT17 8 | MODEL_DIR="output/June2Mixed" 9 | SPLIT="val_half" 10 | MODEL_NAME=${MODEL_DIR}"/model_final.pth" 11 | OUTPUT_DIR="output/June2Mixed/model_final" 12 | python d2_main.py \ 13 | --config-file configs/mot_detectron2/p3aformer_big.yaml --eval-only \ 14 | --num-gpus 1 DATALOADER.NUM_WORKERS 0 SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHTS ${MODEL_NAME} INPUT.VAL_DATA_DIR "/data/dataset/mot" DATASETS.TEST '("MOT17",)' MODEL.DENSETRACK.ENC_LAYERS 5 MODEL.DENSETRACK.DEC_LAYERS 5 OUTPUT_DIR ${OUTPUT_DIR} 15 | # TRACK.VIS True 16 | -------------------------------------------------------------------------------- /configs/mot_detectron2/p3aformer_base.yaml: -------------------------------------------------------------------------------- 1 | # MODEL: 2 | # BACKBONE: 3 | # FREEZE_AT: 0 4 | # NAME: "build_resnet_backbone" 5 | # WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | # PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | # PIXEL_STD: [58.395, 57.120, 57.375] 8 | # RESNETS: 9 | # DEPTH: 50 10 | # STEM_TYPE: "basic" # not used 11 | # STEM_OUT_CHANNELS: 64 12 | # STRIDE_IN_1X1: False 13 | # OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # # NORM: "SyncBN" 15 | # RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: # not used 17 | TRAIN: ("coco_2017_train_panoptic",) # not used 18 | TEST: ("MOT15",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | LR_BACKBONE: 2e-5 21 | OUTPUT_DIR: "output/debug" 22 | # IMS_PER_BATCH: 16 23 | # BASE_LR: 0.0001 24 | # STEPS: (327778, 355092) 25 | # MAX_ITER: 368750 26 | # WARMUP_FACTOR: 1.0 27 | # WARMUP_ITERS: 10 28 | # WEIGHT_DECAY: 0.05 29 | # OPTIMIZER: "ADAMW" 30 | # BACKBONE_MULTIPLIER: 0.1 31 | # CLIP_GRADIENTS: 32 | # ENABLED: True 33 | # CLIP_TYPE: "full_model" 34 | # CLIP_VALUE: 0.01 35 | # NORM_TYPE: 2.0 36 | # AMP: 37 | # ENABLED: True 38 | # INPUT: 39 | # IMAGE_SIZE: 1024 40 | # MIN_SCALE: 0.1 41 | # MAX_SCALE: 2.0 42 | # FORMAT: "RGB" 43 | # DATASET_MAPPER_NAME: "coco_panoptic_lsj" 44 | # TEST: 45 | # EVAL_PERIOD: 5000 46 | # DATALOADER: 47 | # FILTER_EMPTY_ANNOTATIONS: True 48 | # NUM_WORKERS: 4 49 | VERSION: 2 50 | -------------------------------------------------------------------------------- /configs/mot_detectron2/p3aformer_big.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: p3aformer_base.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "D2P3AFormer" 4 | DENSETRACK: 5 | ENC_LAYERS: 2 6 | DEC_LAYERS: 2 7 | NUM_FEATURE_LEVELS: 4 8 | 
DIM_FEEDFORWARD: 1024 9 | HIDDEN_DIM: 256 10 | POSITION_EMBEDDING: "sine" 11 | BACKBONE: "resnet50" 12 | DILATION: False 13 | DROPOUT: 0.1 14 | DEC_N_POINTS: 4 15 | ENC_N_POINTS: 4 16 | TRACKING: True 17 | SAME_AUG_PRE: True 18 | PRE_HM: True 19 | HM_WEIGHT: 1.0 20 | OFF_WEIGHT: 1.0 21 | WH_WEIGHT: 0.1 22 | BOXES_WEIGHT: 0.5 23 | GIOU_WEIGHT: 0.4 24 | CT_OFFSET_WEIGHT: 0.1 25 | TRACKING_WEIGHT: 1.0 26 | NORM_FACTOR: 1.0 27 | DEFAULT_RESOLUTION: [640, 1088] 28 | SOLVER: 29 | OPTIMIZER: "ADAMW" 30 | AUX_LOSS: False 31 | IMS_PER_BATCH: 2 32 | BASE_LR: 1e-4 33 | MAX_ITER: 160000 34 | WARMUP_FACTOR: 1.0 35 | WARMUP_ITERS: 0 36 | WEIGHT_DECAY: 0.05 37 | OPTIMIZER: "ADAMW" 38 | LR_SCHEDULER_NAME: "WarmupPolyLR" 39 | BACKBONE_MULTIPLIER: 0.1 40 | # ''' 41 | # python main.py \ 42 | # --meta_arch p3aformer \ 43 | # --data_dir /data/dataset/mix_det \ 44 | # --dataset_name MOT17 \ 45 | # --dataset_file p3aformer_mixed \ 46 | # --batch_size=2 \ 47 | # --output_dir=./output/whole_MOT17_from_CH \ 48 | # --num_workers=16 \ 49 | # --pre_hm \ 50 | # --tracking \ 51 | # --same_aug_pre \ 52 | # --image_blur_aug \ 53 | # --lr 1e-4 \ 54 | # --lr_backbone_names ["backbone.0"] \ 55 | # --lr_backbone 2e-5 \ 56 | # --lr_linear_proj_names ['reference_points', 'sampling_offsets',] \ 57 | # --lr_linear_proj_mult 0.1 \ 58 | # --lr_drop 40 \ 59 | # --epochs 5 \ 60 | # --weight_decay 1e-4 \ 61 | # --clip_max_norm 0.1 \ 62 | # --backbone 'resnet50' \ 63 | # --position_embedding 'sine' \ 64 | # --num_feature_levels 3 \ 65 | # --enc_layers 2 \ 66 | # --dec_layers 2 \ 67 | # --dim_feedforward 1024 \ 68 | # --hidden_dim 256 \ 69 | # --shift 0.05 \ 70 | # --scale 0.05 \ 71 | # --rotate 0 \ 72 | # --flip 0.5 \ 73 | # --hm_disturb 0.05 \ 74 | # --lost_disturb 0.4 \ 75 | # --fp_disturb 0.1 \ 76 | # --track_thresh 0.3 \ 77 | # --new_thresh 0.3 \ 78 | # --ltrb_amodal_weight 0.1 79 | # ''' 80 | # SEM_SEG_HEAD: 81 | # NAME: "MaskFormerHead" 82 | # IN_FEATURES: ["res2", "res3", "res4", "res5"] 83 | # IGNORE_VALUE: 255 84 | # NUM_CLASSES: 133 85 | # LOSS_WEIGHT: 1.0 86 | # CONVS_DIM: 256 87 | # MASK_DIM: 256 88 | # MASK_DIM: 256 89 | # NORM: "GN" 90 | # # pixel decoder 91 | # PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 92 | # IN_FEATURES: ["res2", "res3", "res4", "res5"] 93 | # DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 94 | # COMMON_STRIDE: 4 95 | # TRANSFORMER_ENC_LAYERS: 6 96 | # MASK_FORMER: 97 | # TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 98 | # TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 99 | # DEEP_SUPERVISION: True 100 | # NO_OBJECT_WEIGHT: 0.1 101 | # CLASS_WEIGHT: 2.0 102 | # MASK_WEIGHT: 5.0 103 | # DICE_WEIGHT: 5.0 104 | # HIDDEN_DIM: 256 105 | # NUM_OBJECT_QUERIES: 100 106 | # NHEADS: 8 107 | # DROPOUT: 0.0 108 | # DIM_FEEDFORWARD: 2048 109 | # ENC_LAYERS: 0 110 | # PRE_NORM: False 111 | # ENFORCE_INPUT_PROJ: False 112 | # SIZE_DIVISIBILITY: 32 113 | # DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 114 | # TRAIN_NUM_POINTS: 12544 115 | # OVERSAMPLE_RATIO: 3.0 116 | # IMPORTANCE_SAMPLE_RATIO: 0.75 117 | # TEST: 118 | # SEMANTIC_ON: True 119 | # INSTANCE_ON: True 120 | # PANOPTIC_ON: True 121 | # OVERLAP_THRESHOLD: 0.8 122 | # OBJECT_MASK_THRESHOLD: 0.8 123 | -------------------------------------------------------------------------------- /configs/mot_detectron2/p3aformer_small.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: p3aformer_base.yaml 2 | MODEL: 3 | META_ARCHITECTURE: 
"D2P3AFormer" 4 | DENSETRACK: 5 | ENC_LAYERS: 2 6 | DEC_LAYERS: 2 7 | NUM_FEATURE_LEVELS: 4 8 | DIM_FEEDFORWARD: 1024 9 | HIDDEN_DIM: 256 10 | POSITION_EMBEDDING: "sine" 11 | BACKBONE: "resnet50" 12 | DILATION: False 13 | DROPOUT: 0.1 14 | DEC_N_POINTS: 4 15 | ENC_N_POINTS: 4 16 | TRACKING: True 17 | SAME_AUG_PRE: True 18 | PRE_HM: True 19 | HM_WEIGHT: 1.0 20 | OFF_WEIGHT: 1.0 21 | WH_WEIGHT: 0.1 22 | BOXES_WEIGHT: 0.5 23 | GIOU_WEIGHT: 0.4 24 | CT_OFFSET_WEIGHT: 0.1 25 | TRACKING_WEIGHT: 1.0 26 | NORM_FACTOR: 1.0 27 | DEFAULT_RESOLUTION: [640, 1088] 28 | SOLVER: 29 | OPTIMIZER: "ADAMW" 30 | AUX_LOSS: False 31 | IMS_PER_BATCH: 2 32 | BASE_LR: 1e-4 33 | MAX_ITER: 160000 34 | WARMUP_FACTOR: 1.0 35 | WARMUP_ITERS: 0 36 | WEIGHT_DECAY: 0.05 37 | OPTIMIZER: "ADAMW" 38 | LR_SCHEDULER_NAME: "WarmupPolyLR" 39 | BACKBONE_MULTIPLIER: 0.1 40 | # ''' 41 | # python main.py \ 42 | # --meta_arch p3aformer \ 43 | # --data_dir /data/dataset/mix_det \ 44 | # --dataset_name MOT17 \ 45 | # --dataset_file p3aformer_mixed \ 46 | # --batch_size=2 \ 47 | # --output_dir=./output/whole_MOT17_from_CH \ 48 | # --num_workers=16 \ 49 | # --pre_hm \ 50 | # --tracking \ 51 | # --same_aug_pre \ 52 | # --image_blur_aug \ 53 | # --lr 1e-4 \ 54 | # --lr_backbone_names ["backbone.0"] \ 55 | # --lr_backbone 2e-5 \ 56 | # --lr_linear_proj_names ['reference_points', 'sampling_offsets',] \ 57 | # --lr_linear_proj_mult 0.1 \ 58 | # --lr_drop 40 \ 59 | # --epochs 5 \ 60 | # --weight_decay 1e-4 \ 61 | # --clip_max_norm 0.1 \ 62 | # --backbone 'resnet50' \ 63 | # --position_embedding 'sine' \ 64 | # --num_feature_levels 3 \ 65 | # --enc_layers 2 \ 66 | # --dec_layers 2 \ 67 | # --dim_feedforward 1024 \ 68 | # --hidden_dim 256 \ 69 | # --shift 0.05 \ 70 | # --scale 0.05 \ 71 | # --rotate 0 \ 72 | # --flip 0.5 \ 73 | # --hm_disturb 0.05 \ 74 | # --lost_disturb 0.4 \ 75 | # --fp_disturb 0.1 \ 76 | # --track_thresh 0.3 \ 77 | # --new_thresh 0.3 \ 78 | # --ltrb_amodal_weight 0.1 79 | # ''' 80 | # SEM_SEG_HEAD: 81 | # NAME: "MaskFormerHead" 82 | # IN_FEATURES: ["res2", "res3", "res4", "res5"] 83 | # IGNORE_VALUE: 255 84 | # NUM_CLASSES: 133 85 | # LOSS_WEIGHT: 1.0 86 | # CONVS_DIM: 256 87 | # MASK_DIM: 256 88 | # MASK_DIM: 256 89 | # NORM: "GN" 90 | # # pixel decoder 91 | # PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 92 | # IN_FEATURES: ["res2", "res3", "res4", "res5"] 93 | # DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 94 | # COMMON_STRIDE: 4 95 | # TRANSFORMER_ENC_LAYERS: 6 96 | # MASK_FORMER: 97 | # TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 98 | # TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 99 | # DEEP_SUPERVISION: True 100 | # NO_OBJECT_WEIGHT: 0.1 101 | # CLASS_WEIGHT: 2.0 102 | # MASK_WEIGHT: 5.0 103 | # DICE_WEIGHT: 5.0 104 | # HIDDEN_DIM: 256 105 | # NUM_OBJECT_QUERIES: 100 106 | # NHEADS: 8 107 | # DROPOUT: 0.0 108 | # DIM_FEEDFORWARD: 2048 109 | # ENC_LAYERS: 0 110 | # PRE_NORM: False 111 | # ENFORCE_INPUT_PROJ: False 112 | # SIZE_DIVISIBILITY: 32 113 | # DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 114 | # TRAIN_NUM_POINTS: 12544 115 | # OVERSAMPLE_RATIO: 3.0 116 | # IMPORTANCE_SAMPLE_RATIO: 0.75 117 | # TEST: 118 | # SEMANTIC_ON: True 119 | # INSTANCE_ON: True 120 | # PANOPTIC_ON: True 121 | # OVERLAP_THRESHOLD: 0.8 122 | # OBJECT_MASK_THRESHOLD: 0.8 123 | -------------------------------------------------------------------------------- /configs/reid.yaml: -------------------------------------------------------------------------------- 1 | 
reid: 2 | name: test 3 | module_name: reid 4 | desription: 5 | seed: 12345 6 | # smth like MOT_train, KITTI_train_Pedestrian 7 | db_train: mot_reid_small_train 8 | db_val: False 9 | 10 | model_args: 11 | # Recommended for loss: batch_all, batch_hard 12 | # Unstable, no guarantee they are working: weighted_triplet, cross_entropy 13 | loss: batch_hard 14 | margin: 0.2 15 | # Plot prec at k to tensorboard, 0 for off 16 | prec_at_k: 3 17 | 18 | solver: 19 | optim: Adam 20 | optim_args: 21 | lr: 0.0003 22 | weight_decay: 0.0000 23 | 24 | dataloader: 25 | # all targets with visibility lower than this are filtered out, for kitti set it to 26 | # a sequence with maximal [truncation, occlusion] levels 27 | vis_threshold: 0.3 28 | P: 18 29 | K: 4 30 | # limit maximum number of images per identity 31 | max_per_person: 1000 32 | crop_H: 256 33 | crop_W: 128 34 | # center: just a center crop, random: random crop and 0.5 horizontal flip probability 35 | transform: random 36 | normalize_mean: 37 | - 0.485 38 | - 0.456 39 | - 0.406 40 | normalize_std: 41 | - 0.229 42 | - 0.224 43 | - 0.225 44 | 45 | cnn: 46 | output_dim: 128 -------------------------------------------------------------------------------- /configs/standard/2080ti_debug_train_coco.sh: -------------------------------------------------------------------------------- 1 | python main.py \ 2 | --dataset_name MOT17 --dataset_file coco \ 3 | --output_dir=./output/jul19_whole_coco --batch_size=1 --num_workers=0 --pre_hm --tracking --data_dir=/data/dataset/coco --scale 0.05 --shift 0.05 --flip 0.5 --meta_arch p3aformer --resume="" --num_feature_levels 3 --enc_layers 2 --dec_layers 2 -------------------------------------------------------------------------------- /configs/standard/2080ti_debug_train_crowdhuman.sh: -------------------------------------------------------------------------------- 1 | python main.py --output_dir=./output/debug_train_ch \ 2 | --dataset_name MOT17 --dataset_file crowdHuman --meta_arch p3aformer --batch_size=1 --num_workers=1 --pre_hm --tracking --data_dir=/data/dataset/crowdhuman --num_feature_levels 3 --enc_layers 2 --dec_layers 2 -------------------------------------------------------------------------------- /configs/standard/2080ti_debug_train_mot17.sh: -------------------------------------------------------------------------------- 1 | # debug, new dataset, reduced model size 2 | python main.py \ 3 | --meta_arch p3aformer \ 4 | --data_dir /data/dataset/mot \ 5 | --dataset_name MOT17 \ 6 | --dataset_file p3aformer_mot \ 7 | --batch_size=2 \ 8 | --output_dir=./output/debug \ 9 | --num_workers=20 \ 10 | --resume="" \ 11 | --pre_hm \ 12 | --tracking \ 13 | --same_aug_pre \ 14 | --image_blur_aug \ 15 | --lr 1e-4 \ 16 | --lr_backbone_names ["backbone.0"] \ 17 | --lr_backbone 2e-5 \ 18 | --lr_linear_proj_names ['reference_points', 'sampling_offsets',] \ 19 | --lr_linear_proj_mult 0.1 \ 20 | --lr_drop 40 \ 21 | --epochs 23 \ 22 | --weight_decay 1e-4 \ 23 | --clip_max_norm 0.1 \ 24 | --backbone 'resnet50' \ 25 | --position_embedding 'sine' \ 26 | --num_feature_levels 3 \ 27 | --enc_layers 2 \ 28 | --dec_layers 2 \ 29 | --dim_feedforward 1024 \ 30 | --hidden_dim 256 \ 31 | --shift 0.05 \ 32 | --scale 0.05 \ 33 | --rotate 0 \ 34 | --flip 0.5 \ 35 | --hm_disturb 0.05 \ 36 | --lost_disturb 0.4 \ 37 | --fp_disturb 0.1 \ 38 | --track_thresh 0.3 \ 39 | --new_thresh 0.3 \ 40 | --ltrb_amodal_weight 0.1 -------------------------------------------------------------------------------- /configs/standard/interpolation.sh: 
-------------------------------------------------------------------------------- 1 | # MOT 15 2 | EXP_DIR=exps/p3aformer_trained 3 | EXP_ID='' 4 | python3 interpolation.py \ 5 | --dataset_name MOT15 \ 6 | --data_dir /data/dataset/MOT15/ \ 7 | --input_txt_dir ${EXP_DIR}/${EXP_ID}/txt \ 8 | --output_txt_dir ${EXP_DIR}/${EXP_ID}/txt_interpolated 9 | 10 | # MOT 17 11 | EXP_DIR=exps/p3aformer_trained 12 | EXP_ID='p3aformer_trained' 13 | python3 interpolation.py \ 14 | --dataset_name MOT17 \ 15 | --data_dir /data/dataset/mot/ \ 16 | --input_txt_dir ${EXP_DIR}/${EXP_ID}/txt \ 17 | --output_txt_dir ${EXP_DIR}/${EXP_ID}/txt_interpolated -------------------------------------------------------------------------------- /configs/standard/v100_mot17_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch \ 2 | --nproc_per_node=8 --use_env main.py \ 3 | --dataset_name MOT17 --dataset_file coco \ 4 | --output_dir=./output/jul19_whole_coco --batch_size=3 --num_workers=20 --pre_hm \ 5 | --tracking --data_dir=/data/dataset/coco --scale 0.05 --shift 0.05 --flip 0.5 --meta_arch p3aformer --resume="" -------------------------------------------------------------------------------- /configs/standard/v100_mot17_crowdhuman.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch \ 2 | --nproc_per_node=8 --use_env main.py \ 3 | --dataset_name MOT17 --dataset_file crowdHuman --output_dir=./output/jul14_whole_ch_from_COCO --batch_size=1 \ 4 | --num_workers=4 --resume=./output/whole_coco/checkpoint0049.pth --pre_hm --tracking \ 5 | --data_dir=/data/dataset/crowdhuman --meta_arch p3aformer 6 | -------------------------------------------------------------------------------- /configs/standard/v100_mot17_fine_tune_mot17.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch --nproc_per_node=8 \ 2 | --use_env main.py \ 3 | --meta_arch p3aformer \ 4 | --data_dir /data/dataset/mot \ 5 | --dataset_name MOT17 \ 6 | --dataset_file p3aformer_mot \ 7 | --batch_size=3 \ 8 | --output_dir=./output/jul16_mot17_finetune \ 9 | --num_workers=20 \ 10 | --resume=output/jul14_whole_ch_from_COCO/checkpoint0049.pth \ 11 | --pre_hm \ 12 | --tracking \ 13 | --same_aug_pre \ 14 | --image_blur_aug \ 15 | --lr 1e-4 \ 16 | --lr_backbone_names ["backbone.0"] \ 17 | --lr_backbone 2e-5 \ 18 | --lr_linear_proj_names ['reference_points', 'sampling_offsets',] \ 19 | --lr_linear_proj_mult 0.1 \ 20 | --lr_drop 40 \ 21 | --epochs 50 \ 22 | --weight_decay 1e-4 \ 23 | --clip_max_norm 0.1 \ 24 | --backbone 'resnet50' \ 25 | --position_embedding 'sine' \ 26 | --num_feature_levels 4 \ 27 | --enc_layers 6 \ 28 | --dec_layers 6 \ 29 | --dim_feedforward 1024 \ 30 | --hidden_dim 256 \ 31 | --shift 0.05 \ 32 | --scale 0.05 \ 33 | --rotate 0 \ 34 | --flip 0.5 \ 35 | --hm_disturb 0.05 \ 36 | --lost_disturb 0.4 \ 37 | --fp_disturb 0.1 \ 38 | --track_thresh 0.3 \ 39 | --new_thresh 0.3 \ 40 | --ltrb_amodal_weight 0.1 -------------------------------------------------------------------------------- /configs/standard/v100_submit_mot17.sh: -------------------------------------------------------------------------------- 1 | EXP_DIR=output/july5_mot17_finetune 2 | EXP_ID='Jul7WholeFineProcedure' 3 | MODEL_NAME=checkpoint0049.pth # our trained 4 | python3 eval.py \ 5 | --meta_arch p3aformer \ 6 | --dataset_file e2e_joint \ 7 | --dataset_name MOT17 \ 8 | --epoch 200 \ 9 | 
    --with_box_refine \
    --lr_drop 100 \
    --lr 2e-4 \
    --lr_backbone 2e-5 \
    --pretrained ${EXP_DIR}/${MODEL_NAME} \
    --output_dir ${EXP_DIR}/${EXP_ID} \
    --batch_size 1 \
    --sample_mode 'random_interval' \
    --sample_interval 10 \
    --sampler_steps 50 90 120 \
    --sampler_lengths 2 3 4 5 \
    --update_query_pos \
    --merger_dropout 0 \
    --dropout 0 \
    --random_drop 0.1 \
    --fp_ratio 0.3 \
    --query_interaction_layer 'QIM' \
    --extra_track_attn \
    --resume ${EXP_DIR}/${MODEL_NAME} \
    --mot_path datasets \
    --detr_path ${EXP_DIR}/${MODEL_NAME} \
    --reid_path ${EXP_DIR}/ResNet_iter_25245.pth \
    --data_dir=/data/dataset/mot/ \
    --track_thre 0.5 \
    --low_thre 0.2 \
    --submit
-------------------------------------------------------------------------------- /configs/standard/v100_test_mot15.sh: --------------------------------------------------------------------------------
# validate our trained
# for MOT15
EXP_DIR=output/feb9_long_epoch
EXP_ID='mot17_p3aformer_long_epoch'
MODEL_NAME='checkpoint0199.pth'
python3 eval.py \
    --meta_arch p3aformer \
    --dataset_name MOT15 \
    --epoch 200 \
    --with_box_refine \
    --lr_drop 100 \
    --lr 2e-4 \
    --lr_backbone 2e-5 \
    --pretrained ${EXP_DIR}/${MODEL_NAME} \
    --output_dir ${EXP_DIR}/${EXP_ID} \
    --batch_size 1 \
    --sample_mode 'random_interval' \
    --sample_interval 10 \
    --sampler_steps 50 90 120 \
    --sampler_lengths 2 3 4 5 \
    --update_query_pos \
    --merger_dropout 0 \
    --dropout 0 \
    --random_drop 0.1 \
    --fp_ratio 0.3 \
    --query_interaction_layer 'QIM' \
    --extra_track_attn \
    --resume ${EXP_DIR}/${MODEL_NAME} \
    --mot_path datasets \
    --detr_path ${EXP_DIR}/${MODEL_NAME} \
    --reid_path ${EXP_DIR}/ResNet_iter_25245.pth \
    --data_dir=/data/dataset/MOT15/ \
    --track_thre 0.65 \
    --low_thre 0.2 \
    --first_assign_thre 0.9 \
    --second_assign_thre 0.5
-------------------------------------------------------------------------------- /configs/standard/v100_test_mot17.sh: --------------------------------------------------------------------------------
# for MOT17
EXP_DIR=output/jul16_mot17_finetune
EXP_ID='Jul18Validate'
MODEL_NAME=checkpoint0049.pth # our trained
python3 eval.py \
    --dataset_file p3aformer_mot \
    --meta_arch p3aformer \
    --dataset_name MOT17 \
    --epoch 200 \
    --with_box_refine \
    --lr_drop 100 \
    --lr 2e-4 \
    --lr_backbone 2e-5 \
    --pretrained ${EXP_DIR}/${MODEL_NAME} \
    --output_dir ${EXP_DIR}/${EXP_ID} \
    --batch_size 1 \
    --sample_mode 'random_interval' \
    --sample_interval 10 \
    --sampler_steps 50 90 120 \
    --sampler_lengths 2 3 4 5 \
    --update_query_pos \
    --merger_dropout 0 \
    --dropout 0 \
    --random_drop 0.1 \
    --fp_ratio 0.3 \
    --query_interaction_layer 'QIM' \
    --extra_track_attn \
    --resume ${EXP_DIR}/${MODEL_NAME} \
    --mot_path datasets \
    --detr_path ${EXP_DIR}/${MODEL_NAME} \
    --reid_path ${EXP_DIR}/ResNet_iter_25245.pth \
    --data_dir=/data/dataset/mot/ \
    --track_thre 0.5 \
    --low_thre 0.2
-------------------------------------------------------------------------------- /datasets/__init__.py: --------------------------------------------------------------------------------
import torch.utils.data
from .detmot import build as build_e2e_mot
from .static_detmot import
build as build_e2e_static_mot 4 | from .joint import build as build_e2e_joint 5 | from .torchvision_datasets import CocoDetection 6 | from .byte_mot_half import build as build_byte_mot_val 7 | from .p3aformer_dataset.coco import build as build_coco 8 | from .p3aformer_dataset.mot17_train import build as build_p3aformer_mot 9 | from .p3aformer_dataset.crowdhuman import CrowdHuman, build_crowdhuman 10 | from .p3aformer_dataset.mot17_train import build as build_p3aformer_mot_mixed 11 | 12 | 13 | def get_coco_api_from_dataset(dataset): 14 | for _ in range(10): 15 | # if isinstance(dataset, torchvision.datasets.CocoDetection): 16 | # break 17 | if isinstance(dataset, torch.utils.data.Subset): 18 | dataset = dataset.dataset 19 | if isinstance(dataset, CocoDetection): 20 | return dataset.coco 21 | 22 | 23 | def build_dataset(image_set, args): 24 | if args.dataset_file == "coco": 25 | return build_coco(image_set, args) 26 | if args.dataset_file == "coco_panoptic": 27 | # to avoid making panopticapi required for coco 28 | from .coco_panoptic import build as build_coco_panoptic 29 | 30 | return build_coco_panoptic(image_set, args) 31 | if args.dataset_file == "e2e_joint": # default dataset for MOT task 32 | return build_e2e_joint(image_set, args) 33 | if args.dataset_file == "e2e_static_mot": 34 | return build_e2e_static_mot(image_set, args) 35 | if args.dataset_file == "e2e_mot": 36 | return build_e2e_mot(image_set, args) 37 | if args.dataset_file == "byte_mot_half": 38 | return build_byte_mot_val(image_set, args) 39 | if args.dataset_file == "p3aformer_mot": 40 | return build_p3aformer_mot(image_set, args) 41 | if args.dataset_file == "p3aformer_mixed": 42 | return build_p3aformer_mot_mixed(image_set, args) 43 | if args.dataset_file == "crowdHuman": 44 | return build_crowdhuman(image_set, args) 45 | raise ValueError(f"dataset {args.dataset_file} not supported") 46 | -------------------------------------------------------------------------------- /datasets/coco_panoptic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | import json 13 | from pathlib import Path 14 | 15 | import numpy as np 16 | import torch 17 | from PIL import Image 18 | 19 | from panopticapi.utils import rgb2id 20 | from util.box_ops import masks_to_boxes 21 | 22 | from .coco import make_coco_transforms 23 | 24 | 25 | class CocoPanoptic: 26 | def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True): 27 | with open(ann_file, 'r') as f: 28 | self.coco = json.load(f) 29 | 30 | # sort 'images' field so that they are aligned with 'annotations' 31 | # i.e., in alphabetical order 32 | self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id']) 33 | # sanity check 34 | if "annotations" in self.coco: 35 | for img, ann in zip(self.coco['images'], self.coco['annotations']): 36 | assert img['file_name'][:-4] == ann['file_name'][:-4] 37 | 38 | self.img_folder = img_folder 39 | self.ann_folder = ann_folder 40 | self.ann_file = ann_file 41 | self.transforms = transforms 42 | self.return_masks = return_masks 43 | 44 | def __getitem__(self, idx): 45 | ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx] 46 | img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg') 47 | ann_path = Path(self.ann_folder) / ann_info['file_name'] 48 | 49 | img = Image.open(img_path).convert('RGB') 50 | w, h = img.size 51 | if "segments_info" in ann_info: 52 | masks = np.asarray(Image.open(ann_path), dtype=np.uint32) 53 | masks = rgb2id(masks) 54 | 55 | ids = np.array([ann['id'] for ann in ann_info['segments_info']]) 56 | masks = masks == ids[:, None, None] 57 | 58 | masks = torch.as_tensor(masks, dtype=torch.uint8) 59 | labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64) 60 | 61 | target = {} 62 | target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]]) 63 | if self.return_masks: 64 | target['masks'] = masks 65 | target['labels'] = labels 66 | 67 | target["boxes"] = masks_to_boxes(masks) 68 | 69 | target['size'] = torch.as_tensor([int(h), int(w)]) 70 | target['orig_size'] = torch.as_tensor([int(h), int(w)]) 71 | if "segments_info" in ann_info: 72 | for name in ['iscrowd', 'area']: 73 | target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']]) 74 | 75 | if self.transforms is not None: 76 | img, target = self.transforms(img, target) 77 | 78 | return img, target 79 | 80 | def __len__(self): 81 | return len(self.coco['images']) 82 | 83 | def get_height_and_width(self, idx): 84 | img_info = self.coco['images'][idx] 85 | height = img_info['height'] 86 | width = img_info['width'] 87 | return height, width 88 | 89 | 90 | def build(image_set, args): 91 | img_folder_root = Path(args.coco_path) 92 | ann_folder_root = Path(args.coco_panoptic_path) 93 | assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist' 94 | assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist' 95 | mode = 'panoptic' 96 | PATHS = { 97 | "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'), 98 | "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'), 99 | } 100 | 101 | img_folder, ann_file = PATHS[image_set] 102 | img_folder_path = img_folder_root / img_folder 103 | ann_folder = ann_folder_root / f'{mode}_{img_folder}' 104 | ann_file = ann_folder_root / ann_file 105 | 106 | dataset = 
CocoPanoptic(img_folder_path, ann_folder, ann_file, 107 | transforms=make_coco_transforms(image_set), return_masks=args.masks) 108 | 109 | return dataset 110 | -------------------------------------------------------------------------------- /datasets/d2_p3aformer_dataset/d2_mot15_val_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import json 6 | import os 7 | 8 | try: 9 | from datasets.d2_p3aformer_dataset.d2_generic_dataset_val import GenericDataset_val 10 | except: 11 | from datasets.d2_p3aformer_dataset.d2_generic_dataset_val import GenericDataset_val 12 | from detectron2.config import configurable 13 | 14 | 15 | class MOT15_val(GenericDataset_val): 16 | num_classes = 1 17 | default_resolution = [640, 1088] 18 | max_objs = 300 19 | class_name = ["person"] 20 | cat_ids = {1: 1} 21 | 22 | @configurable 23 | def __init__(self, data_dir, split, input_w, input_h, output_w, output_h, private): 24 | assert split == "train", "We use MOT15 training split for validation." 25 | img_dir = os.path.join(data_dir, "images", "train") 26 | if split == "train": 27 | ann_path = os.path.join(data_dir, "annotations", "{}.json").format(split) 28 | elif split == "val": 29 | ann_path = os.path.join(data_dir, "annotations", "{}_last25.json").format( 30 | split 31 | ) 32 | else: # testset 33 | ann_path = os.path.join(data_dir, "annotations", "{}.json").format(split) 34 | print(f"==> initializing MOT15 {split} data from ann_path {ann_path}.") 35 | self.is_mot17 = False 36 | self.images = None 37 | super(MOT15_val, self).__init__( 38 | input_w=input_w, 39 | input_h=input_h, 40 | output_w=output_w, 41 | output_h=output_h, 42 | split=split, 43 | ann_path=ann_path, 44 | img_dir=img_dir, 45 | private=private, 46 | ) 47 | # load image list and coco 48 | self.num_samples = len(self.video_list) 49 | print("Loaded {} {} videos.".format(split, self.num_samples)) 50 | 51 | @classmethod 52 | def from_config(cls, cfg): 53 | input_h, input_w = ( 54 | cfg.MODEL.DENSETRACK.DEFAULT_RESOLUTION[0], 55 | cfg.MODEL.DENSETRACK.DEFAULT_RESOLUTION[1], 56 | ) 57 | output_h = input_h // cfg.MODEL.DENSETRACK.DOWN_RATIO 58 | output_w = input_w // cfg.MODEL.DENSETRACK.DOWN_RATIO 59 | ret = { 60 | "data_dir": cfg.INPUT.VAL_DATA_DIR, 61 | "split": "train", 62 | "input_w": input_w, 63 | "input_h": input_h, 64 | "output_w": output_w, 65 | "output_h": output_h, 66 | "private": cfg.TRACK.DENSETRACK.PRIVATE, 67 | } 68 | return ret 69 | 70 | def _to_float(self, x): 71 | return float("{:.2f}".format(x)) 72 | 73 | def _save_results(self, records, fpath): 74 | with open(fpath, "w") as fid: 75 | for record in records: 76 | line = json.dumps(record) + "\n" 77 | fid.write(line) 78 | return fpath 79 | 80 | def convert_eval_format(self, all_bboxes): 81 | detections = [] 82 | person_id = 1 83 | for image_id in all_bboxes: 84 | if type(all_bboxes[image_id]) != type({}): 85 | # newest format 86 | dtboxes = [] 87 | for j in range(len(all_bboxes[image_id])): 88 | item = all_bboxes[image_id][j] 89 | if item["class"] != person_id: 90 | continue 91 | bbox = item["bbox"] 92 | bbox[2] -= bbox[0] 93 | bbox[3] -= bbox[1] 94 | bbox_out = list(map(self._to_float, bbox[0:4])) 95 | detection = { 96 | "tag": 1, 97 | "box": bbox_out, 98 | "score": float("{:.2f}".format(item["score"])), 99 | } 100 | dtboxes.append(detection) 101 | img_info = self.coco.loadImgs(ids=[image_id])[0] 102 | file_name = 
img_info["file_name"] 103 | detections.append({"ID": file_name[:-4], "dtboxes": dtboxes}) 104 | return detections 105 | 106 | def __len__(self): 107 | return self.num_samples 108 | 109 | def save_results(self, results, save_dir): 110 | self._save_results( 111 | self.convert_eval_format(results), 112 | "{}/results_crowdhuman.odgt".format(save_dir), 113 | ) 114 | 115 | def run_eval(self, results, save_dir): 116 | self.save_results(results, save_dir) 117 | try: 118 | os.system( 119 | "python tools/crowdhuman_eval/demo.py " 120 | + "../data/crowdhuman/annotation_val.odgt " 121 | + "{}/results_crowdhuman.odgt".format(save_dir) 122 | ) 123 | except: 124 | print("Crowdhuman evaluation not setup!") 125 | -------------------------------------------------------------------------------- /datasets/data_path/gen_labels_15.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import os 3 | import numpy as np 4 | import cv2 5 | from tqdm import tqdm 6 | 7 | def mkdirs(d): 8 | if not osp.exists(d): 9 | os.makedirs(d) 10 | 11 | seq_root = '/data/workspace/datasets/mot/MOT15/images/train' 12 | label_root = '/data/workspace/datasets/mot/MOT15/labels_with_ids/train' 13 | mkdirs(label_root) 14 | seqs = ['ADL-Rundle-6', 'ETH-Bahnhof', 'KITTI-13', 'PETS09-S2L1', 'TUD-Stadtmitte', 'ADL-Rundle-8', 'KITTI-17', 15 | 'ETH-Pedcross2', 'ETH-Sunnyday', 'TUD-Campus', 'Venice-2'] 16 | 17 | tid_curr = 0 18 | tid_last = -1 19 | for seq in tqdm(seqs): 20 | 21 | # seq_info = open(osp.join(seq_root, seq, 'seqinfo.ini')).read() 22 | # seq_width = int(seq_info[seq_info.find('imWidth=') + 8:seq_info.find('\nimHeight')]) 23 | # seq_height = int(seq_info[seq_info.find('imHeight=') + 9:seq_info.find('\nimExt')]) 24 | 25 | all_imgs = os.listdir(osp.join(seq_root, seq, 'img1')) 26 | fm = cv2.imread(osp.join(seq_root, seq, 'img1', all_imgs[0])) 27 | seq_height, seq_width, c = fm.shape 28 | 29 | gt_txt = osp.join(seq_root, seq, 'gt', 'gt.txt') 30 | gt = np.loadtxt(gt_txt, dtype=np.float64, delimiter=',') 31 | idx = np.lexsort(gt.T[:2, :]) 32 | gt = gt[idx, :] 33 | 34 | seq_label_root = osp.join(label_root, seq, 'img1') 35 | mkdirs(seq_label_root) 36 | 37 | for fid, tid, x, y, w, h, mark, _, _, _ in gt: 38 | if mark == 0: 39 | continue 40 | fid = int(fid) 41 | tid = int(tid) 42 | if not tid == tid_last: 43 | tid_curr += 1 44 | tid_last = tid 45 | x += w / 2 46 | y += h / 2 47 | label_fpath = osp.join(seq_label_root, '{:06d}.txt'.format(fid)) 48 | label_str = '0 {:d} {:.6f} {:.6f} {:.6f} {:.6f}\n'.format( 49 | tid_curr, x / seq_width, y / seq_height, w / seq_width, h / seq_height) 50 | with open(label_fpath, 'a') as f: 51 | f.write(label_str) -------------------------------------------------------------------------------- /datasets/data_path/gen_labels_16.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import os 3 | import numpy as np 4 | def mkdirs(d): 5 | if not osp.exists(d): 6 | os.makedirs(d) 7 | 8 | seq_root = '/data/workspace/datasets/mot/MOT16/images/train' 9 | label_root = '/data/workspace/datasets/mot/MOT16/labels_with_ids/train' 10 | mkdirs(label_root) 11 | seqs = [s for s in os.listdir(seq_root)] 12 | 13 | tid_curr = 0 14 | tid_last = -1 15 | for seq in seqs: 16 | seq_info = open(osp.join(seq_root, seq, 'seqinfo.ini')).read() 17 | seq_width = int(seq_info[seq_info.find('imWidth=') + 8:seq_info.find('\nimHeight')]) 18 | seq_height = int(seq_info[seq_info.find('imHeight=') + 9:seq_info.find('\nimExt')]) 
19 | 20 | gt_txt = osp.join(seq_root, seq, 'gt', 'gt.txt') 21 | gt = np.loadtxt(gt_txt, dtype=np.float64, delimiter=',') 22 | idx = np.lexsort(gt.T[:2, :]) 23 | gt = gt[idx, :] 24 | 25 | seq_label_root = osp.join(label_root, seq, 'img1') 26 | mkdirs(seq_label_root) 27 | 28 | for fid, tid, x, y, w, h, mark, _, _ in gt: 29 | if mark == 0: 30 | continue 31 | fid = int(fid) 32 | tid = int(tid) 33 | if not tid == tid_last: 34 | tid_curr += 1 35 | tid_last = tid 36 | x += w / 2 37 | y += h / 2 38 | label_fpath = osp.join(seq_label_root, '{:06d}.txt'.format(fid)) 39 | label_str = '0 {:d} {:.6f} {:.6f} {:.6f} {:.6f}\n'.format( 40 | tid_curr, x / seq_width, y / seq_height, w / seq_width, h / seq_height) 41 | with open(label_fpath, 'a') as f: 42 | f.write(label_str) -------------------------------------------------------------------------------- /datasets/data_path/prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import partial 3 | from typing import List 4 | 5 | 6 | def solve_MOT_train(root, year): 7 | assert year in [15, 16, 17] 8 | dataset_path = 'MOT{}/images/train'.format(year) 9 | data_root = os.path.join(root, dataset_path) 10 | if year == 17: 11 | video_paths = [] 12 | for video_name in os.listdir(data_root): 13 | if 'SDP' in video_name: 14 | video_paths.append(video_name) 15 | else: 16 | video_paths = os.listdir(data_root) 17 | 18 | frames = [] 19 | for video_name in video_paths: 20 | files = os.listdir(os.path.join(data_root, video_name, 'img1')) 21 | files.sort() 22 | for i in range(1, len(files) + 1): 23 | frames.append(os.path.join(dataset_path, video_name, 'img1', '%06d.jpg' % i)) 24 | return frames 25 | 26 | 27 | def solve_CUHK(root): 28 | dataset_path = 'ethz/CUHK-SYSU' 29 | data_root = os.path.join(root, dataset_path) 30 | file_names = os.listdir(os.path.join(data_root, 'images')) 31 | file_names.sort() 32 | 33 | frames = [] 34 | for i in range(len(file_names)): 35 | if os.path.exists(os.path.join(root, 'ethz/CUHK-SYSU/labels_with_ids', f's{i + 1}.txt')): 36 | if os.path.exists(os.path.join(root, 'ethz/CUHK-SYSU/images', f's{i + 1}.jpg')): 37 | frames.append(os.path.join('ethz/CUHK-SYSU/images', f's{i + 1}.jpg')) 38 | return frames 39 | 40 | def solve_ETHZ(root): 41 | dataset_path = 'ethz/ETHZ' 42 | data_root = os.path.join(root, dataset_path) 43 | video_paths = [] 44 | for name in os.listdir(data_root): 45 | if name not in ['eth01', 'eth03']: 46 | video_paths.append(name) 47 | 48 | frames = [] 49 | for video_path in video_paths: 50 | files = os.listdir(os.path.join(data_root, video_path, 'images')) 51 | files.sort() 52 | for img_name in files: 53 | if os.path.exists(os.path.join(data_root, video_path, 'labels_with_ids', img_name.replace('.png', '.txt'))): 54 | if os.path.exists(os.path.join(data_root, video_path, 'images', img_name)): 55 | frames.append(os.path.join('ethz/ETHZ', video_path, 'images', img_name)) 56 | return frames 57 | 58 | 59 | def solve_PRW(root): 60 | dataset_path = 'ethz/PRW' 61 | data_root = os.path.join(root, dataset_path) 62 | frame_paths = os.listdir(os.path.join(data_root, 'images')) 63 | frame_paths.sort() 64 | frames = [] 65 | for i in range(len(frame_paths)): 66 | if os.path.exists(os.path.join(data_root, 'labels_with_ids', frame_paths[i].split('.')[0] + '.txt')): 67 | if os.path.exists(os.path.join(data_root, 'images', frame_paths[i])): 68 | frames.append(os.path.join(dataset_path, 'images', frame_paths[i])) 69 | return frames 70 | 71 | 72 | dataset_catalog = { 73 | 'MOT15': 
partial(solve_MOT_train, year=15), 74 | 'MOT16': partial(solve_MOT_train, year=16), 75 | 'MOT17': partial(solve_MOT_train, year=17), 76 | 'CUHK-SYSU': solve_CUHK, 77 | 'ETHZ': solve_ETHZ, 78 | 'PRW': solve_PRW, 79 | } 80 | 81 | 82 | def solve(dataset_list: List[str], root, save_path): 83 | all_frames = [] 84 | for dataset_name in dataset_list: 85 | dataset_frames = dataset_catalog[dataset_name](root) 86 | print("solve {} frames from dataset:{} ".format(len(dataset_frames), dataset_name)) 87 | all_frames.extend(dataset_frames) 88 | print("totally {} frames are solved.".format(len(all_frames))) 89 | with open(save_path, 'w') as f: 90 | for u in all_frames: 91 | line = '{}'.format(u) + '\n' 92 | f.writelines(line) 93 | 94 | root = '/data/workspace/datasets/mot' 95 | save_path = '/data/workspace/detr-mot/datasets/data_path/mot17.train' # for fangao 96 | dataset_list = ['MOT17', ] 97 | 98 | solve(dataset_list, root, save_path) 99 | -------------------------------------------------------------------------------- /datasets/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | import torch 13 | from functools import partial 14 | from models.structures import Instances 15 | 16 | def to_cuda(samples, targets, device): 17 | samples = samples.to(device, non_blocking=True) 18 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 19 | return samples, targets 20 | 21 | 22 | def tensor_to_cuda(tensor: torch.Tensor, device): 23 | return tensor.to(device) 24 | 25 | 26 | def is_tensor_or_instances(data): 27 | return isinstance(data, torch.Tensor) or isinstance(data, Instances) 28 | 29 | 30 | def data_apply(data, check_func, apply_func): 31 | if isinstance(data, dict): 32 | for k in data.keys(): 33 | if check_func(data[k]): 34 | data[k] = apply_func(data[k]) 35 | elif isinstance(data[k], dict) or isinstance(data[k], list): 36 | data_apply(data[k], check_func, apply_func) 37 | else: 38 | raise ValueError() 39 | elif isinstance(data, list): 40 | for i in range(len(data)): 41 | if check_func(data[i]): 42 | data[i] = apply_func(data[i]) 43 | elif isinstance(data[i], dict) or isinstance(data[i], list): 44 | data_apply(data[i], check_func, apply_func) 45 | else: 46 | raise ValueError("invalid type {}".format(type(data[i]))) 47 | else: 48 | raise ValueError("invalid type {}".format(type(data))) 49 | return data 50 | 51 | 52 | def data_dict_to_cuda(data_dict, device): 53 | return data_apply(data_dict, is_tensor_or_instances, partial(tensor_to_cuda, device=device)) 54 | 55 | 56 | class data_prefetcher(): 57 | def __init__(self, loader, device, prefetch=True): 58 | self.loader = iter(loader) 59 | self.prefetch = prefetch 60 | self.device = device 61 | if prefetch: 62 | self.stream = torch.cuda.Stream() 63 | self.preload() 64 | 65 | def preload(self): 66 | try: 67 | 
self.next_samples, self.next_targets = next(self.loader) 68 | except StopIteration: 69 | self.next_samples = None 70 | self.next_targets = None 71 | return 72 | # if record_stream() doesn't work, another option is to make sure device inputs are created 73 | # on the main stream. 74 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 75 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 76 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 77 | # at the time we start copying to next_*: 78 | # self.stream.wait_stream(torch.cuda.current_stream()) 79 | with torch.cuda.stream(self.stream): 80 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 81 | # more code for the alternative if record_stream() doesn't work: 82 | # copy_ will record the use of the pinned source tensor in this side stream. 83 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 84 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 85 | # self.next_input = self.next_input_gpu 86 | # self.next_target = self.next_target_gpu 87 | 88 | # With Amp, it isn't necessary to manually convert data to half. 89 | # if args.fp16: 90 | # self.next_input = self.next_input.half() 91 | # else: 92 | 93 | def next(self): 94 | if self.prefetch: 95 | torch.cuda.current_stream().wait_stream(self.stream) 96 | samples = self.next_samples 97 | targets = self.next_targets 98 | if samples is not None: 99 | samples.record_stream(torch.cuda.current_stream()) 100 | if targets is not None: 101 | for t in targets: 102 | for k, v in t.items(): 103 | v.record_stream(torch.cuda.current_stream()) 104 | self.preload() 105 | else: 106 | try: 107 | samples, targets = next(self.loader) 108 | samples, targets = to_cuda(samples, targets, self.device) 109 | except StopIteration: 110 | print("catch_stop_iter") 111 | samples = None 112 | targets = None 113 | 114 | return samples, targets 115 | -------------------------------------------------------------------------------- /datasets/p3aformer_dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/datasets/p3aformer_dataset/__init__.py -------------------------------------------------------------------------------- /datasets/p3aformer_dataset/coco.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from pycocotools.cocoeval import COCOeval 6 | import json 7 | import os 8 | 9 | try: 10 | from .generic_dataset_train import GenericDataset 11 | except: 12 | from generic_dataset_train import GenericDataset 13 | 14 | 15 | class COCO(GenericDataset): 16 | default_resolution = [640, 1088] 17 | num_categories = 1 18 | class_name = ["person"] 19 | _valid_ids = [1] 20 | cat_ids = {v: i + 1 for i, v in enumerate(_valid_ids)} 21 | num_joints = 17 22 | flip_idx = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]] 23 | edges = [ 24 | [0, 1], 25 | [0, 2], 26 | [1, 3], 27 | [2, 4], 28 | [4, 6], 29 | [3, 5], 30 | [5, 6], 31 | [5, 7], 32 | [7, 9], 33 | [6, 8], 34 | [8, 10], 35 | [6, 12], 36 | [5, 11], 37 | [11, 12], 38 | [12, 14], 39 | [14, 16], 40 | [11, 13], 41 | [13, 15], 42 | ] 43 | max_objs = 
300 44 | 45 | def __init__(self, opt, split): 46 | # load annotations 47 | data_dir = os.path.join(opt.data_dir) 48 | img_dir = os.path.join(data_dir, "{}2017".format(split)) 49 | ann_path = os.path.join( 50 | data_dir, "annotations", "instances_{}2017_person.json" 51 | ).format(split) 52 | 53 | self.images = None 54 | # load image list and coco 55 | super(COCO, self).__init__(opt, split, ann_path, img_dir) 56 | self.sf = 0.3 57 | self.num_samples = len(self.images) 58 | 59 | print("Loaded {} {} samples".format(split, self.num_samples)) 60 | 61 | def _to_float(self, x): 62 | return float("{:.2f}".format(x)) 63 | 64 | def convert_eval_format(self, all_bboxes): 65 | detections = [] 66 | for image_id in all_bboxes: 67 | if type(all_bboxes[image_id]) != type({}): 68 | # newest format 69 | for j in range(len(all_bboxes[image_id])): 70 | item = all_bboxes[image_id][j] 71 | cat_id = item["class"] - 1 72 | category_id = self._valid_ids[cat_id] 73 | bbox = item["bbox"] 74 | bbox[2] -= bbox[0] 75 | bbox[3] -= bbox[1] 76 | bbox_out = list(map(self._to_float, bbox[0:4])) 77 | detection = { 78 | "image_id": int(image_id), 79 | "category_id": int(category_id), 80 | "bbox": bbox_out, 81 | "score": float("{:.2f}".format(item["score"])), 82 | } 83 | detections.append(detection) 84 | return detections 85 | 86 | def __len__(self): 87 | return self.num_samples 88 | 89 | def save_results(self, results, save_dir): 90 | json.dump( 91 | self.convert_eval_format(results), 92 | open("{}/results_coco.json".format(save_dir), "w"), 93 | ) 94 | 95 | def run_eval(self, results, save_dir): 96 | self.save_results(results, save_dir) 97 | coco_dets = self.coco.loadRes("{}/results_coco.json".format(save_dir)) 98 | coco_eval = COCOeval(self.coco, coco_dets, "bbox") 99 | coco_eval.evaluate() 100 | coco_eval.accumulate() 101 | coco_eval.summarize() 102 | 103 | 104 | def build(image_set, args): 105 | d = COCO(args, image_set) 106 | # input output shapes 107 | args.input_h, args.input_w = d.default_resolution[0], d.default_resolution[1] 108 | args.output_h = args.input_h // args.down_ratio 109 | args.output_w = args.input_w // args.down_ratio 110 | args.input_res = max(args.input_h, args.input_w) 111 | args.output_res = max(args.output_h, args.output_w) 112 | # threshold 113 | args.out_thresh = max(args.track_thresh, args.out_thresh) 114 | args.pre_thresh = max(args.track_thresh, args.pre_thresh) 115 | args.new_thresh = max(args.track_thresh, args.new_thresh) 116 | return d 117 | -------------------------------------------------------------------------------- /datasets/p3aformer_dataset/crowdhuman.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import json 6 | import os 7 | 8 | try: 9 | from .generic_dataset_train import GenericDataset 10 | except: 11 | from generic_dataset_train import GenericDataset 12 | 13 | 14 | class CrowdHuman(GenericDataset): 15 | num_classes = 1 16 | num_joints = 17 17 | default_resolution = [640, 1088] 18 | max_objs = 300 19 | class_name = ["person"] 20 | cat_ids = {1: 1} 21 | 22 | def __init__(self, opt, split): 23 | super(CrowdHuman, self).__init__() 24 | data_dir = opt.data_dir 25 | img_dir = os.path.join(data_dir, "Images") 26 | ann_path = os.path.join(data_dir, "annotations", "{}.json").format(split) 27 | 28 | print("==> initializing CrowdHuman {} data.".format(split)) 29 | 30 | self.images = None 31 | # load image list and coco 32 | 
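# Illustrative on-disk layout (an assumption inferred from the img_dir and
# ann_path built above, not a documented requirement):
#
#     <opt.data_dir>/
#         Images/               # CrowdHuman frames
#         annotations/
#             train.json        # COCO-style annotations selected by `split`
#             val.json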
super(CrowdHuman, self).__init__(opt, split, ann_path, img_dir) 33 | self.sf = 0.3 34 | 35 | self.num_samples = len(self.images) 36 | 37 | print("Loaded {} {} samples".format(split, self.num_samples)) 38 | 39 | def _to_float(self, x): 40 | return float("{:.2f}".format(x)) 41 | 42 | def _save_results(self, records, fpath): 43 | with open(fpath, "w") as fid: 44 | for record in records: 45 | line = json.dumps(record) + "\n" 46 | fid.write(line) 47 | return fpath 48 | 49 | def convert_eval_format(self, all_bboxes): 50 | detections = [] 51 | person_id = 1 52 | for image_id in all_bboxes: 53 | if type(all_bboxes[image_id]) != type({}): 54 | # newest format 55 | dtboxes = [] 56 | for j in range(len(all_bboxes[image_id])): 57 | item = all_bboxes[image_id][j] 58 | if item["class"] != person_id: 59 | continue 60 | bbox = item["bbox"] 61 | bbox[2] -= bbox[0] 62 | bbox[3] -= bbox[1] 63 | bbox_out = list(map(self._to_float, bbox[0:4])) 64 | detection = { 65 | "tag": 1, 66 | "box": bbox_out, 67 | "score": float("{:.2f}".format(item["score"])), 68 | } 69 | dtboxes.append(detection) 70 | img_info = self.coco.loadImgs(ids=[image_id])[0] 71 | file_name = img_info["file_name"] 72 | detections.append({"ID": file_name[:-4], "dtboxes": dtboxes}) 73 | return detections 74 | 75 | def __len__(self): 76 | return self.num_samples 77 | 78 | def save_results(self, results, save_dir): 79 | self._save_results( 80 | self.convert_eval_format(results), 81 | "{}/results_crowdhuman.odgt".format(save_dir), 82 | ) 83 | 84 | def run_eval(self, results, save_dir): 85 | self.save_results(results, save_dir) 86 | try: 87 | os.system( 88 | "python tools/crowdhuman_eval/demo.py " 89 | + "../data/crowdhuman/annotation_val.odgt " 90 | + "{}/results_crowdhuman.odgt".format(save_dir) 91 | ) 92 | except: 93 | print("Crowdhuman evaluation not setup!") 94 | 95 | 96 | def build_crowdhuman(image_set, args): 97 | d = CrowdHuman(args, image_set) 98 | # input output shapes 99 | args.input_h, args.input_w = d.default_resolution[0], d.default_resolution[1] 100 | args.output_h = args.input_h // args.down_ratio 101 | args.output_w = args.input_w // args.down_ratio 102 | args.input_res = max(args.input_h, args.input_w) 103 | args.output_res = max(args.output_h, args.output_w) 104 | # threshold 105 | args.out_thresh = max(args.track_thresh, args.out_thresh) 106 | args.pre_thresh = max(args.track_thresh, args.pre_thresh) 107 | args.new_thresh = max(args.track_thresh, args.new_thresh) 108 | args.adaptive_clip = True 109 | return d 110 | -------------------------------------------------------------------------------- /datasets/p3aformer_dataset/mot17_train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import json 6 | import os 7 | try: 8 | from .generic_dataset_train import GenericDataset 9 | except: 10 | from generic_dataset_train import GenericDataset 11 | 12 | 13 | class MOT17(GenericDataset): 14 | num_classes = 1 15 | num_joints = 17 16 | default_resolution = [640, 1088] 17 | max_objs = 300 18 | class_name = ['person'] 19 | cat_ids = {1: 1} 20 | 21 | def __init__(self, opt, split): 22 | super(MOT17, self).__init__() 23 | data_dir = opt.data_dir 24 | if split == 'test': 25 | img_dir = os.path.join( 26 | data_dir, 'test') 27 | else: 28 | img_dir = os.path.join( 29 | data_dir, 'train') 30 | if opt.half_train: 31 | print("==> Using half of the MOT 17 data!") 32 | if split == 'train' and 
not opt.half_train: 33 | ann_path = os.path.join(data_dir, 'annotations_onlySDP', '{}.json').format(split) 34 | else: 35 | ann_path = os.path.join(data_dir, 'annotations_onlySDP', '{}_half.json').format(split) 36 | 37 | print('==> initializing MOT17 {} data.'.format(split)) 38 | 39 | self.images = None 40 | # load image list and coco 41 | super(MOT17, self).__init__(opt, split, ann_path, img_dir) 42 | 43 | self.num_samples = len(self.images) 44 | 45 | print('Loaded {} {} samples'.format(split, self.num_samples)) 46 | 47 | def _to_float(self, x): 48 | return float("{:.2f}".format(x)) 49 | 50 | def _save_results(self, records, fpath): 51 | with open(fpath,'w') as fid: 52 | for record in records: 53 | line = json.dumps(record)+'\n' 54 | fid.write(line) 55 | return fpath 56 | 57 | def convert_eval_format(self, all_bboxes): 58 | detections = [] 59 | person_id = 1 60 | for image_id in all_bboxes: 61 | if type(all_bboxes[image_id]) != type({}): 62 | # newest format 63 | dtboxes = [] 64 | for j in range(len(all_bboxes[image_id])): 65 | item = all_bboxes[image_id][j] 66 | if item['class'] != person_id: 67 | continue 68 | bbox = item['bbox'] 69 | bbox[2] -= bbox[0] 70 | bbox[3] -= bbox[1] 71 | bbox_out = list(map(self._to_float, bbox[0:4])) 72 | detection = { 73 | "tag": 1, 74 | "box": bbox_out, 75 | "score": float("{:.2f}".format(item['score'])) 76 | } 77 | dtboxes.append(detection) 78 | img_info = self.coco.loadImgs(ids=[image_id])[0] 79 | file_name = img_info['file_name'] 80 | detections.append({'ID': file_name[:-4], 'dtboxes': dtboxes}) 81 | return detections 82 | 83 | def __len__(self): 84 | return self.num_samples 85 | 86 | def save_results(self, results, save_dir): 87 | self._save_results(self.convert_eval_format(results), 88 | '{}/results_crowdhuman.odgt'.format(save_dir)) 89 | def run_eval(self, results, save_dir): 90 | self.save_results(results, save_dir) 91 | try: 92 | os.system('python tools/crowdhuman_eval/demo.py ' + \ 93 | '../data/crowdhuman/annotation_val.odgt ' + \ 94 | '{}/results_crowdhuman.odgt'.format(save_dir)) 95 | except: 96 | print('Crowdhuman evaluation not setup!') 97 | 98 | 99 | def build(image_set, args): 100 | d = MOT17(args, image_set) 101 | # input output shapes 102 | args.input_h, args.input_w = d.default_resolution[0], d.default_resolution[1] 103 | args.output_h = args.input_h // args.down_ratio 104 | args.output_w = args.input_w // args.down_ratio 105 | args.input_res = max(args.input_h, args.input_w) 106 | args.output_res = max(args.output_h, args.output_w) 107 | # threshold 108 | args.out_thresh = max(args.track_thresh, args.out_thresh) 109 | args.pre_thresh = max(args.track_thresh, args.pre_thresh) 110 | args.new_thresh = max(args.track_thresh, args.new_thresh) 111 | args.adaptive_clip = True 112 | return d 113 | -------------------------------------------------------------------------------- /datasets/p3aformer_dataset/mot17_val_save_mem.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import json 6 | import os 7 | 8 | try: 9 | from datasets.p3aformer_dataset.generic_dataset_test_save_mem import ( 10 | GenericDataset_val, 11 | ) 12 | except: 13 | from datasets.p3aformer_dataset.generic_dataset_test_save_mem import ( 14 | GenericDataset_val, 15 | ) 16 | 17 | 18 | class MOT17_val(GenericDataset_val): 19 | num_classes = 1 20 | default_resolution = [640, 1088] 21 | max_objs = 300 22 | class_name = 
["person"] 23 | cat_ids = {1: 1} 24 | 25 | def __init__(self, opt, split): 26 | super(MOT17_val, self).__init__() 27 | data_dir = opt.data_dir 28 | if split == "test": 29 | img_dir = os.path.join(data_dir, "test") 30 | else: 31 | img_dir = os.path.join(data_dir, "train") 32 | 33 | if split == "train": 34 | ann_path = os.path.join(data_dir, "annotations", "{}.json").format(split) 35 | elif split == "val": 36 | ann_path = os.path.join(data_dir, "annotations", "{}_last25.json").format( 37 | split 38 | ) 39 | else: # testset 40 | ann_path = os.path.join(data_dir, "annotations", "{}.json").format(split) 41 | 42 | print("ann_path: ", ann_path) 43 | 44 | print("==> initializing MOT17 {} data.".format(split)) 45 | 46 | self.images = None 47 | # load image list and coco 48 | super(MOT17_val, self).__init__(opt, split, ann_path, img_dir) 49 | 50 | self.num_samples = len(self.video_list) 51 | self.is_mot17 = True 52 | print("Loaded {} {} samples".format(split, self.num_samples)) 53 | 54 | def _to_float(self, x): 55 | return float("{:.2f}".format(x)) 56 | 57 | def _save_results(self, records, fpath): 58 | with open(fpath, "w") as fid: 59 | for record in records: 60 | line = json.dumps(record) + "\n" 61 | fid.write(line) 62 | return fpath 63 | 64 | def convert_eval_format(self, all_bboxes): 65 | detections = [] 66 | person_id = 1 67 | for image_id in all_bboxes: 68 | if type(all_bboxes[image_id]) != type({}): 69 | # newest format 70 | dtboxes = [] 71 | for j in range(len(all_bboxes[image_id])): 72 | item = all_bboxes[image_id][j] 73 | if item["class"] != person_id: 74 | continue 75 | bbox = item["bbox"] 76 | bbox[2] -= bbox[0] 77 | bbox[3] -= bbox[1] 78 | bbox_out = list(map(self._to_float, bbox[0:4])) 79 | detection = { 80 | "tag": 1, 81 | "box": bbox_out, 82 | "score": float("{:.2f}".format(item["score"])), 83 | } 84 | dtboxes.append(detection) 85 | img_info = self.coco.loadImgs(ids=[image_id])[0] 86 | file_name = img_info["file_name"] 87 | detections.append({"ID": file_name[:-4], "dtboxes": dtboxes}) 88 | return detections 89 | 90 | def __len__(self): 91 | return self.num_samples 92 | 93 | def save_results(self, results, save_dir): 94 | self._save_results( 95 | self.convert_eval_format(results), 96 | "{}/results_crowdhuman.odgt".format(save_dir), 97 | ) 98 | 99 | def run_eval(self, results, save_dir): 100 | self.save_results(results, save_dir) 101 | try: 102 | os.system( 103 | "python tools/crowdhuman_eval/demo.py " 104 | + "../data/crowdhuman/annotation_val.odgt " 105 | + "{}/results_crowdhuman.odgt".format(save_dir) 106 | ) 107 | except: 108 | print("Crowdhuman evaluation not setup!") 109 | -------------------------------------------------------------------------------- /datasets/p3aformer_dataset/mot20_val_save_mem.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import json 6 | import os 7 | try: 8 | from .generic_dataset_test_save_mem import GenericDataset_val 9 | except: 10 | from generic_dataset_test_save_mem import GenericDataset_val 11 | 12 | 13 | class MOT20_val(GenericDataset_val): 14 | num_classes = 1 15 | default_resolution = [640, 1088] 16 | max_objs = 300 17 | class_name = ['person'] 18 | cat_ids = {1: 1} 19 | 20 | def __init__(self, opt, split): 21 | super(MOT20_val, self).__init__() 22 | data_dir = opt.data_dir 23 | if split == 'test': 24 | img_dir = os.path.join( 25 | data_dir, 'test') 26 | else: 27 | img_dir = 
os.path.join( 28 | data_dir, 'train') 29 | 30 | if split == 'train': 31 | ann_path = os.path.join(data_dir, 'annotations', 32 | '{}.json').format(split) 33 | elif split == 'val': 34 | ann_path = os.path.join(data_dir, 'annotations', 35 | '{}_last25.json').format(split) 36 | else: #testset 37 | ann_path = os.path.join(data_dir, 'annotations', 38 | '{}.json').format(split) 39 | 40 | print("ann_path: ", ann_path) 41 | 42 | print('==> initializing MOT20 {} data.'.format(split)) 43 | 44 | self.images = None 45 | # load image list and coco 46 | super(MOT20_val, self).__init__(opt, split, ann_path, img_dir) 47 | 48 | self.num_samples = len(self.video_list) 49 | 50 | print('Loaded {} {} samples'.format(split, self.num_samples)) 51 | 52 | def _to_float(self, x): 53 | return float("{:.2f}".format(x)) 54 | 55 | def _save_results(self, records, fpath): 56 | with open(fpath,'w') as fid: 57 | for record in records: 58 | line = json.dumps(record)+'\n' 59 | fid.write(line) 60 | return fpath 61 | 62 | def convert_eval_format(self, all_bboxes): 63 | detections = [] 64 | person_id = 1 65 | for image_id in all_bboxes: 66 | if type(all_bboxes[image_id]) != type({}): 67 | # newest format 68 | dtboxes = [] 69 | for j in range(len(all_bboxes[image_id])): 70 | item = all_bboxes[image_id][j] 71 | if item['class'] != person_id: 72 | continue 73 | bbox = item['bbox'] 74 | bbox[2] -= bbox[0] 75 | bbox[3] -= bbox[1] 76 | bbox_out = list(map(self._to_float, bbox[0:4])) 77 | detection = { 78 | "tag": 1, 79 | "box": bbox_out, 80 | "score": float("{:.2f}".format(item['score'])) 81 | } 82 | dtboxes.append(detection) 83 | img_info = self.coco.loadImgs(ids=[image_id])[0] 84 | file_name = img_info['file_name'] 85 | detections.append({'ID': file_name[:-4], 'dtboxes': dtboxes}) 86 | return detections 87 | 88 | def __len__(self): 89 | return self.num_samples 90 | 91 | def save_results(self, results, save_dir): 92 | self._save_results(self.convert_eval_format(results), 93 | '{}/results_crowdhuman.odgt'.format(save_dir)) 94 | def run_eval(self, results, save_dir): 95 | self.save_results(results, save_dir) 96 | try: 97 | os.system('python tools/crowdhuman_eval/demo.py ' + \ 98 | '../data/crowdhuman/annotation_val.odgt ' + \ 99 | '{}/results_crowdhuman.odgt'.format(save_dir)) 100 | except: 101 | print('Crowdhuman evaluation not setup!') 102 | -------------------------------------------------------------------------------- /datasets/panoptic_eval.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | import json 13 | import os 14 | 15 | import util.misc as utils 16 | 17 | try: 18 | from panopticapi.evaluation import pq_compute 19 | except ImportError: 20 | pass 21 | 22 | 23 | class PanopticEvaluator(object): 24 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): 25 | self.gt_json = ann_file 26 | self.gt_folder = ann_folder 27 | if utils.is_main_process(): 28 | if not os.path.exists(output_dir): 29 | os.mkdir(output_dir) 30 | self.output_dir = output_dir 31 | self.predictions = [] 32 | 33 | def update(self, predictions): 34 | for p in predictions: 35 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: 36 | f.write(p.pop("png_string")) 37 | 38 | self.predictions += predictions 39 | 40 | def synchronize_between_processes(self): 41 | all_predictions = utils.all_gather(self.predictions) 42 | merged_predictions = [] 43 | for p in all_predictions: 44 | merged_predictions += p 45 | self.predictions = merged_predictions 46 | 47 | def summarize(self): 48 | if utils.is_main_process(): 49 | json_data = {"annotations": self.predictions} 50 | predictions_json = os.path.join(self.output_dir, "predictions.json") 51 | with open(predictions_json, "w") as f: 52 | f.write(json.dumps(json_data)) 53 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) 54 | return None 55 | -------------------------------------------------------------------------------- /datasets/torchvision_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .coco import CocoDetection 13 | -------------------------------------------------------------------------------- /datasets/torchvision_datasets/coco.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Copy-Paste from torchvision, but add utility of caching images on memory 14 | """ 15 | from torchvision.datasets.vision import VisionDataset 16 | from PIL import Image 17 | import os 18 | import os.path 19 | import tqdm 20 | from io import BytesIO 21 | 22 | 23 | class CocoDetection(VisionDataset): 24 | """`MS Coco Detection `_ Dataset. 25 | Args: 26 | root (string): Root directory where images are downloaded to. 27 | annFile (string): Path to json annotation file. 28 | transform (callable, optional): A function/transform that takes in an PIL image 29 | and returns a transformed version. E.g, ``transforms.ToTensor`` 30 | target_transform (callable, optional): A function/transform that takes in the 31 | target and transforms it. 32 | transforms (callable, optional): A function/transform that takes input sample and its target as entry 33 | and returns a transformed version. 34 | """ 35 | 36 | def __init__(self, root, annFile, transform=None, target_transform=None, transforms=None, 37 | cache_mode=False, local_rank=0, local_size=1): 38 | super(CocoDetection, self).__init__(root, transforms, transform, target_transform) 39 | from pycocotools.coco import COCO 40 | self.coco = COCO(annFile) 41 | self.ids = list(sorted(self.coco.imgs.keys())) 42 | self.cache_mode = cache_mode 43 | self.local_rank = local_rank 44 | self.local_size = local_size 45 | if cache_mode: 46 | self.cache = {} 47 | self.cache_images() 48 | 49 | def cache_images(self): 50 | self.cache = {} 51 | for index, img_id in zip(tqdm.trange(len(self.ids)), self.ids): 52 | if index % self.local_size != self.local_rank: 53 | continue 54 | path = self.coco.loadImgs(img_id)[0]['file_name'] 55 | with open(os.path.join(self.root, path), 'rb') as f: 56 | self.cache[path] = f.read() 57 | 58 | def get_image(self, path): 59 | if self.cache_mode: 60 | if path not in self.cache.keys(): 61 | with open(os.path.join(self.root, path), 'rb') as f: 62 | self.cache[path] = f.read() 63 | return Image.open(BytesIO(self.cache[path])).convert('RGB') 64 | return Image.open(os.path.join(self.root, path)).convert('RGB') 65 | 66 | def __getitem__(self, index): 67 | """ 68 | Args: 69 | index (int): Index 70 | Returns: 71 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. 
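            Example (illustrative): for a detection-style annotation file,
            ``target`` is typically a list of COCO annotation dicts such as
            ``[{"image_id": 42, "category_id": 1, "bbox": [x, y, w, h],
            "area": ..., "iscrowd": 0, "id": ...}, ...]``, where ``bbox`` uses
            the COCO ``[x, y, width, height]`` pixel convention.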
72 | """ 73 | coco = self.coco 74 | img_id = self.ids[index] 75 | ann_ids = coco.getAnnIds(imgIds=img_id) 76 | target = coco.loadAnns(ann_ids) 77 | 78 | path = coco.loadImgs(img_id)[0]['file_name'] 79 | 80 | img = self.get_image(path) 81 | if self.transforms is not None: 82 | img, target = self.transforms(img, target) 83 | 84 | return img, target 85 | 86 | def __len__(self): 87 | return len(self.ids) 88 | -------------------------------------------------------------------------------- /exps: -------------------------------------------------------------------------------- 1 | /data/P3AFormer/exps -------------------------------------------------------------------------------- /figs/P3AFormerModel_v12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/figs/P3AFormerModel_v12.png -------------------------------------------------------------------------------- /figs/model_mind_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/figs/model_mind_flow.png -------------------------------------------------------------------------------- /figs/pixelwise_association_v8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/figs/pixelwise_association_v8.png -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | from .deformable_detr import build as build_deformable_detr 11 | from .motr import build as build_motr 12 | from .p3aformer.p3aformer import build as build_p3aformer 13 | 14 | 15 | def build_model(args): 16 | arch_catalog = { 17 | "deformable_detr": build_deformable_detr, 18 | "motr": build_motr, 19 | "p3aformer": build_p3aformer, 20 | } 21 | assert args.meta_arch in arch_catalog, "invalid arch: {}".format(args.meta_arch) 22 | build_func = arch_catalog[args.meta_arch] 23 | return build_func(args) 24 | -------------------------------------------------------------------------------- /models/d2_p3aformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .d2_p3aformer_model import D2P3AFormer 2 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
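# Shape reference (illustrative; these are the shapes exercised by the ops
# test script later in this folder, not an exhaustive specification):
#   value:                   (N, S, M, D)         flattened multi-scale features
#   value_spatial_shapes:    (L, 2)               (H_l, W_l) of each feature level
#   value_level_start_index: (L,)                 offset of each level within S
#   sampling_locations:      (N, Lq, M, L, P, 2)  normalized sampling points
#   attention_weights:       (N, Lq, M, L, P)     normalized over the last two dims
# with N batch size, S total spatial size, M attention heads, D channels per
# head, L feature levels, Lq queries, and P sampling points per level.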
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 
SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 
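    # Illustrative build commands (paths are an assumption based on this repo's
    # layout; make.sh in this folder runs the same setup.py invocation):
    #
    #     cd models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops
    #     FORCE_CUDA=1 python setup.py build install   # force the CUDA extension
    #     python test.py                               # optional sanity check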
40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
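 * Python-side usage sketch (an assumption based on setup.py and
 * ms_deform_attn_func.py in this repo, which build and import this extension):
 *
 *     import MultiScaleDeformableAttention as MSDA
 *     out = MSDA.ms_deform_attn_forward(
 *         value, spatial_shapes, level_start_index,
 *         sampling_loc, attn_weight, im2col_step)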
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, 
shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* {gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /models/d2_p3aformer/mask2former_modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
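    Shape note (illustrative): given features x of shape (B, C, H, W) and an
    optional boolean mask of shape (B, H, W), forward() returns a positional
    encoding of shape (B, 2 * num_pos_feats, H, W).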
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /models/d2_p3aformer/transcenter_dla.py: -------------------------------------------------------------------------------- 1 | ## TransCenter has code derived from 2 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 3 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 4 | ## (3) 2020 Facebook. (Apache License Version 2.0: https://github.com/facebookresearch/detr/) 5 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 6 | ## 7 | ## TransCenter uses packages from 8 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 9 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 10 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 11 | ## (4) 2018 Tak-Wai Hui. 
(Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 12 | import torch.nn as nn 13 | from dcn_v2 import DCN 14 | import math 15 | import numpy as np 16 | import torch.nn.functional as F 17 | 18 | BN_MOMENTUM = 0.1 19 | 20 | 21 | class DeformConv(nn.Module): 22 | def __init__(self, chi, cho): 23 | super(DeformConv, self).__init__() 24 | self.actf = nn.Sequential( 25 | nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), 26 | nn.ReLU(inplace=True) 27 | ) 28 | self.conv = DCN(chi, cho, kernel_size=(3,3), stride=1, padding=1, dilation=1, deformable_groups=1) 29 | 30 | def forward(self, x): 31 | x = self.conv(x) 32 | x = self.actf(x) 33 | return x 34 | 35 | DLA_NODE = { 36 | 'dcn': (DeformConv, DeformConv) 37 | } 38 | 39 | 40 | def fill_fc_weights(layers): 41 | for m in layers.modules(): 42 | if isinstance(m, nn.Conv2d): 43 | if m.bias is not None: 44 | nn.init.constant_(m.bias, 0) 45 | 46 | 47 | def fill_up_weights(up): 48 | w = up.weight.data 49 | f = math.ceil(w.size(2) / 2) 50 | c = (2 * f - 1 - f % 2) / (2. * f) 51 | for i in range(w.size(2)): 52 | for j in range(w.size(3)): 53 | w[0, 0, i, j] = \ 54 | (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) 55 | for c in range(1, w.size(0)): 56 | w[c, 0, :, :] = w[0, 0, :, :] 57 | 58 | 59 | class IDAUpV3(nn.Module): 60 | # bilinear upsampling version of IDA 61 | def __init__(self, o, channels, up_f, node_type=(DeformConv, DeformConv)): 62 | super(IDAUpV3, self).__init__() 63 | self.up = nn.UpsamplingBilinear2d(scale_factor=2) # no params 64 | 65 | for i in range(0, len(channels)): 66 | c = channels[i] 67 | if i == 0: 68 | node = node_type[1](c, o) 69 | else: 70 | node = node_type[1](c, c) 71 | setattr(self, 'node_' + str(i), node) 72 | 73 | def forward(self, layers, startp, endp): 74 | for i in range(endp-1, startp, -1): 75 | upsample = self.up 76 | layers[i] = upsample(layers[i]) # ch 256-> 256 77 | node = getattr(self, 'node_' + str(i)) 78 | layers[i-1] = node(layers[i] + layers[i - 1]) 79 | layers[startp] = self.up(layers[startp]) # 256=>256 80 | node = getattr(self, 'node_' + str(startp)) 81 | layers[startp] = node(layers[startp]) 82 | return [layers[startp]] 83 | 84 | 85 | class Interpolate(nn.Module): 86 | def __init__(self, scale, mode): 87 | super(Interpolate, self).__init__() 88 | self.scale = scale 89 | self.mode = mode 90 | 91 | def forward(self, x): 92 | x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False) 93 | return x 94 | -------------------------------------------------------------------------------- /models/d2_p3aformer/transcenter_position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Various positional encodings for the transformer. 
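A small aside on the `fill_up_weights` helper in transcenter_dla.py above: it fills a transposed-convolution kernel with the classic bilinear-interpolation weights, shared across channels, so the layer starts out as a plain 2x bilinear upsampler. A minimal sketch of that effect, assuming only PyTorch is installed; the function is copied here because importing the module itself requires the compiled DCNv2 extension (`from dcn_v2 import DCN`):

import math
import torch
import torch.nn as nn

def fill_up_weights(up):
    # same logic as in transcenter_dla.py: bilinear kernel written into channel 0, then copied
    w = up.weight.data
    f = math.ceil(w.size(2) / 2)
    c = (2 * f - 1 - f % 2) / (2.0 * f)
    for i in range(w.size(2)):
        for j in range(w.size(3)):
            w[0, 0, i, j] = (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
    for ch in range(1, w.size(0)):
        w[ch, 0, :, :] = w[0, 0, :, :]

up = nn.ConvTranspose2d(1, 1, kernel_size=4, stride=2, padding=1, bias=False)
fill_up_weights(up)
print(up.weight.data[0, 0])   # outer product of [0.25, 0.75, 0.75, 0.25]: a 4x4 bilinear kernel
x = torch.arange(16.0).reshape(1, 1, 4, 4)
print(up(x).shape)            # torch.Size([1, 1, 8, 8]): 2x bilinear upsampling
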
14 | """ 15 | import math 16 | import torch 17 | from torch import nn 18 | 19 | from util.misc import NestedTensor 20 | 21 | 22 | class PositionEmbeddingSine(nn.Module): 23 | """ 24 | This is a more standard version of the position embedding, very similar to the one 25 | used by the Attention is all you need paper, generalized to work on images. 26 | """ 27 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 28 | super().__init__() 29 | self.num_pos_feats = num_pos_feats 30 | self.temperature = temperature 31 | self.normalize = normalize 32 | if scale is not None and normalize is False: 33 | raise ValueError("normalize should be True if scale is passed") 34 | if scale is None: 35 | scale = 2 * math.pi 36 | self.scale = scale 37 | 38 | def forward(self, tensor_list: NestedTensor): 39 | x = tensor_list.tensors 40 | mask = tensor_list.mask 41 | assert mask is not None 42 | not_mask = ~mask 43 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 44 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 45 | if self.normalize: 46 | eps = 1e-6 47 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 48 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 49 | 50 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 51 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 52 | 53 | pos_x = x_embed[:, :, :, None] / dim_t 54 | pos_y = y_embed[:, :, :, None] / dim_t 55 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 56 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 57 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 58 | return pos 59 | 60 | 61 | class PositionEmbeddingLearned(nn.Module): 62 | """ 63 | Absolute pos embedding, learned. 
64 | """ 65 | def __init__(self, num_pos_feats=256): 66 | super().__init__() 67 | self.row_embed = nn.Embedding(50, num_pos_feats) 68 | self.col_embed = nn.Embedding(50, num_pos_feats) 69 | self.reset_parameters() 70 | 71 | def reset_parameters(self): 72 | nn.init.uniform_(self.row_embed.weight) 73 | nn.init.uniform_(self.col_embed.weight) 74 | 75 | def forward(self, tensor_list: NestedTensor): 76 | x = tensor_list.tensors 77 | h, w = x.shape[-2:] 78 | i = torch.arange(w, device=x.device) 79 | j = torch.arange(h, device=x.device) 80 | x_emb = self.col_embed(i) 81 | y_emb = self.row_embed(j) 82 | pos = torch.cat([ 83 | x_emb.unsqueeze(0).repeat(h, 1, 1), 84 | y_emb.unsqueeze(1).repeat(1, w, 1), 85 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 86 | return pos 87 | 88 | 89 | def build_position_encoding(cfg): 90 | N_steps = cfg.MODEL.DENSETRACK.HIDDEN_DIM // 2 91 | if cfg.MODEL.DENSETRACK.POSITION_EMBEDDING in ('v2', 'sine'): 92 | # TODO find a better way of exposing other arguments 93 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 94 | elif cfg.MODEL.DENSETRACK.POSITION_EMBEDDING in ('v3', 'learned'): 95 | position_embedding = PositionEmbeddingLearned(N_steps) 96 | else: 97 | raise ValueError(f"not supported {cfg.MODEL.DENSETRACK.POSITION_EMBEDDING}.") 98 | 99 | return position_embedding 100 | -------------------------------------------------------------------------------- /models/d2_p3aformer/transcenter_post_processing/post_process.py: -------------------------------------------------------------------------------- 1 | ## TransCenter: Transformers with Dense Queries for Multiple-Object Tracking 2 | ## Copyright Inria 3 | ## Year 2021 4 | ## Contact : yihong.xu@inria.fr 5 | ## 6 | ## TransCenter is free software: you can redistribute it and/or modify 7 | ## it under the terms of the GNU General Public License as published by 8 | ## the Free Software Foundation, either version 3 of the License, or 9 | ## (at your option) any later version. 10 | 11 | ## TransCenter is distributed in the hope that it will be useful, 12 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ## GNU General Public License for more details. 15 | ## 16 | ## You should have received a copy of the GNU General Public License 17 | ## along with this program, TransCenter. If not, see and the LICENSE file. 18 | ## 19 | ## 20 | ## TransCenter has code derived from 21 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 22 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 23 | ## (3) 2020 Facebook. (Apache License Version 2.0: https://github.com/facebookresearch/detr/) 24 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 25 | ## 26 | ## TransCenter uses packages from 27 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 28 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 29 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 30 | ## (4) 2018 Tak-Wai Hui. 
(Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 31 | from __future__ import absolute_import 32 | from __future__ import division 33 | from __future__ import print_function 34 | 35 | import numpy as np 36 | import cv2 37 | import pdb 38 | from util.image import transform_preds_with_trans, get_affine_transform 39 | 40 | 41 | def get_alpha(rot): 42 | # output: (B, 8) [bin1_cls[0], bin1_cls[1], bin1_sin, bin1_cos, 43 | # bin2_cls[0], bin2_cls[1], bin2_sin, bin2_cos] 44 | # return rot[:, 0] 45 | idx = rot[:, 1] > rot[:, 5] 46 | alpha1 = np.arctan2(rot[:, 2], rot[:, 3]) + (-0.5 * np.pi) 47 | alpha2 = np.arctan2(rot[:, 6], rot[:, 7]) + (0.5 * np.pi) 48 | return alpha1 * idx + alpha2 * (1 - idx) 49 | 50 | 51 | def generic_post_process(dets, c, s, h, w, filter_by_scores=0.3): 52 | if not ("scores" in dets): 53 | return [{}], [{}] 54 | ret = [] 55 | 56 | for i in range(len(dets["scores"])): 57 | preds = [] 58 | trans = get_affine_transform(c[i], s[i], 0, (w, h), inv=1).astype(np.float32) 59 | for j in range(len(dets["scores"][i])): 60 | if dets["scores"][i][j] < filter_by_scores: 61 | break 62 | 63 | item = {} 64 | item["score"] = dets["scores"][i][j] 65 | item["class"] = int(dets["clses"][i][j]) + 1 66 | item["ct"] = transform_preds_with_trans( 67 | (dets["cts"][i][j]).reshape(1, 2), trans 68 | ).reshape(2) 69 | 70 | if "tracking" in dets: 71 | # displacement to original image space 72 | tracking = transform_preds_with_trans( 73 | (dets["tracking"][i][j] + dets["cts"][i][j]).reshape(1, 2), trans 74 | ).reshape(2) 75 | item["tracking"] = ( 76 | tracking - item["ct"] 77 | ) # ct in the ct int in original image plan 78 | item["pre_cts"] = tracking 79 | 80 | if "bboxes" in dets: 81 | bbox = transform_preds_with_trans( 82 | dets["bboxes"][i][j].reshape(2, 2), trans 83 | ).reshape(4) 84 | item["bbox"] = bbox 85 | 86 | preds.append(item) 87 | ret.append(preds) 88 | return ret 89 | -------------------------------------------------------------------------------- /models/d2_p3aformer/transcenter_post_processing/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import torch 6 | import torch.nn as nn 7 | from util.image import gaussian_radius 8 | import math 9 | import numpy as np 10 | 11 | def _sigmoid(x): 12 | y = torch.clamp(x.sigmoid_(), min=1e-4, max=1-1e-4) 13 | return y 14 | 15 | def _sigmoid12(x): 16 | y = torch.clamp(x.sigmoid_(), 1e-12) 17 | return y 18 | 19 | def _gather_feat(feat, ind): 20 | dim = feat.size(2) 21 | ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) 22 | feat = feat.gather(1, ind) 23 | return feat 24 | 25 | def _tranpose_and_gather_feat(feat, ind): 26 | feat = feat.permute(0, 2, 3, 1).contiguous() 27 | feat = feat.view(feat.size(0), -1, feat.size(3)) 28 | feat = _gather_feat(feat, ind) 29 | return feat 30 | 31 | def flip_tensor(x): 32 | return torch.flip(x, [3]) 33 | # tmp = x.detach().cpu().numpy()[..., ::-1].copy() 34 | # return torch.from_numpy(tmp).to(x.device) 35 | 36 | def flip_lr(x, flip_idx): 37 | tmp = x.detach().cpu().numpy()[..., ::-1].copy() 38 | shape = tmp.shape 39 | for e in flip_idx: 40 | tmp[:, e[0], ...], tmp[:, e[1], ...] 
= \ 41 | tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() 42 | return torch.from_numpy(tmp.reshape(shape)).to(x.device) 43 | 44 | def flip_lr_off(x, flip_idx): 45 | tmp = x.detach().cpu().numpy()[..., ::-1].copy() 46 | shape = tmp.shape 47 | tmp = tmp.reshape(tmp.shape[0], 17, 2, 48 | tmp.shape[2], tmp.shape[3]) 49 | tmp[:, :, 0, :, :] *= -1 50 | for e in flip_idx: 51 | tmp[:, e[0], ...], tmp[:, e[1], ...] = \ 52 | tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() 53 | return torch.from_numpy(tmp.reshape(shape)).to(x.device) 54 | 55 | def _nms(heat, kernel=3): 56 | pad = (kernel - 1) // 2 57 | 58 | hmax = nn.functional.max_pool2d( 59 | heat, (kernel, kernel), stride=1, padding=pad) 60 | keep = (hmax == heat).float() 61 | return heat * keep 62 | 63 | def _topk_channel(scores, K=100): 64 | batch, cat, height, width = scores.size() 65 | 66 | topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) 67 | 68 | topk_inds = topk_inds % (height * width) 69 | topk_ys = (topk_inds / width).int().float() 70 | topk_xs = (topk_inds % width).int().float() 71 | 72 | return topk_scores, topk_inds, topk_ys, topk_xs 73 | 74 | def _topk(scores, K=100): 75 | batch, cat, height, width = scores.size() 76 | 77 | topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) 78 | 79 | topk_inds = topk_inds % (height * width) 80 | topk_ys = (topk_inds / width).int().float() 81 | topk_xs = (topk_inds % width).int().float() 82 | 83 | topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) 84 | topk_clses = (topk_ind / K).int() 85 | topk_inds = _gather_feat( 86 | topk_inds.view(batch, -1, 1), topk_ind).view(batch, K) 87 | topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K) 88 | topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K) 89 | 90 | return topk_score, topk_inds, topk_clses, topk_ys, topk_xs 91 | -------------------------------------------------------------------------------- /models/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /models/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
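Since the `_nms` / `_topk` helpers above are the heart of the center-heatmap decoding (a 3x3 max-pool keeps only local maxima, then `torch.topk` over the flattened map returns the K best peaks with their row/column indices), a short self-contained sketch may help. The heatmap size and K are illustrative, and `_nms` is inlined so the snippet runs without the rest of the repository:

import torch
import torch.nn as nn

def _nms(heat, kernel=3):
    # keep only local maxima: a pixel survives iff it equals the 3x3 max around it
    pad = (kernel - 1) // 2
    hmax = nn.functional.max_pool2d(heat, (kernel, kernel), stride=1, padding=pad)
    return heat * (hmax == heat).float()

heat = torch.rand(2, 1, 152, 272)         # (batch, num_classes, H, W) center heatmap, toy values
peaks = _nms(heat)
batch, cat, height, width = peaks.size()
scores, inds = torch.topk(peaks.view(batch, cat, -1), k=100)
ys, xs = inds // width, inds % width      # integer center coordinates, as _topk computes them
print(scores.shape, ys.shape, xs.shape)   # each torch.Size([2, 1, 100])
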
6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | import MultiScaleDeformableAttention as MSDA 22 | 23 | 24 | class MSDeformAttnFunction(Function): 25 | @staticmethod 26 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 27 | ctx.im2col_step = im2col_step 28 | output = MSDA.ms_deform_attn_forward( 29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 31 | return output 32 | 33 | @staticmethod 34 | @once_differentiable 35 | def backward(ctx, grad_output): 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 37 | grad_value, grad_sampling_loc, grad_attn_weight = \ 38 | MSDA.ms_deform_attn_backward( 39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 40 | 41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 42 | 43 | 44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 45 | # for debug and test only, 46 | # need to use cuda version instead 47 | N_, S_, M_, D_ = value.shape 48 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 49 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 50 | sampling_grids = 2 * sampling_locations - 1 51 | sampling_value_list = [] 52 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 53 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 54 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 55 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 56 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 57 | # N_*M_, D_, Lq_, P_ 58 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 59 | mode='bilinear', padding_mode='zeros', align_corners=False) 60 | sampling_value_list.append(sampling_value_l_) 61 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 62 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 63 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 64 | return output.transpose(1, 2).contiguous() 65 | -------------------------------------------------------------------------------- /models/ops/make.sh: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
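The pure-PyTorch fallback `ms_deform_attn_core_pytorch` above is the easiest way to sanity-check tensor shapes, since it needs no compiled extension. A minimal sketch with toy sizes mirroring the test script later in this listing; note that importing this module also executes `import MultiScaleDeformableAttention`, so either build the ops first or copy the function into a standalone script:

import torch
# requires the built ops for the module import; alternatively copy the function out of the file above
from models.ops.functions.ms_deform_attn_func import ms_deform_attn_core_pytorch

N, M, D = 1, 2, 4                                    # batch, heads, channels per head
Lq, L, P = 3, 2, 2                                   # queries, feature levels, points per level
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)
S = int(sum(h * w for h, w in shapes))               # total number of value tokens over all levels

value = torch.rand(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)   # normalized to [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P)
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)

out = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
print(out.shape)                                     # torch.Size([1, 3, 8]) == (N, Lq, M*D)
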
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | python setup.py build install 10 | -------------------------------------------------------------------------------- /models/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /models/ops/server_make.sh: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | rlaunch --cpu 8 --gpu 8 --memory 100000 --charged-group v_detection \ 9 | --priority Medium --preemptible no \ 10 | -- python setup.py build install 11 | -------------------------------------------------------------------------------- /models/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('CUDA is not available') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | 13 | #include <ATen/ATen.h> 14 | #include <ATen/cuda/CUDAContext.h> 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implemented on the CPU"); 27 | } 28 | 29 | std::vector<at::Tensor> 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector<at::Tensor> 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /models/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector<at::Tensor> 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /models/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime.
All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /models/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, 
shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /models/p3aformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/models/p3aformer/__init__.py -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_dla.py: -------------------------------------------------------------------------------- 1 | ## TransCenter has code derived from 2 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 3 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 4 | ## (3) 2020 Facebook. (Apache License Version 2.0: https://github.com/facebookresearch/detr/) 5 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 6 | ## 7 | ## TransCenter uses packages from 8 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 9 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 10 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 11 | ## (4) 2018 Tak-Wai Hui. 
(Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 12 | import torch.nn as nn 13 | from dcn_v2 import DCN 14 | import math 15 | import numpy as np 16 | import torch.nn.functional as F 17 | 18 | BN_MOMENTUM = 0.1 19 | 20 | 21 | class DeformConv(nn.Module): 22 | def __init__(self, chi, cho): 23 | super(DeformConv, self).__init__() 24 | self.actf = nn.Sequential( 25 | nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), 26 | nn.ReLU(inplace=True) 27 | ) 28 | self.conv = DCN(chi, cho, kernel_size=(3,3), stride=1, padding=1, dilation=1, deformable_groups=1) 29 | 30 | def forward(self, x): 31 | x = self.conv(x) 32 | x = self.actf(x) 33 | return x 34 | 35 | DLA_NODE = { 36 | 'dcn': (DeformConv, DeformConv) 37 | } 38 | 39 | 40 | def fill_fc_weights(layers): 41 | for m in layers.modules(): 42 | if isinstance(m, nn.Conv2d): 43 | if m.bias is not None: 44 | nn.init.constant_(m.bias, 0) 45 | 46 | 47 | def fill_up_weights(up): 48 | w = up.weight.data 49 | f = math.ceil(w.size(2) / 2) 50 | c = (2 * f - 1 - f % 2) / (2. * f) 51 | for i in range(w.size(2)): 52 | for j in range(w.size(3)): 53 | w[0, 0, i, j] = \ 54 | (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) 55 | for c in range(1, w.size(0)): 56 | w[c, 0, :, :] = w[0, 0, :, :] 57 | 58 | 59 | class IDAUpV3(nn.Module): 60 | # bilinear upsampling version of IDA 61 | def __init__(self, o, channels, up_f, node_type=(DeformConv, DeformConv)): 62 | super(IDAUpV3, self).__init__() 63 | self.up = nn.UpsamplingBilinear2d(scale_factor=2) # no params 64 | 65 | for i in range(0, len(channels)): 66 | c = channels[i] 67 | if i == 0: 68 | node = node_type[1](c, o) 69 | else: 70 | node = node_type[1](c, c) 71 | setattr(self, 'node_' + str(i), node) 72 | 73 | def forward(self, layers, startp, endp): 74 | for i in range(endp-1, startp, -1): 75 | upsample = self.up 76 | layers[i] = upsample(layers[i]) # ch 256-> 256 77 | node = getattr(self, 'node_' + str(i)) 78 | layers[i-1] = node(layers[i] + layers[i - 1]) 79 | layers[startp] = self.up(layers[startp]) # 256=>256 80 | node = getattr(self, 'node_' + str(startp)) 81 | layers[startp] = node(layers[startp]) 82 | return [layers[startp]] 83 | 84 | 85 | class Interpolate(nn.Module): 86 | def __init__(self, scale, mode): 87 | super(Interpolate, self).__init__() 88 | self.scale = scale 89 | self.mode = mode 90 | 91 | def forward(self, x): 92 | x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False) 93 | return x 94 | -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_liteflownet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/models/p3aformer/p3aformer_liteflownet/__init__.py -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_liteflownet/correlation_package/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/models/p3aformer/p3aformer_liteflownet/correlation_package/__init__.py -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_liteflownet/correlation_package/correlation.py: 
-------------------------------------------------------------------------------- 1 | ## TransCenter: Transformers with Dense Queries for Multiple-Object Tracking 2 | ## Copyright Inria 3 | ## Year 2021 4 | ## Contact : yihong.xu@inria.fr 5 | ## 6 | ## TransCenter is free software: you can redistribute it and/or modify 7 | ## it under the terms of the GNU General Public License as published by 8 | ## the Free Software Foundation, either version 3 of the License, or 9 | ## (at your option) any later version. 10 | 11 | ## TransCenter is distributed in the hope that it will be useful, 12 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ## GNU General Public License for more details. 15 | ## 16 | ## You should have received a copy of the GNU General Public License 17 | ## along with this program, TransCenter. If not, see and the LICENSE file. 18 | ## 19 | ## 20 | ## TransCenter has code derived from 21 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 22 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 23 | ## (3) 2020 Facebook. (Apache License Version 2.0: https://github.com/facebookresearch/detr/) 24 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 25 | ## 26 | ## TransCenter uses packages from 27 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 28 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 29 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 30 | ## (4) 2018 Tak-Wai Hui. 
(Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 31 | import torch 32 | from torch.nn.modules.module import Module 33 | from torch.autograd import Function 34 | import correlation_cuda 35 | 36 | class CorrelationFunction(Function): 37 | 38 | # def __init__(self, pad_size=3, kernel_size=3, max_displacement=20, stride1=1, stride2=2, corr_multiply=1): 39 | # super(CorrelationFunction, self).__init__() 40 | # self.pad_size = pad_size 41 | # self.kernel_size = kernel_size 42 | # self.max_displacement = max_displacement 43 | # self.stride1 = stride1 44 | # self.stride2 = stride2 45 | # self.corr_multiply = corr_multiply 46 | # # self.out_channel = ((max_displacement/stride2)*2 + 1) * ((max_displacement/stride2)*2 + 1) 47 | 48 | @staticmethod 49 | def forward(ctx, input1, input2, pad_size, kernel_size, max_displacement,stride1, stride2, corr_multiply): 50 | ctx.save_for_backward(input1, input2) 51 | ctx.pad_size = pad_size 52 | ctx.kernel_size = kernel_size 53 | ctx.max_displacement = max_displacement 54 | ctx.stride1 = stride1 55 | ctx.stride2 = stride2 56 | ctx.corr_multiply = corr_multiply 57 | 58 | with torch.cuda.device_of(input1): 59 | rbot1 = input1.new() 60 | rbot2 = input2.new() 61 | output = input1.new() 62 | 63 | correlation_cuda.forward(input1, input2, rbot1, rbot2, output, 64 | pad_size, kernel_size, max_displacement, stride1, stride2, corr_multiply) 65 | 66 | return output 67 | 68 | @staticmethod 69 | def backward(ctx, grad_output): 70 | input1, input2 = ctx.saved_tensors 71 | 72 | with torch.cuda.device_of(input1): 73 | rbot1 = input1.new() 74 | rbot2 = input2.new() 75 | 76 | grad_input1 = input1.new() 77 | grad_input2 = input2.new() 78 | 79 | correlation_cuda.backward(input1, input2, rbot1, rbot2, grad_output, grad_input1, grad_input2, 80 | ctx.pad_size, ctx.kernel_size, ctx.max_displacement, ctx.stride1, ctx.stride2, ctx.corr_multiply) 81 | 82 | return grad_input1, grad_input2, None, None, None, None, None, None 83 | 84 | 85 | class Correlation(Module): 86 | def __init__(self, pad_size=0, kernel_size=0, max_displacement=0, stride1=1, stride2=2, corr_multiply=1): 87 | super(Correlation, self).__init__() 88 | self.pad_size = pad_size 89 | self.kernel_size = kernel_size 90 | self.max_displacement = max_displacement 91 | self.stride1 = stride1 92 | self.stride2 = stride2 93 | self.corr_multiply = corr_multiply 94 | 95 | # @staticmethod 96 | def forward(self, input1, input2): 97 | 98 | input1 = input1.contiguous() 99 | input2 = input2.contiguous() 100 | # result = CorrelationFunction(self.pad_size, self.kernel_size, self.max_displacement,self.stride1, self.stride2, self.corr_multiply)(input1, input2) 101 | result = CorrelationFunction.apply(input1, input2, self.pad_size, self.kernel_size, self.max_displacement,self.stride1, self.stride2, self.corr_multiply) 102 | 103 | return result 104 | 105 | -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_liteflownet/correlation_package/correlation_cuda_kernel.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | int correlation_forward_cuda_kernel(at::Tensor& output, 8 | int ob, 9 | int oc, 10 | int oh, 11 | int ow, 12 | int osb, 13 | int osc, 14 | int osh, 15 | int osw, 16 | 17 | at::Tensor& input1, 18 | int ic, 19 | int ih, 20 | int iw, 21 | int isb, 22 | int isc, 23 | int ish, 24 | int isw, 25 | 26 | at::Tensor& input2, 27 | int gc, 28 | int gsb, 29 
| int gsc, 30 | int gsh, 31 | int gsw, 32 | 33 | at::Tensor& rInput1, 34 | at::Tensor& rInput2, 35 | int pad_size, 36 | int kernel_size, 37 | int max_displacement, 38 | int stride1, 39 | int stride2, 40 | int corr_type_multiply, 41 | cudaStream_t stream); 42 | 43 | 44 | int correlation_backward_cuda_kernel( 45 | at::Tensor& gradOutput, 46 | int gob, 47 | int goc, 48 | int goh, 49 | int gow, 50 | int gosb, 51 | int gosc, 52 | int gosh, 53 | int gosw, 54 | 55 | at::Tensor& input1, 56 | int ic, 57 | int ih, 58 | int iw, 59 | int isb, 60 | int isc, 61 | int ish, 62 | int isw, 63 | 64 | at::Tensor& input2, 65 | int gsb, 66 | int gsc, 67 | int gsh, 68 | int gsw, 69 | 70 | at::Tensor& gradInput1, 71 | int gisb, 72 | int gisc, 73 | int gish, 74 | int gisw, 75 | 76 | at::Tensor& gradInput2, 77 | int ggc, 78 | int ggsb, 79 | int ggsc, 80 | int ggsh, 81 | int ggsw, 82 | 83 | at::Tensor& rInput1, 84 | at::Tensor& rInput2, 85 | int pad_size, 86 | int kernel_size, 87 | int max_displacement, 88 | int stride1, 89 | int stride2, 90 | int corr_type_multiply, 91 | cudaStream_t stream); 92 | -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_liteflownet/correlation_package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | # Minimum requirements for the build system to execute. 3 | requires = ["setuptools", "wheel", "numpy", "torch"] # PEP 508 specifications. 4 | -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_liteflownet/correlation_package/setup.py: -------------------------------------------------------------------------------- 1 | ## TransCenter: Transformers with Dense Queries for Multiple-Object Tracking 2 | ## Copyright Inria 3 | ## Year 2021 4 | ## Contact : yihong.xu@inria.fr 5 | ## 6 | ## TransCenter is free software: you can redistribute it and/or modify 7 | ## it under the terms of the GNU General Public License as published by 8 | ## the Free Software Foundation, either version 3 of the License, or 9 | ## (at your option) any later version. 10 | 11 | ## TransCenter is distributed in the hope that it will be useful, 12 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ## GNU General Public License for more details. 15 | ## 16 | ## You should have received a copy of the GNU General Public License 17 | ## along with this program, TransCenter. If not, see and the LICENSE file. 18 | ## 19 | ## 20 | ## TransCenter has code derived from 21 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 22 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 23 | ## (3) 2020 Facebook. (Apache License Version 2.0: https://github.com/facebookresearch/detr/) 24 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 25 | ## 26 | ## TransCenter uses packages from 27 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 28 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 29 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 30 | ## (4) 2018 Tak-Wai Hui. 
(Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 31 | #!/usr/bin/env python3 32 | import os 33 | import torch 34 | 35 | from setuptools import setup, find_packages 36 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 37 | 38 | cxx_args = ['-std=c++14'] 39 | 40 | nvcc_args = [ 41 | '-gencode', 'arch=compute_50,code=sm_50', 42 | '-gencode', 'arch=compute_52,code=sm_52', 43 | '-gencode', 'arch=compute_60,code=sm_60', 44 | '-gencode', 'arch=compute_61,code=sm_61', 45 | '-gencode', 'arch=compute_70,code=sm_70', 46 | '-gencode', 'arch=compute_70,code=compute_70', 47 | '-gencode', 'arch=compute_75,code=compute_75', 48 | '-gencode', 'arch=compute_80,code=compute_80', 49 | '-gencode', 'arch=compute_86,code=compute_86', 50 | 51 | ] 52 | 53 | setup( 54 | name='correlation_cuda', 55 | ext_modules=[ 56 | CUDAExtension('correlation_cuda', [ 57 | 'correlation_cuda.cc', 58 | 'correlation_cuda_kernel.cu' 59 | ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args}) 60 | ], 61 | cmdclass={ 62 | 'build_ext': BuildExtension 63 | }) 64 | -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_post_processing/post_process.py: -------------------------------------------------------------------------------- 1 | ## TransCenter: Transformers with Dense Queries for Multiple-Object Tracking 2 | ## Copyright Inria 3 | ## Year 2021 4 | ## Contact : yihong.xu@inria.fr 5 | ## 6 | ## TransCenter is free software: you can redistribute it and/or modify 7 | ## it under the terms of the GNU General Public License as published by 8 | ## the Free Software Foundation, either version 3 of the License, or 9 | ## (at your option) any later version. 10 | 11 | ## TransCenter is distributed in the hope that it will be useful, 12 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ## GNU General Public License for more details. 15 | ## 16 | ## You should have received a copy of the GNU General Public License 17 | ## along with this program, TransCenter. If not, see and the LICENSE file. 18 | ## 19 | ## 20 | ## TransCenter has code derived from 21 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 22 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 23 | ## (3) 2020 Facebook. (Apache License Version 2.0: https://github.com/facebookresearch/detr/) 24 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 25 | ## 26 | ## TransCenter uses packages from 27 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 28 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 29 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 30 | ## (4) 2018 Tak-Wai Hui. 
(Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 31 | from __future__ import absolute_import 32 | from __future__ import division 33 | from __future__ import print_function 34 | 35 | import numpy as np 36 | import cv2 37 | from util.image import transform_preds_with_trans, get_affine_transform 38 | 39 | 40 | def get_alpha(rot): 41 | # output: (B, 8) [bin1_cls[0], bin1_cls[1], bin1_sin, bin1_cos, 42 | # bin2_cls[0], bin2_cls[1], bin2_sin, bin2_cos] 43 | # return rot[:, 0] 44 | idx = rot[:, 1] > rot[:, 5] 45 | alpha1 = np.arctan2(rot[:, 2], rot[:, 3]) + (-0.5 * np.pi) 46 | alpha2 = np.arctan2(rot[:, 6], rot[:, 7]) + ( 0.5 * np.pi) 47 | return alpha1 * idx + alpha2 * (1 - idx) 48 | 49 | 50 | def generic_post_process(opt, dets, c, s, h, w, filter_by_scores=0.3): 51 | if not ('scores' in dets): 52 | return [{}], [{}] 53 | ret = [] 54 | 55 | for i in range(len(dets['scores'])): 56 | preds = [] 57 | trans = get_affine_transform( 58 | c[i], s[i], 0, (w, h), inv=1).astype(np.float32) 59 | for j in range(len(dets['scores'][i])): 60 | if dets['scores'][i][j] < filter_by_scores: 61 | break 62 | 63 | item = {} 64 | item['score'] = dets['scores'][i][j] 65 | item['class'] = int(dets['clses'][i][j]) + 1 66 | item['ct'] = transform_preds_with_trans( 67 | (dets['cts'][i][j]).reshape(1, 2), trans).reshape(2) 68 | 69 | if 'tracking' in dets: 70 | # displacement to original image space 71 | tracking = transform_preds_with_trans( 72 | (dets['tracking'][i][j] + dets['cts'][i][j]).reshape(1, 2), trans).reshape(2) 73 | item['tracking'] = tracking - item['ct'] # ct in the ct int in original image plan 74 | item['pre_cts'] = tracking 75 | 76 | if 'bboxes' in dets: 77 | bbox = transform_preds_with_trans( 78 | dets['bboxes'][i][j].reshape(2, 2), trans).reshape(4) 79 | item['bbox'] = bbox 80 | 81 | preds.append(item) 82 | 83 | ret.append(preds) 84 | 85 | return ret -------------------------------------------------------------------------------- /models/p3aformer/p3aformer_post_processing/utils.py: -------------------------------------------------------------------------------- 1 | ## TransCenter: Transformers with Dense Queries for Multiple-Object Tracking 2 | ## Copyright Inria 3 | ## Year 2021 4 | ## Contact : yihong.xu@inria.fr 5 | ## 6 | ## TransCenter is free software: you can redistribute it and/or modify 7 | ## it under the terms of the GNU General Public License as published by 8 | ## the Free Software Foundation, either version 3 of the License, or 9 | ## (at your option) any later version. 10 | 11 | ## TransCenter is distributed in the hope that it will be useful, 12 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ## GNU General Public License for more details. 15 | ## 16 | ## You should have received a copy of the GNU General Public License 17 | ## along with this program, TransCenter. If not, see and the LICENSE file. 18 | ## 19 | ## 20 | ## TransCenter has code derived from 21 | ## (1) 2020 fundamentalvision.(Apache License 2.0: https://github.com/fundamentalvision/Deformable-DETR) 22 | ## (2) 2020 Philipp Bergmann, Tim Meinhardt. (GNU General Public License v3.0 Licence: https://github.com/phil-bergmann/tracking_wo_bnw) 23 | ## (3) 2020 Facebook. 
(Apache License Version 2.0: https://github.com/facebookresearch/detr/) 24 | ## (4) 2020 Xingyi Zhou.(MIT License: https://github.com/xingyizhou/CenterTrack) 25 | ## 26 | ## TransCenter uses packages from 27 | ## (1) 2019 Charles Shang. (BSD 3-Clause Licence: https://github.com/CharlesShang/DCNv2) 28 | ## (2) 2017 NVIDIA CORPORATION. (Apache License, Version 2.0: https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package) 29 | ## (3) 2019 Simon Niklaus. (GNU General Public License v3.0: https://github.com/sniklaus/pytorch-liteflownet) 30 | ## (4) 2018 Tak-Wai Hui. (Copyright (c), see details in the LICENSE file: https://github.com/twhui/LiteFlowNet) 31 | from __future__ import absolute_import 32 | from __future__ import division 33 | from __future__ import print_function 34 | 35 | import torch 36 | import torch.nn as nn 37 | from util.image import gaussian_radius 38 | import math 39 | import numpy as np 40 | 41 | def _sigmoid(x): 42 | y = torch.clamp(x.sigmoid_(), min=1e-4, max=1-1e-4) 43 | return y 44 | 45 | def _sigmoid12(x): 46 | y = torch.clamp(x.sigmoid_(), 1e-12) 47 | return y 48 | 49 | def _gather_feat(feat, ind): 50 | dim = feat.size(2) 51 | ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) 52 | feat = feat.gather(1, ind) 53 | return feat 54 | 55 | def _tranpose_and_gather_feat(feat, ind): 56 | feat = feat.permute(0, 2, 3, 1).contiguous() 57 | feat = feat.view(feat.size(0), -1, feat.size(3)) 58 | feat = _gather_feat(feat, ind) 59 | return feat 60 | 61 | def flip_tensor(x): 62 | return torch.flip(x, [3]) 63 | # tmp = x.detach().cpu().numpy()[..., ::-1].copy() 64 | # return torch.from_numpy(tmp).to(x.device) 65 | 66 | def flip_lr(x, flip_idx): 67 | tmp = x.detach().cpu().numpy()[..., ::-1].copy() 68 | shape = tmp.shape 69 | for e in flip_idx: 70 | tmp[:, e[0], ...], tmp[:, e[1], ...] = \ 71 | tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() 72 | return torch.from_numpy(tmp.reshape(shape)).to(x.device) 73 | 74 | def flip_lr_off(x, flip_idx): 75 | tmp = x.detach().cpu().numpy()[..., ::-1].copy() 76 | shape = tmp.shape 77 | tmp = tmp.reshape(tmp.shape[0], 17, 2, 78 | tmp.shape[2], tmp.shape[3]) 79 | tmp[:, :, 0, :, :] *= -1 80 | for e in flip_idx: 81 | tmp[:, e[0], ...], tmp[:, e[1], ...] 
= \ 82 | tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() 83 | return torch.from_numpy(tmp.reshape(shape)).to(x.device) 84 | 85 | def _nms(heat, kernel=3): 86 | pad = (kernel - 1) // 2 87 | 88 | hmax = nn.functional.max_pool2d( 89 | heat, (kernel, kernel), stride=1, padding=pad) 90 | keep = (hmax == heat).float() 91 | return heat * keep 92 | 93 | def _topk_channel(scores, K=100): 94 | batch, cat, height, width = scores.size() 95 | 96 | topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) 97 | 98 | topk_inds = topk_inds % (height * width) 99 | topk_ys = (topk_inds / width).int().float() 100 | topk_xs = (topk_inds % width).int().float() 101 | 102 | return topk_scores, topk_inds, topk_ys, topk_xs 103 | 104 | def _topk(scores, K=100): 105 | batch, cat, height, width = scores.size() 106 | 107 | topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) 108 | 109 | topk_inds = topk_inds % (height * width) 110 | topk_ys = (topk_inds / width).int().float() 111 | topk_xs = (topk_inds % width).int().float() 112 | 113 | topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) 114 | topk_clses = (topk_ind / K).int() 115 | topk_inds = _gather_feat( 116 | topk_inds.view(batch, -1, 1), topk_ind).view(batch, K) 117 | topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K) 118 | topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K) 119 | 120 | return topk_score, topk_inds, topk_clses, topk_ys, topk_xs 121 | -------------------------------------------------------------------------------- /models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Various positional encodings for the transformer. 14 | """ 15 | import math 16 | import torch 17 | from torch import nn 18 | 19 | from util.misc import NestedTensor 20 | 21 | 22 | class PositionEmbeddingSine(nn.Module): 23 | """ 24 | This is a more standard version of the position embedding, very similar to the one 25 | used by the Attention is all you need paper, generalized to work on images. 
26 | """ 27 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 28 | super().__init__() 29 | self.num_pos_feats = num_pos_feats 30 | self.temperature = temperature 31 | self.normalize = normalize 32 | if scale is not None and normalize is False: 33 | raise ValueError("normalize should be True if scale is passed") 34 | if scale is None: 35 | scale = 2 * math.pi 36 | self.scale = scale 37 | 38 | def forward(self, tensor_list: NestedTensor): 39 | x = tensor_list.tensors 40 | mask = tensor_list.mask 41 | assert mask is not None 42 | not_mask = ~mask 43 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 44 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 45 | if self.normalize: 46 | eps = 1e-6 47 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 48 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 49 | 50 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 51 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 52 | 53 | pos_x = x_embed[:, :, :, None] / dim_t 54 | pos_y = y_embed[:, :, :, None] / dim_t 55 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 56 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 57 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 58 | return pos 59 | 60 | 61 | class PositionEmbeddingLearned(nn.Module): 62 | """ 63 | Absolute pos embedding, learned. 64 | """ 65 | def __init__(self, num_pos_feats=256): 66 | super().__init__() 67 | self.row_embed = nn.Embedding(50, num_pos_feats) 68 | self.col_embed = nn.Embedding(50, num_pos_feats) 69 | self.reset_parameters() 70 | 71 | def reset_parameters(self): 72 | nn.init.uniform_(self.row_embed.weight) 73 | nn.init.uniform_(self.col_embed.weight) 74 | 75 | def forward(self, tensor_list: NestedTensor): 76 | x = tensor_list.tensors 77 | h, w = x.shape[-2:] 78 | i = torch.arange(w, device=x.device) 79 | j = torch.arange(h, device=x.device) 80 | x_emb = self.col_embed(i) 81 | y_emb = self.row_embed(j) 82 | pos = torch.cat([ 83 | x_emb.unsqueeze(0).repeat(h, 1, 1), 84 | y_emb.unsqueeze(1).repeat(1, w, 1), 85 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 86 | return pos 87 | 88 | 89 | def build_position_encoding(args): 90 | N_steps = args.hidden_dim // 2 91 | if args.position_embedding in ('v2', 'sine'): 92 | # TODO find a better way of exposing other arguments 93 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 94 | elif args.position_embedding in ('v3', 'learned'): 95 | position_embedding = PositionEmbeddingLearned(N_steps) 96 | else: 97 | raise ValueError(f"not supported {args.position_embedding}") 98 | 99 | return position_embedding 100 | -------------------------------------------------------------------------------- /models/structures/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Detectron2 (https://github.com/facebookresearch/detectron2) 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 4 | # ------------------------------------------------------------------------ 5 | from .boxes import Boxes, BoxMode, pairwise_iou, pairwise_ioa, matched_boxlist_iou 6 | from .instances import Instances 7 | 8 | __all__ = [k for k in globals().keys() if not k.startswith("_")] -------------------------------------------------------------------------------- /preprocess/convert_cityperson_to_coco.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | from PIL import Image 5 | 6 | DATA_PATH = "data/Cityscapes/" 7 | DATA_FILE_PATH = "data/data_path/citypersons.train" 8 | OUT_PATH = DATA_PATH + "annotations/" 9 | 10 | 11 | def load_paths(data_path): 12 | with open(data_path, "r") as file: 13 | img_files = file.readlines() 14 | img_files = [x.replace("\n", "") for x in img_files] 15 | img_files = list(filter(lambda x: len(x) > 0, img_files)) 16 | label_files = [ 17 | x.replace("images", "labels_with_ids") 18 | .replace(".png", ".txt") 19 | .replace(".jpg", ".txt") 20 | for x in img_files 21 | ] 22 | return img_files, label_files 23 | 24 | 25 | if __name__ == "__main__": 26 | if not os.path.exists(OUT_PATH): 27 | os.mkdir(OUT_PATH) 28 | 29 | out_path = OUT_PATH + "train.json" 30 | out = {"images": [], "annotations": [], "categories": [{"id": 1, "name": "person"}]} 31 | img_paths, label_paths = load_paths(DATA_FILE_PATH) 32 | image_cnt = 0 33 | ann_cnt = 0 34 | video_cnt = 0 35 | for img_path, label_path in zip(img_paths, label_paths): 36 | image_cnt += 1 37 | im = Image.open(os.path.join("data", img_path)) 38 | image_info = { 39 | "file_name": img_path, 40 | "id": image_cnt, 41 | "height": im.size[1], 42 | "width": im.size[0], 43 | } 44 | out["images"].append(image_info) 45 | # Load labels 46 | if os.path.isfile(os.path.join("data", label_path)): 47 | labels0 = np.loadtxt( 48 | os.path.join("data", label_path), dtype=np.float32 49 | ).reshape(-1, 6) 50 | # Normalized xywh to pixel xyxy format 51 | labels = labels0.copy() 52 | labels[:, 2] = image_info["width"] * (labels0[:, 2] - labels0[:, 4] / 2) 53 | labels[:, 3] = image_info["height"] * (labels0[:, 3] - labels0[:, 5] / 2) 54 | labels[:, 4] = image_info["width"] * labels0[:, 4] 55 | labels[:, 5] = image_info["height"] * labels0[:, 5] 56 | else: 57 | labels = np.array([]) 58 | for i in range(len(labels)): 59 | ann_cnt += 1 60 | fbox = labels[i, 2:6].tolist() 61 | ann = { 62 | "id": ann_cnt, 63 | "category_id": 1, 64 | "image_id": image_cnt, 65 | "track_id": -1, 66 | "bbox": fbox, 67 | "area": fbox[2] * fbox[3], 68 | "iscrowd": 0, 69 | } 70 | out["annotations"].append(ann) 71 | print( 72 | "loaded train for {} images and {} samples".format( 73 | len(out["images"]), len(out["annotations"]) 74 | ) 75 | ) 76 | json.dump(out, open(out_path, "w")) 77 | -------------------------------------------------------------------------------- /preprocess/convert_crowdhuman_to_coco.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | from PIL import Image 5 | 6 | DATA_PATH = "data/crowdhuman/" 7 | OUT_PATH = DATA_PATH + "annotations/" 8 | SPLITS = ["val", "train"] 9 | DEBUG = False 10 | 11 | 12 | def load_func(fpath): 13 | print("fpath", fpath) 14 | assert os.path.exists(fpath) 15 | with open(fpath, "r") as fid: 16 | lines = fid.readlines() 17 | records = [json.loads(line.strip("\n")) for line in lines] 18 | return records 19 | 20 | 21 | if __name__ == "__main__": 22 | if 
not os.path.exists(OUT_PATH): 23 | os.mkdir(OUT_PATH) 24 | for split in SPLITS: 25 | data_path = DATA_PATH + split 26 | out_path = OUT_PATH + "{}.json".format(split) 27 | out = { 28 | "images": [], 29 | "annotations": [], 30 | "categories": [{"id": 1, "name": "person"}], 31 | } 32 | ann_path = DATA_PATH + "annotation_{}.odgt".format(split) 33 | anns_data = load_func(ann_path) 34 | image_cnt = 0 35 | ann_cnt = 0 36 | video_cnt = 0 37 | for ann_data in anns_data: 38 | image_cnt += 1 39 | file_path = ( 40 | DATA_PATH 41 | + "CrowdHuman_{}/".format(split) 42 | + "{}.jpg".format(ann_data["ID"]) 43 | ) 44 | im = Image.open(file_path) 45 | image_info = { 46 | "file_name": "{}.jpg".format(ann_data["ID"]), 47 | "id": image_cnt, 48 | "height": im.size[1], 49 | "width": im.size[0], 50 | } 51 | out["images"].append(image_info) 52 | if split != "test": 53 | anns = ann_data["gtboxes"] 54 | for i in range(len(anns)): 55 | ann_cnt += 1 56 | fbox = anns[i]["fbox"] 57 | ann = { 58 | "id": ann_cnt, 59 | "category_id": 1, 60 | "image_id": image_cnt, 61 | "track_id": -1, 62 | "bbox_vis": anns[i]["vbox"], 63 | "bbox": fbox, 64 | "area": fbox[2] * fbox[3], 65 | "iscrowd": 1 66 | if "extra" in anns[i] 67 | and "ignore" in anns[i]["extra"] 68 | and anns[i]["extra"]["ignore"] == 1 69 | else 0, 70 | } 71 | out["annotations"].append(ann) 72 | print( 73 | "loaded {} for {} images and {} samples".format( 74 | split, len(out["images"]), len(out["annotations"]) 75 | ) 76 | ) 77 | json.dump(out, open(out_path, "w")) 78 | -------------------------------------------------------------------------------- /preprocess/convert_ethz_to_coco.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | from PIL import Image 5 | 6 | DATA_PATH = "data/ETHZ/" 7 | DATA_FILE_PATH = "data/data_path/eth.train" 8 | OUT_PATH = DATA_PATH + "annotations/" 9 | 10 | 11 | def load_paths(data_path): 12 | with open(data_path, "r") as file: 13 | img_files = file.readlines() 14 | img_files = [x.replace("\n", "") for x in img_files] 15 | img_files = list(filter(lambda x: len(x) > 0, img_files)) 16 | label_files = [ 17 | x.replace("images", "labels_with_ids") 18 | .replace(".png", ".txt") 19 | .replace(".jpg", ".txt") 20 | for x in img_files 21 | ] 22 | return img_files, label_files 23 | 24 | 25 | if __name__ == "__main__": 26 | if not os.path.exists(OUT_PATH): 27 | os.mkdir(OUT_PATH) 28 | 29 | out_path = OUT_PATH + "train.json" 30 | out = {"images": [], "annotations": [], "categories": [{"id": 1, "name": "person"}]} 31 | img_paths, label_paths = load_paths(DATA_FILE_PATH) 32 | image_cnt = 0 33 | ann_cnt = 0 34 | video_cnt = 0 35 | for img_path, label_path in zip(img_paths, label_paths): 36 | image_cnt += 1 37 | im = Image.open(os.path.join("data", img_path)) 38 | image_info = { 39 | "file_name": img_path, 40 | "id": image_cnt, 41 | "height": im.size[1], 42 | "width": im.size[0], 43 | } 44 | out["images"].append(image_info) 45 | # Load labels 46 | if os.path.isfile(os.path.join("data", label_path)): 47 | labels0 = np.loadtxt( 48 | os.path.join("data", label_path), dtype=np.float32 49 | ).reshape(-1, 6) 50 | # Normalized xywh to pixel xyxy format 51 | labels = labels0.copy() 52 | labels[:, 2] = image_info["width"] * (labels0[:, 2] - labels0[:, 4] / 2) 53 | labels[:, 3] = image_info["height"] * (labels0[:, 3] - labels0[:, 5] / 2) 54 | labels[:, 4] = image_info["width"] * labels0[:, 4] 55 | labels[:, 5] = image_info["height"] * labels0[:, 5] 56 | else: 57 | labels = 
np.array([]) 58 | for i in range(len(labels)): 59 | ann_cnt += 1 60 | fbox = labels[i, 2:6].tolist() 61 | ann = { 62 | "id": ann_cnt, 63 | "category_id": 1, 64 | "image_id": image_cnt, 65 | "track_id": -1, 66 | "bbox": fbox, 67 | "area": fbox[2] * fbox[3], 68 | "iscrowd": 0, 69 | } 70 | out["annotations"].append(ann) 71 | print( 72 | "loaded train for {} images and {} samples".format( 73 | len(out["images"]), len(out["annotations"]) 74 | ) 75 | ) 76 | json.dump(out, open(out_path, "w")) 77 | -------------------------------------------------------------------------------- /preprocess/data_preprocess.sh: -------------------------------------------------------------------------------- 1 | python3 preprocess/convert_mot17_to_coco.py 2 | python3 preprocess/convert_mot20_to_coco.py 3 | python3 preprocess/convert_crowdhuman_to_coco.py 4 | python3 preprocess/convert_cityperson_to_coco.py 5 | python3 preprocess/convert_ethz_to_coco.py 6 | 7 | bash preprocess/make_mixed_dirs.sh 8 | python3 preprocess/mix_data_ablation.py 9 | python3 preprocess/mix_data_test_mot17.py 10 | python3 preprocess/mix_data_test_mot20.py -------------------------------------------------------------------------------- /preprocess/make_mixed_dirs.sh: -------------------------------------------------------------------------------- 1 | cd data 2 | mkdir -p mix_mot_ch/annotations 3 | cp mot/annotations/val_half.json mix_mot_ch/annotations/val_half.json 4 | cp mot/annotations/test.json mix_mot_ch/annotations/test.json 5 | cd mix_mot_ch 6 | ln -s ../mot/train mot_train 7 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 8 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 9 | cd .. 10 | 11 | mkdir -p mix_det/annotations 12 | cp mot/annotations/val_half.json mix_det/annotations/val_half.json 13 | cp mot/annotations/test.json mix_det/annotations/test.json 14 | cd mix_det 15 | ln -s ../mot/train mot_train 16 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 17 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 18 | ln -s ../Cityscapes cp_train 19 | ln -s ../ETHZ ethz_train 20 | cd .. 21 | 22 | mkdir -p mix_mot20_ch/annotations 23 | cp MOT20/annotations/val_half.json mix_mot20_ch/annotations/val_half.json 24 | cp MOT20/annotations/test.json mix_mot20_ch/annotations/test.json 25 | cd mix_mot20_ch 26 | ln -s ../MOT20/train mot20_train 27 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 28 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 29 | cd .. -------------------------------------------------------------------------------- /preprocess/mix_data_ablation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | """ 6 | cd data 7 | mkdir -p mix_mot_ch/annotations 8 | cp mot/annotations/val_half.json mix_mot_ch/annotations/val_half.json 9 | cp mot/annotations/test.json mix_mot_ch/annotations/test.json 10 | cd mix_mot_ch 11 | ln -s ../mot/train mot_train 12 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 13 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 14 | cd .. 
15 | """ 16 | 17 | mot_json = json.load(open("data/mot/annotations/train_half.json", "r")) 18 | 19 | img_list = list() 20 | for img in mot_json["images"]: 21 | img["file_name"] = "mot_train/" + img["file_name"] 22 | img_list.append(img) 23 | 24 | ann_list = list() 25 | for ann in mot_json["annotations"]: 26 | ann_list.append(ann) 27 | 28 | video_list = mot_json["videos"] 29 | category_list = mot_json["categories"] 30 | 31 | print("mot17") 32 | 33 | max_img = 10000 34 | max_ann = 2000000 35 | max_video = 10 36 | 37 | crowdhuman_json = json.load(open("data/crowdhuman/annotations/train.json", "r")) 38 | img_id_count = 0 39 | for img in crowdhuman_json["images"]: 40 | img_id_count += 1 41 | img["file_name"] = "crowdhuman_train/" + img["file_name"] 42 | img["frame_id"] = img_id_count 43 | img["prev_image_id"] = img["id"] + max_img 44 | img["next_image_id"] = img["id"] + max_img 45 | img["id"] = img["id"] + max_img 46 | img["video_id"] = max_video 47 | img_list.append(img) 48 | 49 | for ann in crowdhuman_json["annotations"]: 50 | ann["id"] = ann["id"] + max_ann 51 | ann["image_id"] = ann["image_id"] + max_img 52 | ann_list.append(ann) 53 | 54 | video_list.append({"id": max_video, "file_name": "crowdhuman_train"}) 55 | 56 | print("crowdhuman_train") 57 | 58 | max_img = 30000 59 | max_ann = 10000000 60 | 61 | crowdhuman_val_json = json.load(open("data/crowdhuman/annotations/val.json", "r")) 62 | img_id_count = 0 63 | for img in crowdhuman_val_json["images"]: 64 | img_id_count += 1 65 | img["file_name"] = "crowdhuman_val/" + img["file_name"] 66 | img["frame_id"] = img_id_count 67 | img["prev_image_id"] = img["id"] + max_img 68 | img["next_image_id"] = img["id"] + max_img 69 | img["id"] = img["id"] + max_img 70 | img["video_id"] = max_video 71 | img_list.append(img) 72 | 73 | for ann in crowdhuman_val_json["annotations"]: 74 | ann["id"] = ann["id"] + max_ann 75 | ann["image_id"] = ann["image_id"] + max_img 76 | ann_list.append(ann) 77 | 78 | video_list.append({"id": max_video, "file_name": "crowdhuman_val"}) 79 | 80 | print("crowdhuman_val") 81 | 82 | mix_json = dict() 83 | mix_json["images"] = img_list 84 | mix_json["annotations"] = ann_list 85 | mix_json["videos"] = video_list 86 | mix_json["categories"] = category_list 87 | json.dump(mix_json, open("data/mix_mot_ch/annotations/train.json", "w")) 88 | -------------------------------------------------------------------------------- /preprocess/mix_data_test_mot17.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | mot_json = json.load(open("data/mot/annotations/train_half.json", "r")) 5 | 6 | img_list = list() 7 | for img in mot_json["images"]: 8 | img["file_name"] = "mot_train/" + img["file_name"] 9 | img_list.append(img) 10 | 11 | ann_list = list() 12 | for ann in mot_json["annotations"]: 13 | ann_list.append(ann) 14 | 15 | video_list = mot_json["videos"] 16 | category_list = mot_json["categories"] 17 | 18 | 19 | print("mot17") 20 | 21 | max_img = 10000 22 | max_ann = 2000000 23 | max_video = 10 24 | 25 | crowdhuman_json = json.load(open("data/crowdhuman/annotations/train.json", "r")) 26 | img_id_count = 0 27 | for img in crowdhuman_json["images"]: 28 | img_id_count += 1 29 | img["file_name"] = "crowdhuman_train/" + img["file_name"] 30 | img["frame_id"] = img_id_count 31 | img["prev_image_id"] = img["id"] + max_img 32 | img["next_image_id"] = img["id"] + max_img 33 | img["id"] = img["id"] + max_img 34 | img["video_id"] = max_video 35 | img_list.append(img) 36 | 37 | for ann in 
crowdhuman_json["annotations"]: 38 | ann["id"] = ann["id"] + max_ann 39 | ann["image_id"] = ann["image_id"] + max_img 40 | ann_list.append(ann) 41 | 42 | print("crowdhuman_train") 43 | 44 | video_list.append({"id": max_video, "file_name": "crowdhuman_train"}) 45 | 46 | 47 | max_img = 30000 48 | max_ann = 10000000 49 | 50 | crowdhuman_val_json = json.load(open("data/crowdhuman/annotations/val.json", "r")) 51 | img_id_count = 0 52 | for img in crowdhuman_val_json["images"]: 53 | img_id_count += 1 54 | img["file_name"] = "crowdhuman_val/" + img["file_name"] 55 | img["frame_id"] = img_id_count 56 | img["prev_image_id"] = img["id"] + max_img 57 | img["next_image_id"] = img["id"] + max_img 58 | img["id"] = img["id"] + max_img 59 | img["video_id"] = max_video 60 | img_list.append(img) 61 | 62 | for ann in crowdhuman_val_json["annotations"]: 63 | ann["id"] = ann["id"] + max_ann 64 | ann["image_id"] = ann["image_id"] + max_img 65 | ann_list.append(ann) 66 | 67 | print("crowdhuman_val") 68 | 69 | video_list.append({"id": max_video, "file_name": "crowdhuman_val"}) 70 | 71 | max_img = 40000 72 | max_ann = 20000000 73 | 74 | ethz_json = json.load(open("data/ETHZ/annotations/train.json", "r")) 75 | img_id_count = 0 76 | for img in ethz_json["images"]: 77 | img_id_count += 1 78 | img["file_name"] = "ethz_train/" + img["file_name"][5:] 79 | img["frame_id"] = img_id_count 80 | img["prev_image_id"] = img["id"] + max_img 81 | img["next_image_id"] = img["id"] + max_img 82 | img["id"] = img["id"] + max_img 83 | img["video_id"] = max_video 84 | img_list.append(img) 85 | 86 | for ann in ethz_json["annotations"]: 87 | ann["id"] = ann["id"] + max_ann 88 | ann["image_id"] = ann["image_id"] + max_img 89 | ann_list.append(ann) 90 | 91 | print("ETHZ") 92 | 93 | video_list.append({"id": max_video, "file_name": "ethz"}) 94 | 95 | max_img = 50000 96 | max_ann = 25000000 97 | 98 | cp_json = json.load(open("data/Cityscapes/annotations/train.json", "r")) 99 | img_id_count = 0 100 | for img in cp_json["images"]: 101 | img_id_count += 1 102 | img["file_name"] = "cp_train/" + img["file_name"][11:] 103 | img["frame_id"] = img_id_count 104 | img["prev_image_id"] = img["id"] + max_img 105 | img["next_image_id"] = img["id"] + max_img 106 | img["id"] = img["id"] + max_img 107 | img["video_id"] = max_video 108 | img_list.append(img) 109 | 110 | for ann in cp_json["annotations"]: 111 | ann["id"] = ann["id"] + max_ann 112 | ann["image_id"] = ann["image_id"] + max_img 113 | ann_list.append(ann) 114 | 115 | print("Cityscapes") 116 | 117 | video_list.append({"id": max_video, "file_name": "cityperson"}) 118 | 119 | mix_json = dict() 120 | mix_json["images"] = img_list 121 | mix_json["annotations"] = ann_list 122 | mix_json["videos"] = video_list 123 | mix_json["categories"] = category_list 124 | json.dump(mix_json, open("data/mix_det/annotations/train.json", "w")) 125 | -------------------------------------------------------------------------------- /preprocess/mix_data_test_mot20.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | """ 6 | cd data 7 | mkdir -p mix_mot20_ch/annotations 8 | cp MOT20/annotations/val_half.json mix_mot20_ch/annotations/val_half.json 9 | cp MOT20/annotations/test.json mix_mot20_ch/annotations/test.json 10 | cd mix_mot20_ch 11 | ln -s ../MOT20/train mot20_train 12 | ln -s ../crowdhuman/CrowdHuman_train crowdhuman_train 13 | ln -s ../crowdhuman/CrowdHuman_val crowdhuman_val 14 | cd .. 
15 | """ 16 | 17 | mot_json = json.load(open("data/MOT20/annotations/train.json", "r")) 18 | 19 | img_list = list() 20 | for img in mot_json["images"]: 21 | img["file_name"] = "mot20_train/" + img["file_name"] 22 | img_list.append(img) 23 | 24 | ann_list = list() 25 | for ann in mot_json["annotations"]: 26 | ann_list.append(ann) 27 | 28 | video_list = mot_json["videos"] 29 | category_list = mot_json["categories"] 30 | 31 | 32 | max_img = 10000 33 | max_ann = 2000000 34 | max_video = 10 35 | 36 | crowdhuman_json = json.load(open("data/crowdhuman/annotations/train.json", "r")) 37 | img_id_count = 0 38 | for img in crowdhuman_json["images"]: 39 | img_id_count += 1 40 | img["file_name"] = "crowdhuman_train/" + img["file_name"] 41 | img["frame_id"] = img_id_count 42 | img["prev_image_id"] = img["id"] + max_img 43 | img["next_image_id"] = img["id"] + max_img 44 | img["id"] = img["id"] + max_img 45 | img["video_id"] = max_video 46 | img_list.append(img) 47 | 48 | for ann in crowdhuman_json["annotations"]: 49 | ann["id"] = ann["id"] + max_ann 50 | ann["image_id"] = ann["image_id"] + max_img 51 | ann_list.append(ann) 52 | 53 | video_list.append({"id": max_video, "file_name": "crowdhuman_train"}) 54 | 55 | 56 | max_img = 30000 57 | max_ann = 10000000 58 | 59 | crowdhuman_val_json = json.load(open("data/crowdhuman/annotations/val.json", "r")) 60 | img_id_count = 0 61 | for img in crowdhuman_val_json["images"]: 62 | img_id_count += 1 63 | img["file_name"] = "crowdhuman_val/" + img["file_name"] 64 | img["frame_id"] = img_id_count 65 | img["prev_image_id"] = img["id"] + max_img 66 | img["next_image_id"] = img["id"] + max_img 67 | img["id"] = img["id"] + max_img 68 | img["video_id"] = max_video 69 | img_list.append(img) 70 | 71 | for ann in crowdhuman_val_json["annotations"]: 72 | ann["id"] = ann["id"] + max_ann 73 | ann["image_id"] = ann["image_id"] + max_img 74 | ann_list.append(ann) 75 | 76 | video_list.append({"id": max_video, "file_name": "crowdhuman_val"}) 77 | 78 | mix_json = dict() 79 | mix_json["images"] = img_list 80 | mix_json["annotations"] = ann_list 81 | mix_json["videos"] = video_list 82 | mix_json["categories"] = category_list 83 | json.dump(mix_json, open("data/mix_mot20_ch/annotations/train.json", "w")) 84 | -------------------------------------------------------------------------------- /pretrained: -------------------------------------------------------------------------------- 1 | /data/P3AFormer/pretrained/ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pycocotools 2 | tqdm 3 | cython 4 | scipy 5 | lap 6 | motmetrics -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | -------------------------------------------------------------------------------- /tools/add_train_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | mot_path = "/data/dataset/mot" 5 | sub_dir = 'train' 6 | seq_nums = os.listdir('/data/dataset/mot/train') 7 | accs = [] 8 | seqs = [] 9 | predict_path = "/data/dataset/mot/train_result" 10 | for seq_num in seq_nums: 11 | shutil.copyfile(os.path.join(mot_path, sub_dir, f'{seq_num}/gt/gt.txt'), 12 | os.path.join(predict_path, f'{seq_num}.txt')) 13 | -------------------------------------------------------------------------------- /tools/combine_labels_mot.py: -------------------------------------------------------------------------------- 1 | all_path2labels = {} 2 | print("Trying loading all files ...") 3 | for label_path in dataset_train.label_files: 4 | if osp.isfile(label_path): 5 | labels0 = np.loadtxt(label_path, dtype=np.float32).reshape(-1, 6).tolist() 6 | all_path2labels[label_path] = labels0 7 | else: 8 | raise ValueError('invalid label path: {}'.format(label_path)) 9 | for label_path in dataset_val.label_files: 10 | if osp.isfile(label_path): 11 | labels0 = np.loadtxt(label_path, dtype=np.float32).reshape(-1, 6).tolist() 12 | all_path2labels[label_path] = labels0 13 | else: 14 | raise ValueError('invalid label path: {}'.format(label_path)) 15 | import json 16 | 17 | json.dump(all_path2labels, open("datasets/data_path/mot.json", 'w')) -------------------------------------------------------------------------------- /tools/gen_labels_MOT17.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os.path as osp 16 | import os 17 | import numpy as np 18 | 19 | MOT_data = '/data/dataset/mot' 20 | 21 | # choose a data in ['MOT15', 'MOT16', 'MOT17', 'MOT20'] 22 | # or your custom data (prepare it following the 'docs/tutorials/PrepareMOTDataSet.md') 23 | 24 | 25 | def mkdirs(d): 26 | if not osp.exists(d): 27 | os.makedirs(d) 28 | 29 | 30 | seq_root = '{}/train'.format(MOT_data) 31 | label_root = '{}/labels_with_ids/train'.format(MOT_data) 32 | mkdirs(label_root) 33 | seqs = [s for s in os.listdir(seq_root)] 34 | 35 | tid_curr = 0 36 | tid_last = -1 37 | for seq in seqs: 38 | seq_info = open(osp.join(seq_root, seq, 'seqinfo.ini')).read() 39 | seq_width = int(seq_info[seq_info.find('imWidth=') + 8:seq_info.find( 40 | '\nimHeight')]) 41 | seq_height = int(seq_info[seq_info.find('imHeight=') + 9:seq_info.find( 42 | '\nimExt')]) 43 | 44 | gt_txt = osp.join(seq_root, seq, 'gt', 'gt.txt') 45 | gt = np.loadtxt(gt_txt, dtype=np.float64, delimiter=',') 46 | print(len(gt)) 47 | seq_label_root = osp.join(label_root, seq, 'img1') 48 | mkdirs(seq_label_root) 49 | 50 | for fid, tid, x, y, w, h, mark, label, _ in gt: 51 | if mark == 0 or not label == 1: 52 | continue 53 | fid = int(fid) 54 | tid = int(tid) 55 | if not tid == tid_last: 56 | tid_curr += 1 57 | tid_last = tid 58 | x += w / 2 59 | y += h / 2 60 | label_fpath = osp.join(seq_label_root, '{:06d}.txt'.format(fid)) 61 | label_str = '0 {:d} {:.6f} {:.6f} {:.6f} {:.6f}\n'.format( 62 | tid_curr, x / seq_width, y / seq_height, w / seq_width, 63 | h / seq_height) 64 | with open(label_fpath, 'a') as f: 65 | f.write(label_str) 66 | -------------------------------------------------------------------------------- /tools/gen_labels_mot15.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import os 3 | import numpy as np 4 | 5 | 6 | def mkdirs(d): 7 | if not osp.exists(d): 8 | os.makedirs(d) 9 | 10 | 11 | seq_root = '/data/dataset/MOT15/images/train' 12 | label_root = '/data/dataset/MOT15/labels_with_ids/train' 13 | mkdirs(label_root) 14 | #seqs = [s for s in os.listdir(seq_root)] 15 | seqs = ['ADL-Rundle-6', 'ETH-Bahnhof', 'KITTI-13', 'PETS09-S2L1', 'TUD-Stadtmitte', 'ADL-Rundle-8', 'KITTI-17', 16 | 'ETH-Pedcross2', 'ETH-Sunnyday', 'TUD-Campus', 'Venice-2'] 17 | 18 | tid_curr = 0 19 | tid_last = -1 20 | for seq in seqs: 21 | seq_info = open(osp.join(seq_root, seq, 'seqinfo.ini')).read() 22 | seq_width = int(seq_info[seq_info.find('imWidth=') + 8:seq_info.find('\nimHeight')]) 23 | seq_height = int(seq_info[seq_info.find('imHeight=') + 9:seq_info.find('\nimExt')]) 24 | 25 | gt_txt = osp.join(seq_root, seq, 'gt', 'gt.txt') 26 | gt = np.loadtxt(gt_txt, dtype=np.float64, delimiter=',') 27 | idx = np.lexsort(gt.T[:2, :]) 28 | gt = gt[idx, :] 29 | 30 | seq_label_root = osp.join(label_root, seq, 'img1') 31 | mkdirs(seq_label_root) 32 | 33 | for fid, tid, x, y, w, h, mark, _, _, _ in gt: 34 | if mark == 0: 35 | continue 36 | fid = int(fid) 37 | tid = int(tid) 38 | if not tid == tid_last: 39 | tid_curr += 1 40 | tid_last = tid 41 | x += w / 2 42 | y += h / 2 43 | label_fpath = osp.join(seq_label_root, '{:06d}.txt'.format(fid)) 44 | label_str = '0 {:d} {:.6f} {:.6f} {:.6f} {:.6f}\n'.format( 45 | tid_curr, x / seq_width, y / seq_height, w / seq_width, h / seq_height) 46 | with open(label_fpath, 'a') as f: 47 | f.write(label_str) -------------------------------------------------------------------------------- /tools/run_dist_launch.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 6 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | # ------------------------------------------------------------------------ 8 | # Modified from DETR (https://github.com/facebookresearch/detr) 9 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 10 | # ------------------------------------------------------------------------ 11 | 12 | 13 | set -x 14 | 15 | GPUS=$1 16 | RUN_COMMAND=${@:2} 17 | if [ $GPUS -lt 8 ]; then 18 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 19 | else 20 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 21 | fi 22 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 23 | MASTER_PORT=${MASTER_PORT:-"29500"} 24 | NODE_RANK=${NODE_RANK:-0} 25 | 26 | let "NNODES=GPUS/GPUS_PER_NODE" 27 | 28 | python3 ./tools/launch.py \ 29 | --nnodes ${NNODES} \ 30 | --node_rank ${NODE_RANK} \ 31 | --master_addr ${MASTER_ADDR} \ 32 | --master_port ${MASTER_PORT} \ 33 | --nproc_per_node ${GPUS_PER_NODE} \ 34 | ${RUN_COMMAND} -------------------------------------------------------------------------------- /tools/run_dist_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 4 | # ------------------------------------------------------------------------ 5 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 6 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | # ------------------------------------------------------------------------ 8 | # Modified from DETR (https://github.com/facebookresearch/detr) 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 10 | # ------------------------------------------------------------------------ 11 | 12 | 13 | set -x 14 | 15 | PARTITION=$1 16 | JOB_NAME=$2 17 | GPUS=$3 18 | RUN_COMMAND=${@:4} 19 | if [ $GPUS -lt 8 ]; then 20 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 21 | else 22 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 23 | fi 24 | CPUS_PER_TASK=${CPUS_PER_TASK:-4} 25 | SRUN_ARGS=${SRUN_ARGS:-""} 26 | 27 | srun -p ${PARTITION} \ 28 | --job-name=${JOB_NAME} \ 29 | --gres=gpu:${GPUS_PER_NODE} \ 30 | --ntasks=${GPUS} \ 31 | --ntasks-per-node=${GPUS_PER_NODE} \ 32 | --cpus-per-task=${CPUS_PER_TASK} \ 33 | --kill-on-bad-exit=1 \ 34 | ${SRUN_ARGS} \ 35 | ${RUN_COMMAND} 36 | 37 | -------------------------------------------------------------------------------- /tools/visualize_validation_gt_mot17.py: -------------------------------------------------------------------------------- 1 | import os 2 | from visualization_tool import Visualizer 3 | import cv2 4 | import pycocotools.coco as coco 5 | import pdb 6 | from collections import defaultdict 7 | 8 | dataset_root = "/data/dataset/mot" 9 | ann_path = os.path.join(dataset_root, "annotations", "val_half.json") 10 | img_dir = os.path.join(dataset_root, "train") 11 | output_path = "/data/cache" 12 | os.makedirs(output_path, exist_ok=True) 13 | 14 | coco_obj = coco.COCO(ann_path) 15 | video_info = coco_obj.dataset["videos"] 16 | VidtoVname = {} 17 | for v_info in video_info: 18 | VidtoVname[v_info["id"]] = v_info["file_name"] 19 | video_to_images = defaultdict(list) 20 | for image in coco_obj.dataset["images"]: 21 | if image["video_id"] not in VidtoVname.keys(): 22 | continue 23 | video_to_images[VidtoVname[image["video_id"]]].append(image) 24 | image_id_to_filename = {} 25 | for one_image in coco_obj.dataset["images"]: 26 | image_id_to_filename[one_image["id"]] = one_image["file_name"] 27 | 28 | image_file_name_to_anns = defaultdict(list) 29 | for anns in coco_obj.dataset["annotations"]: 30 | image_file_name = image_id_to_filename[anns["image_id"]] 31 | image_file_name_to_anns[image_file_name].append(anns) 32 | for video_id in video_to_images: 33 | print(f"Visualizing video: {video_id} ...") 34 | visualizer = Visualizer() 35 | for idx, image_d in enumerate(video_to_images[video_id]): 36 | print(f"Stepping frame {idx} / {len(video_to_images[video_id])} ...") 37 | img_path = os.path.join(img_dir, image_d["file_name"]) 38 | assert os.path.exists(img_path), f"{img_path} does not exist!" 
39 | img = cv2.imread(img_path) 40 | visualizer.add_img(img, img_id=idx) 41 | anns = image_file_name_to_anns[image_d["file_name"]] 42 | for jdx, cur_anns in enumerate(anns): 43 | track_id = cur_anns["track_id"] 44 | bbox = [ 45 | cur_anns["bbox"][0], 46 | cur_anns["bbox"][1], 47 | cur_anns["bbox"][0] + cur_anns["bbox"][2], 48 | cur_anns["bbox"][1] + cur_anns["bbox"][3], 49 | ] 50 | if track_id > 100000: 51 | track_id -= 100000 52 | visualizer.add_coco_bbox(bbox, 0, conf=track_id, add_txt="", img_id=idx) 53 | visualizer.save_video(path=output_path, name=video_id) 54 | 55 | -------------------------------------------------------------------------------- /tracker/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/tracker/.DS_Store -------------------------------------------------------------------------------- /tracker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/tracker/__init__.py -------------------------------------------------------------------------------- /tracker/byte_tracker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/tracker/byte_tracker/__init__.py -------------------------------------------------------------------------------- /tracker/byte_tracker/mot_online/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/tracker/byte_tracker/mot_online/__init__.py -------------------------------------------------------------------------------- /tracker/byte_tracker/mot_online/basetrack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import OrderedDict 3 | 4 | 5 | class TrackState(object): 6 | New = 0 7 | Tracked = 1 8 | Lost = 2 9 | Removed = 3 10 | 11 | 12 | class BaseTrack(object): 13 | _count = 0 14 | 15 | track_id = 0 16 | is_activated = False 17 | state = TrackState.New 18 | 19 | history = OrderedDict() 20 | features = [] 21 | curr_feature = None 22 | score = 0 23 | start_frame = 0 24 | frame_id = 0 25 | time_since_update = 0 26 | 27 | # multi-camera 28 | location = (np.inf, np.inf) 29 | 30 | @property 31 | def end_frame(self): 32 | return self.frame_id 33 | 34 | @staticmethod 35 | def next_id(): 36 | BaseTrack._count += 1 37 | return BaseTrack._count 38 | 39 | def activate(self, *args): 40 | raise NotImplementedError 41 | 42 | def predict(self): 43 | raise NotImplementedError 44 | 45 | def update(self, *args, **kwargs): 46 | raise NotImplementedError 47 | 48 | def mark_lost(self): 49 | self.state = TrackState.Lost 50 | 51 | def mark_removed(self): 52 | self.state = TrackState.Removed 53 | -------------------------------------------------------------------------------- /tracker/common/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/tracker/common/__init__.py -------------------------------------------------------------------------------- /tracker/common/track_structure_transfer.py: -------------------------------------------------------------------------------- 1 | from os import curdir 2 | import pdb 3 | import numpy as np 4 | 5 | def frame_first_to_id_first(frame_first): 6 | """ 7 | Frame first result: {Frame ID: a list of [x1, y1, x2, y2, score, ...]} 8 | Track first result: {Track ID: {Frame ID: [x1, y1, x2, y2, score, ...]}} 9 | """ 10 | results = {} 11 | for frameid, bbs in frame_first.items(): 12 | for one_bb in bbs: 13 | x1, y1, x2, y2, score, cur_id = one_bb[0], one_bb[1], one_bb[2], one_bb[3], one_bb[4], one_bb[5] 14 | if cur_id not in results: 15 | results[cur_id] = {} 16 | results[cur_id][frameid] = np.array([x1, y1, x2, y2, score]) 17 | return results 18 | 19 | 20 | def id_first_to_frame_first(id_first): 21 | """ 22 | Frame first result: {Frame ID: a list of [x1, y1, x2, y2, score, ...]} 23 | Track first result: {Track ID: {Frame ID: [x1, y1, x2, y2, score, ...]}} 24 | """ 25 | results = {} 26 | for i, track in id_first.items(): 27 | for frame, bb in track.items(): 28 | if frame not in results: 29 | results[frame] = [] 30 | x1 = bb[0] 31 | y1 = bb[1] 32 | x2 = bb[2] 33 | y2 = bb[3] 34 | score = bb[4] 35 | results[frame].append([x1, y1, x2, y2, score, i+1]) 36 | return results -------------------------------------------------------------------------------- /tracker/d2_p3aformer/write_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import csv 4 | from shutil import copyfile 5 | 6 | 7 | def write_results(all_tracks, out_dir, seq_name=None, frame_offset=0, verbose=False): 8 | output_dir = out_dir + "/txt/" 9 | """Write the tracks in the format for MOT16/MOT17 submission 10 | all_tracks: dictionary with 1 dictionary for every track with {..., i:np.array([x1,y1,x2,y2]), ...} at key track_num if frame_first=False, 11 | Each file contains these lines: 12 | , , , , , , , , , 13 | """ 14 | # format_str = "{}, -1, {}, {}, {}, {}, {}, -1, -1, -1" 15 | assert seq_name is not None, "[!] No seq_name, probably using combined database" 16 | if not os.path.exists(output_dir): 17 | os.makedirs(output_dir) 18 | save_path = osp.join(output_dir, seq_name + ".txt") 19 | with open(save_path, "w") as of: 20 | writer = csv.writer(of, delimiter=",") 21 | for i in sorted(all_tracks): 22 | track = all_tracks[i] 23 | for frame, bb in track.items(): 24 | x1 = bb[0] 25 | y1 = bb[1] 26 | x2 = bb[2] 27 | y2 = bb[3] 28 | writer.writerow( 29 | [ 30 | frame + frame_offset, 31 | i + 1, 32 | x1 + 1, 33 | y1 + 1, 34 | x2 - x1 + 1, 35 | y2 - y1 + 1, 36 | -1, 37 | -1, 38 | -1, 39 | -1, 40 | ] 41 | ) 42 | # TODO: validate this in MOT15 43 | # copy to FRCNN, DPM.txt, private setting 44 | copyfile(save_path, save_path[:-7] + "FRCNN.txt") 45 | copyfile(save_path, save_path[:-7] + "DPM.txt") 46 | if verbose: 47 | print("Write txt results at: ", save_path, ".") 48 | return save_path 49 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 
3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | 12 | """ 13 | Utilities for bounding box manipulation and GIoU. 14 | """ 15 | import torch 16 | from torchvision.ops.boxes import box_area 17 | 18 | 19 | def box_cxcywh_to_xyxy(x): 20 | x_c, y_c, w, h = x.unbind(-1) 21 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 22 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 23 | return torch.stack(b, dim=-1) 24 | 25 | 26 | def box_xyxy_to_cxcywh(x): 27 | x0, y0, x1, y1 = x.unbind(-1) 28 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 29 | (x1 - x0), (y1 - y0)] 30 | return torch.stack(b, dim=-1) 31 | 32 | 33 | # modified from torchvision to also return the union 34 | def box_iou(boxes1, boxes2): 35 | area1 = box_area(boxes1) 36 | area2 = box_area(boxes2) 37 | 38 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 39 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 40 | 41 | wh = (rb - lt).clamp(min=0) # [N,M,2] 42 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 43 | 44 | union = area1[:, None] + area2 - inter 45 | 46 | iou = inter / union 47 | return iou, union 48 | 49 | 50 | def generalized_box_iou(boxes1, boxes2): 51 | """ 52 | Generalized IoU from https://giou.stanford.edu/ 53 | 54 | The boxes should be in [x0, y0, x1, y1] format 55 | 56 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 57 | and M = len(boxes2) 58 | """ 59 | # degenerate boxes gives inf / nan results 60 | # so do an early check 61 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 62 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 63 | iou, union = box_iou(boxes1, boxes2) 64 | 65 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 66 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 67 | 68 | wh = (rb - lt).clamp(min=0) # [N,M,2] 69 | area = wh[:, :, 0] * wh[:, :, 1] 70 | 71 | return iou - (area - union) / area 72 | 73 | 74 | def masks_to_boxes(masks): 75 | """Compute the bounding boxes around the provided masks 76 | 77 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
78 | 79 | Returns a [N, 4] tensors, with the boxes in xyxy format 80 | """ 81 | if masks.numel() == 0: 82 | return torch.zeros((0, 4), device=masks.device) 83 | 84 | h, w = masks.shape[-2:] 85 | 86 | y = torch.arange(0, h, dtype=torch.float) 87 | x = torch.arange(0, w, dtype=torch.float) 88 | y, x = torch.meshgrid(y, x) 89 | 90 | x_mask = (masks * x.unsqueeze(0)) 91 | x_max = x_mask.flatten(1).max(-1)[0] 92 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 93 | 94 | y_mask = (masks * y.unsqueeze(0)) 95 | y_max = y_mask.flatten(1).max(-1)[0] 96 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 97 | 98 | return torch.stack([x_min, y_min, x_max, y_max], 1) 99 | -------------------------------------------------------------------------------- /util/p3aformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/ECCV22-P3AFormer-Tracking-Objects-as-Pixel-wise-Distributions/673d34698188e23e18e8ac920ec229ee79e67d71/util/p3aformer/__init__.py -------------------------------------------------------------------------------- /util/system.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import glob 4 | 5 | def remove_files_under_folder(folder, select_str): 6 | files = glob.glob(os.path.join(folder, '*')) 7 | for f in files: 8 | if os.path.isdir(f): 9 | continue 10 | if select_str is not None and select_str in f: 11 | os.remove(f) 12 | return -------------------------------------------------------------------------------- /util/tool.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Copyright (c) 2021 megvii-model. All Rights Reserved. 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # ------------------------------------------------------------------------ 7 | # Modified from DETR (https://github.com/facebookresearch/detr) 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | # ------------------------------------------------------------------------ 10 | 11 | import torch 12 | import numpy as np 13 | 14 | 15 | def load_model(model, model_path, optimizer=None, resume=False, 16 | lr=None, lr_step=None): 17 | start_epoch = 0 18 | checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) 19 | print(f'loaded {model_path}') 20 | state_dict = checkpoint['model'] 21 | model_state_dict = model.state_dict() 22 | 23 | # check loaded parameters and created model parameters 24 | msg = 'If you see this, your model does not fully load the ' + \ 25 | 'pre-trained weight. Please make sure ' + \ 26 | 'you set the correct --num_classes for your own dataset.' 27 | for k in state_dict: 28 | if k in model_state_dict: 29 | if state_dict[k].shape != model_state_dict[k].shape: 30 | print('Skip loading parameter {}, required shape{}, ' \ 31 | 'loaded shape{}. 
{}'.format( 32 | k, model_state_dict[k].shape, state_dict[k].shape, msg)) 33 | if 'class_embed' in k: 34 | print("load class_embed: {} shape={}".format(k, state_dict[k].shape)) 35 | if model_state_dict[k].shape[0] == 1: 36 | state_dict[k] = state_dict[k][1:2] 37 | elif model_state_dict[k].shape[0] == 2: 38 | state_dict[k] = state_dict[k][1:3] 39 | elif model_state_dict[k].shape[0] == 3: 40 | state_dict[k] = state_dict[k][1:4] 41 | else: 42 | raise NotImplementedError('invalid shape: {}'.format(model_state_dict[k].shape)) 43 | continue 44 | state_dict[k] = model_state_dict[k] 45 | else: 46 | print('Drop parameter {}.'.format(k) + msg) 47 | for k in model_state_dict: 48 | if not (k in state_dict): 49 | print('No param {}.'.format(k) + msg) 50 | state_dict[k] = model_state_dict[k] 51 | model.load_state_dict(state_dict, strict=False) 52 | 53 | # resume optimizer parameters 54 | if optimizer is not None and resume: 55 | if 'optimizer' in checkpoint: 56 | optimizer.load_state_dict(checkpoint['optimizer']) 57 | start_epoch = checkpoint['epoch'] 58 | start_lr = lr 59 | for step in lr_step: 60 | if start_epoch >= step: 61 | start_lr *= 0.1 62 | for param_group in optimizer.param_groups: 63 | param_group['lr'] = start_lr 64 | print('Resumed optimizer with start lr', start_lr) 65 | else: 66 | print('No optimizer parameters in checkpoint.') 67 | if optimizer is not None: 68 | return model, optimizer, start_epoch 69 | else: 70 | return model 71 | 72 | 73 | 74 | --------------------------------------------------------------------------------
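A minimal sketch of how the heatmap helpers in models/p3aformer/p3aformer_post_processing/utils.py compose into a CenterNet-style decode, assuming dummy tensors in place of real network outputs and that the repository root is on PYTHONPATH:

import torch
from models.p3aformer.p3aformer_post_processing.utils import (
    _sigmoid,
    _nms,
    _topk,
    _tranpose_and_gather_feat,
)

# dummy single-class center heatmap and 2-channel offset map, shape (B, C, H, W)
hm = torch.randn(1, 1, 152, 272)
reg = torch.randn(1, 2, 152, 272)

heat = _sigmoid(hm)                               # clamp logits into (1e-4, 1 - 1e-4)
heat = _nms(heat, kernel=3)                       # keep only local maxima (3x3 max-pool trick)
scores, inds, clses, ys, xs = _topk(heat, K=100)  # top-K peak scores, indices and coordinates
offsets = _tranpose_and_gather_feat(reg, inds)    # (1, 100, 2) regression values gathered at the peaks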
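Similarly, a minimal sketch of how tracker/common/track_structure_transfer.py and tracker/d2_p3aformer/write_results.py fit together, with made-up boxes and placeholder output paths (the sequence name below is only an example):

from tracker.common.track_structure_transfer import frame_first_to_id_first
from tracker.d2_p3aformer.write_results import write_results

# frame-first layout: {frame_id: [[x1, y1, x2, y2, score, track_id], ...]}
frame_first = {
    0: [[100.0, 200.0, 150.0, 320.0, 0.9, 1]],
    1: [[104.0, 202.0, 154.0, 322.0, 0.8, 1]],
}

# track-first layout expected by write_results:
# {track_id: {frame_id: np.array([x1, y1, x2, y2, score])}}
all_tracks = frame_first_to_id_first(frame_first)

# writes <out_dir>/txt/MOT17-02-SDP.txt (plus the FRCNN/DPM copies) in the
# MOTChallenge format: frame, id, bb_left, bb_top, bb_width, bb_height, ...
write_results(all_tracks, out_dir="/tmp/p3aformer_demo", seq_name="MOT17-02-SDP", verbose=True)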