├── requirements.txt
├── mmcv_custom
│   ├── __init__.py
│   └── runner
│       ├── __init__.py
│       ├── checkpoint.py
│       └── epoch_based_runner.py
├── datasets
│   ├── torchvision_datasets
│   │   ├── __init__.py
│   │   └── coco.py
│   ├── __init__.py
│   ├── panoptic_eval.py
│   ├── data_prefetcher.py
│   ├── coco_panoptic.py
│   ├── samplers.py
│   ├── coco.py
│   ├── transforms.py
│   └── coco_eval.py
├── configs
│   ├── one_stage
│   │   └── deformable-detr-baseline
│   │       └── 50eps
│   │           ├── r50_deformable_detr.sh
│   │           ├── r50_deformable_detr_single_scale.sh
│   │           ├── r50_deformable_detr_single_scale_dc5.sh
│   │           └── r50_deformable_detr_plus_iterative_bbox_refinement.sh
│   └── two_stage
│       ├── deformable-detr-baseline
│       │   ├── 12eps
│       │   │   ├── r50_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │   ├── r50_n1800_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │   ├── r50_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │   └── swin
│       │   │       ├── swin_tiny_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │       ├── swin_large_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │       ├── swin_tiny_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │       └── swin_large_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   ├── 24eps
│       │   │   ├── r50_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │   ├── r50_n1800_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │   └── r50_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   ├── 36eps
│       │   │   ├── r50_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │   ├── r50_n1800_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │   ├── r50_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │   └── swin
│       │   │       ├── swin_tiny_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │       ├── drop_path0.5_swin_large_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │       ├── swin_tiny_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   │       └── drop_path0.5_swin_large_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │   └── 50eps
│       │       ├── r50_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │       ├── r50_n1800_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       │       └── r50_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│       └── deformable-detr-hybrid-branch
│           ├── 12eps
│           │   ├── r50_hybrid_branch_lambda1_group6_t1500_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda1_group1_t300_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda1_group2_t600_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda1_group3_t900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda1_group6_t300_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda1_group6_t600_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda1_group6_t900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda1_group4_t1200_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda1_group5_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda1_group6_t1200_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda1_group6_t1800_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda2_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda5_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda0.1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda0.2_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r50_hybrid_branch_lambda0.5_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   ├── r101_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   └── swin
│           │       ├── swin_small_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │       ├── swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │       ├── swin_large_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │       ├── swin_small_22k_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │       ├── swin_large_hybrid_branch_lambda1_group6_t1500_n900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │       └── decay0.05_swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           ├── 24eps
│           │   ├── r50_hybrid_branch_lambda1_group6_t1500_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           │   └── r50_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│           └── 36eps
│               ├── r50_hybrid_branch_lambda1_group6_t1500_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│               ├── r50_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│               ├── r101_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│               └── swin
│                   ├── swin_small_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│                   ├── swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│                   ├── swin_small_22k_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│                   ├── decay0.05_swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│                   ├── drop_path0.5_swin_large_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│                   ├── drop_path0.5_swin_large_hybrid_branch_lambda1_group6_t1500_n900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
│                   └── decay0.05_drop_path0.5_swin_large_hybrid_branch_lambda1_group6_t1500_n900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
├── util
│   ├── __init__.py
│   ├── box_ops.py
│   └── plot_utils.py
├── models
│   ├── ops
│   │   ├── make.sh
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   └── ms_deform_attn.py
│   │   ├── functions
│   │   │   ├── __init__.py
│   │   │   └── ms_deform_attn_func.py
│   │   ├── src
│   │   │   ├── vision.cpp
│   │   │   ├── cuda
│   │   │   │   ├── ms_deform_attn_cuda.h
│   │   │   │   └── ms_deform_attn_cuda.cu
│   │   │   ├── cpu
│   │   │   │   ├── ms_deform_attn_cpu.h
│   │   │   │   └── ms_deform_attn_cpu.cpp
│   │   │   └── ms_deform_attn.h
│   │   ├── setup.py
│   │   └── test.py
│   ├── __init__.py
│   ├── position_encoding.py
│   ├── matcher.py
│   ├── backbone.py
│   └── segmentation.py
├── tools
│   ├── run_dist_launch.sh
│   ├── run_dist_slurm.sh
│   └── launch.py
├── LICENSE
├── benchmark.py
└── engine.py

/requirements.txt:
--------------------------------------------------------------------------------
pycocotools
tqdm
cython
scipy
wandb
timm
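
requirements.txt pins only the auxiliary Python packages; PyTorch and
torchvision are not listed and are assumed to be installed separately. A
typical setup sketch:

# Install the auxiliary dependencies (PyTorch/torchvision assumed present).
pip install -r requirements.txt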
--------------------------------------------------------------------------------
/mmcv_custom/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from .checkpoint import load_checkpoint

__all__ = ["load_checkpoint"]
--------------------------------------------------------------------------------
/mmcv_custom/runner/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Open-MMLab. All rights reserved.
from .checkpoint import save_checkpoint
from .epoch_based_runner import EpochBasedRunnerAmp


__all__ = ["EpochBasedRunnerAmp", "save_checkpoint"]
--------------------------------------------------------------------------------
/datasets/torchvision_datasets/__init__.py:
--------------------------------------------------------------------------------
# ------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------

from .coco import CocoDetection
--------------------------------------------------------------------------------
/configs/one_stage/deformable-detr-baseline/50eps/r50_deformable_detr.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/one_stage/deformable-detr-baseline/50eps/r50_deformable_detr
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 50 \
    --lr_drop 40 \
    ${PY_ARGS}
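
Each config invokes main.py directly, and PY_ARGS=${@:1} forwards any extra
command-line flags to it. A sketch of a typical single-node, 8-GPU launch
that wraps the config with tools/run_dist_launch.sh (included later in this
listing); the --coco_path flag is an assumption carried over from the
Deformable DETR main.py this code base derives from:

# Single-node launch; trailing flags reach main.py through ${PY_ARGS}.
GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 8 \
    ./configs/one_stage/deformable-detr-baseline/50eps/r50_deformable_detr.sh \
    --coco_path ./data/coco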
--------------------------------------------------------------------------------
/configs/one_stage/deformable-detr-baseline/50eps/r50_deformable_detr_single_scale.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/one_stage/deformable-detr-baseline/50eps/r50_deformable_detr_single_scale
PY_ARGS=${@:1}

python -u main.py \
    --num_feature_levels 1 \
    --output_dir ${EXP_DIR} \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 50 \
    --lr_drop 40 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/one_stage/deformable-detr-baseline/50eps/r50_deformable_detr_single_scale_dc5.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/one_stage/deformable-detr-baseline/50eps/r50_deformable_detr_single_scale_dc5
PY_ARGS=${@:1}

python -u main.py \
    --num_feature_levels 1 \
    --dilation \
    --output_dir ${EXP_DIR} \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 50 \
    --lr_drop 40 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
# ------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
--------------------------------------------------------------------------------
/configs/one_stage/deformable-detr-baseline/50eps/r50_deformable_detr_plus_iterative_bbox_refinement.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/one_stage/deformable-detr-baseline/50eps/r50_deformable_detr_plus_iterative_bbox_refinement
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 50 \
    --lr_drop 40 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/12eps/r50_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/12eps/r50_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 12 \
    --lr_drop 11 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/24eps/r50_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/24eps/r50_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 24 \
    --lr_drop 20 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/36eps/r50_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/36eps/r50_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 36 \
    --lr_drop 30 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/50eps/r50_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/50eps/r50_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 50 \
    --lr_drop 40 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/12eps/r50_n1800_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/12eps/r50_n1800_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 1800 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 12 \
    --lr_drop 11 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/24eps/r50_n1800_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/24eps/r50_n1800_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 1800 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 24 \
    --lr_drop 20 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/36eps/r50_n1800_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/36eps/r50_n1800_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 1800 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 36 \
    --lr_drop 30 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/50eps/r50_n1800_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/50eps/r50_n1800_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 1800 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 50 \
    --lr_drop 40 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/models/ops/make.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

python setup.py build install
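
make.sh builds the multi-scale deformable attention extension defined under
models/ops/src. A minimal build-and-verify sequence, assuming a CUDA
toolchain that matches the installed PyTorch; test.py (listed in the tree
above) exercises the compiled op:

# Build and install the MSDeformAttn extension, then sanity-check it.
cd models/ops
bash make.sh        # runs: python setup.py build install
python test.py
cd ../..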
--------------------------------------------------------------------------------
/models/ops/modules/__init__.py:
--------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

from .ms_deform_attn import MSDeformAttn
--------------------------------------------------------------------------------
/models/ops/functions/__init__.py:
--------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

from .ms_deform_attn_func import MSDeformAttnFunction
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
# ------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------

from .deformable_detr import build


def build_model(args):
    return build(args)
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/12eps/r50_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/12eps/r50_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 12 \
    --lr_drop 11 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/24eps/r50_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/24eps/r50_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 24 \
    --lr_drop 20 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/36eps/r50_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/36eps/r50_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 36 \
    --lr_drop 30 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/50eps/r50_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/50eps/r50_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 50 \
    --lr_drop 40 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t1500_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t1500_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    ${PY_ARGS}
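
As the flag names suggest, the hybrid-branch configs keep the 300
one-to-one queries and add a one-to-many branch: the file-name pattern
lambda<L>_group<K>_t<T> maps onto --lambda_one2many, --k_one2many and
--num_queries_one2many. A small illustrative sketch of that mapping; the
helper is hypothetical and not part of this repository:

# Derive the hybrid-branch flags from a config-name fragment such as
# "lambda1_group6_t1500" (the convention used by the file names above).
name=lambda1_group6_t1500
L=$(echo ${name} | sed -E 's/^lambda([0-9.]+)_.*/\1/')
K=$(echo ${name} | sed -E 's/.*group([0-9]+)_.*/\1/')
T=$(echo ${name} | sed -E 's/.*_t([0-9]+)$/\1/')
echo "--lambda_one2many ${L} --k_one2many ${K} --num_queries_one2many ${T}"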
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/24eps/r50_hybrid_branch_lambda1_group6_t1500_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/24eps/r50_hybrid_branch_lambda1_group6_t1500_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 24 \
    --lr_drop 20 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/36eps/r50_hybrid_branch_lambda1_group6_t1500_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/36eps/r50_hybrid_branch_lambda1_group6_t1500_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 36 \
    --lr_drop 30 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/12eps/swin/swin_tiny_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/12eps/swin/swin_tiny_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 12 \
    --lr_drop 11 \
    --backbone swin_tiny \
    --pretrained_backbone_path /mnt/pretrained_backbone/swin_tiny_patch4_window7_224.pth \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/36eps/swin/swin_tiny_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/36eps/swin/swin_tiny_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 36 \
    --lr_drop 30 \
    --backbone swin_tiny \
    --pretrained_backbone_path /mnt/pretrained_backbone/swin_tiny_patch4_window7_224.pth \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/12eps/swin/swin_large_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/12eps/swin/swin_large_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 12 \
    --lr_drop 11 \
    --backbone swin_large \
    --pretrained_backbone_path /mnt/pretrained_backbone/swin_large_patch4_window7_224_22k.pth \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/36eps/swin/drop_path0.5_swin_large_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/36eps/swin/drop_path0.5_swin_large_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 36 \
    --lr_drop 30 \
    --backbone swin_large \
    --pretrained_backbone_path /mnt/pretrained_backbone/swin_large_patch4_window7_224_22k.pth \
    --drop_path_rate 0.5 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group1_t300_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group1_t300_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 300 \
    --k_one2many 1 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group2_t600_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group2_t600_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 600 \
    --k_one2many 2 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group3_t900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group3_t900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 900 \
    --k_one2many 3 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t300_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t300_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 300 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t600_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t600_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 600 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 900 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group4_t1200_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group4_t1200_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1200 \
    --k_one2many 4 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group5_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group5_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 5 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
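
The 12-epoch configs above sweep the one-to-many query count (t300-t1800)
and the duplication factor k (group1-group6). A sketch of driving that grid
sequentially with the launcher; the paths follow the file names listed in
this repository:

# Run the (k_one2many, num_queries_one2many) ablation on 8 GPUs.
CFG_DIR=configs/two_stage/deformable-detr-hybrid-branch/12eps
for kt in 1:300 2:600 3:900 4:1200 5:1500 6:300 6:600 6:900 6:1200 6:1500 6:1800; do
    k=${kt%%:*}
    t=${kt##*:}
    GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 8 \
        ${CFG_DIR}/r50_hybrid_branch_lambda1_group${k}_t${t}_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh
done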
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t1200_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t1200_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1200 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t1800_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda1_group6_t1800_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1800 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda2_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda2_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 2.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda5_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda5_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 5.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/24eps/r50_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/24eps/r50_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 24 \
    --lr_drop 20 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/36eps/r50_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/36eps/r50_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 36 \
    --lr_drop 30 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda0.1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda0.1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 0.1 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda0.2_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda0.2_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 0.2 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda0.5_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r50_hybrid_branch_lambda0.5_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 0.5 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/12eps/swin/swin_tiny_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/12eps/swin/swin_tiny_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 12 \
    --lr_drop 11 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    --backbone swin_tiny \
    --pretrained_backbone_path /mnt/pretrained_backbone/swin_tiny_patch4_window7_224.pth \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/36eps/swin/swin_tiny_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/36eps/swin/swin_tiny_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 36 \
    --lr_drop 30 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    --backbone swin_tiny \
    --pretrained_backbone_path /mnt/pretrained_backbone/swin_tiny_patch4_window7_224.pth \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/r101_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/r101_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    --backbone resnet101 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/12eps/swin/swin_large_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/12eps/swin/swin_large_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 12 \
    --lr_drop 11 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    --backbone swin_large \
    --pretrained_backbone_path /mnt/pretrained_backbone/swin_large_patch4_window7_224_22k.pth \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/36eps/r101_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/36eps/r101_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 36 \
    --lr_drop 30 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    --backbone resnet101 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/models/ops/src/vision.cpp:
--------------------------------------------------------------------------------
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

#include "ms_deform_attn.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
  m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-baseline/36eps/swin/drop_path0.5_swin_large_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-baseline/36eps/swin/drop_path0.5_swin_large_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --num_queries_one2one 300 \
    --num_queries_one2many 0 \
    --k_one2many 0 \
    --epochs 36 \
    --lr_drop 30 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    --backbone swin_large \
    --pretrained_backbone_path /mnt/pretrained_backbone/swin_large_patch4_window7_224_22k.pth \
    --drop_path_rate 0.5 \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/tools/run_dist_launch.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# ------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------

set -x

GPUS=$1
RUN_COMMAND=${@:2}
if [ $GPUS -lt 8 ]; then
    GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS}
else
    GPUS_PER_NODE=${GPUS_PER_NODE:-8}
fi
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
MASTER_PORT=${MASTER_PORT:-"29500"}
NODE_RANK=${NODE_RANK:-0}

let "NNODES=GPUS/GPUS_PER_NODE"

python ./tools/launch.py \
    --nnodes ${NNODES} \
    --node_rank ${NODE_RANK} \
    --master_addr ${MASTER_ADDR} \
    --master_port ${MASTER_PORT} \
    --nproc_per_node ${GPUS_PER_NODE} \
    ${RUN_COMMAND}
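
run_dist_launch.sh computes NNODES from GPUS/GPUS_PER_NODE and forwards the
rendezvous settings to tools/launch.py, so multi-node training only needs
the environment variables above overridden. A sketch for two 8-GPU nodes;
the master address is a placeholder and <config>.sh stands for any of the
config scripts in this listing:

# On node 0:
MASTER_ADDR=10.0.0.1 NODE_RANK=0 GPUS_PER_NODE=8 \
    ./tools/run_dist_launch.sh 16 <config>.sh

# On node 1, run the same command with NODE_RANK=1:
MASTER_ADDR=10.0.0.1 NODE_RANK=1 GPUS_PER_NODE=8 \
    ./tools/run_dist_launch.sh 16 <config>.sh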
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/12eps/swin/swin_small_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/swin/swin_small_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 12 \
    --lr_drop 11 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    --backbone swin_small \
    --pretrained_backbone_path /mnt/pretrained_backbone/swin_small_patch4_window7_224.pth \
    ${PY_ARGS}
--------------------------------------------------------------------------------
/configs/two_stage/deformable-detr-hybrid-branch/36eps/swin/swin_small_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

set -x

EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/36eps/swin/swin_small_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage
PY_ARGS=${@:1}

python -u main.py \
    --output_dir ${EXP_DIR} \
    --with_box_refine \
    --two_stage \
    --dim_feedforward 2048 \
    --epochs 36 \
    --lr_drop 30 \
    --num_queries_one2one 300 \
    --num_queries_one2many 1500 \
    --k_one2many 6 \
    --lambda_one2many 1.0 \
    --dropout 0.0 \
    --mixed_selection \
    --look_forward_twice \
    --backbone swin_small \
    --pretrained_backbone_path /mnt/pretrained_backbone/swin_small_patch4_window7_224.pth \
    ${PY_ARGS}
-------------------------------------------------------------------------------- /configs/two_stage/deformable-detr-hybrid-branch/12eps/swin/swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/swin/swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --dim_feedforward 2048 \ 13 | --epochs 12 \ 14 | --lr_drop 11 \ 15 | --num_queries_one2one 300 \ 16 | --num_queries_one2many 1500 \ 17 | --k_one2many 6 \ 18 | --lambda_one2many 1.0 \ 19 | --dropout 0.0 \ 20 | --mixed_selection \ 21 | --look_forward_twice \ 22 | --backbone swin_tiny \ 23 | --pretrained_backbone_path /mnt/pretrained_backbone/swin_tiny_patch4_window7_224.pth \ 24 | ${PY_ARGS} 25 | -------------------------------------------------------------------------------- /configs/two_stage/deformable-detr-hybrid-branch/36eps/swin/swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/36eps/swin/swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --dim_feedforward 2048 \ 13 | --epochs 36 \ 14 | --lr_drop 30 \ 15 | --num_queries_one2one 300 \ 16 | --num_queries_one2many 1500 \ 17 | --k_one2many 6 \ 18 | --lambda_one2many 1.0 \ 19 | --dropout 0.0 \ 20 | --mixed_selection \ 21 | --look_forward_twice \ 22 | --backbone swin_tiny \ 23 | --pretrained_backbone_path /mnt/pretrained_backbone/swin_tiny_patch4_window7_224.pth \ 24 | ${PY_ARGS} 25 | -------------------------------------------------------------------------------- /configs/two_stage/deformable-detr-hybrid-branch/12eps/swin/swin_large_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/swin/swin_large_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --dim_feedforward 2048 \ 13 | --epochs 12 \ 14 | --lr_drop 11 \ 15 | --num_queries_one2one 300 \ 16 | --num_queries_one2many 1500 \ 17 | --k_one2many 6 \ 18 | --lambda_one2many 1.0 \ 19 | --dropout 0.0 \ 20 | --mixed_selection \ 21 | --look_forward_twice \ 22 | --backbone swin_large \ 23 | --pretrained_backbone_path /mnt/pretrained_backbone/swin_large_patch4_window7_224_22k.pth \ 24 | ${PY_ARGS} 25 |
-------------------------------------------------------------------------------- /configs/two_stage/deformable-detr-hybrid-branch/12eps/swin/swin_small_22k_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/swin/swin_small_22k_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --dim_feedforward 2048 \ 13 | --epochs 12 \ 14 | --lr_drop 11 \ 15 | --num_queries_one2one 300 \ 16 | --num_queries_one2many 1500 \ 17 | --k_one2many 6 \ 18 | --lambda_one2many 1.0 \ 19 | --dropout 0.0 \ 20 | --mixed_selection \ 21 | --look_forward_twice \ 22 | --backbone swin_small \ 23 | --pretrained_backbone_path /mnt/pretrained_backbone/swin_small_patch4_window7_224_22k.pth \ 24 | ${PY_ARGS} 25 | -------------------------------------------------------------------------------- /configs/two_stage/deformable-detr-hybrid-branch/36eps/swin/swin_small_22k_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/36eps/swin/swin_small_22k_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --dim_feedforward 2048 \ 13 | --epochs 36 \ 14 | --lr_drop 30 \ 15 | --num_queries_one2one 300 \ 16 | --num_queries_one2many 1500 \ 17 | --k_one2many 6 \ 18 | --lambda_one2many 1.0 \ 19 | --dropout 0.0 \ 20 | --mixed_selection \ 21 | --look_forward_twice \ 22 | --backbone swin_small \ 23 | --pretrained_backbone_path /mnt/pretrained_backbone/swin_small_patch4_window7_224_22k.pth \ 24 | ${PY_ARGS} 25 | -------------------------------------------------------------------------------- /configs/two_stage/deformable-detr-hybrid-branch/12eps/swin/swin_large_hybrid_branch_lambda1_group6_t1500_n900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/swin/swin_large_hybrid_branch_lambda1_group6_t1500_n900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --dim_feedforward 2048 \ 13 | --epochs 12 \ 14 | --lr_drop 11 \ 15 | --num_queries_one2one 900 \ 16 | --num_queries_one2many 1500 \ 17 | --k_one2many 6 \ 18 | --lambda_one2many 1.0 \ 19 | --dropout 0.0 \ 20 | --mixed_selection \ 21 | --look_forward_twice \ 22 | --backbone swin_large \ 23 | --pretrained_backbone_path /mnt/pretrained_backbone/swin_large_patch4_window7_224_22k.pth \ 24 | ${PY_ARGS} 25 |
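The hybrid-branch flags above pair a one-to-one branch (--num_queries_one2one) with an auxiliary one-to-many branch (--num_queries_one2many), where --k_one2many repeats each ground-truth box for the one-to-many matching and --lambda_one2many weights that branch's loss. A minimal sketch of the weighted combination (illustrative only; not the repository's exact training code):

    # Illustrative: the hybrid objective combines both branch losses,
    # L = L_one2one + lambda_one2many * L_one2many.
    def hybrid_loss(loss_one2one, loss_one2many, lambda_one2many=1.0):
        return loss_one2one + lambda_one2many * loss_one2many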
-------------------------------------------------------------------------------- /configs/two_stage/deformable-detr-hybrid-branch/12eps/swin/decay0.05_swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/12eps/swin/decay0.05_swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --dim_feedforward 2048 \ 13 | --epochs 12 \ 14 | --lr_drop 11 \ 15 | --num_queries_one2one 300 \ 16 | --num_queries_one2many 1500 \ 17 | --k_one2many 6 \ 18 | --lambda_one2many 1.0 \ 19 | --dropout 0.0 \ 20 | --mixed_selection \ 21 | --look_forward_twice \ 22 | --backbone swin_tiny \ 23 | --pretrained_backbone_path /mnt/pretrained_backbone/swin_tiny_patch4_window7_224.pth \ 24 | --weight_decay 0.05 \ 25 | ${PY_ARGS} 26 | -------------------------------------------------------------------------------- /configs/two_stage/deformable-detr-hybrid-branch/36eps/swin/decay0.05_swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/36eps/swin/decay0.05_swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --dim_feedforward 2048 \ 13 | --epochs 36 \ 14 | --lr_drop 30 \ 15 | --num_queries_one2one 300 \ 16 | --num_queries_one2many 1500 \ 17 | --k_one2many 6 \ 18 | --lambda_one2many 1.0 \ 19 | --dropout 0.0 \ 20 | --mixed_selection \ 21 | --look_forward_twice \ 22 | --backbone swin_tiny \ 23 | --pretrained_backbone_path /mnt/pretrained_backbone/swin_tiny_patch4_window7_224.pth \ 24 | --weight_decay 0.05 \ 25 | ${PY_ARGS} 26 |
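Because each config captures PY_ARGS=${@:1} and expands ${PY_ARGS} at the end of the main.py call, any extra flags passed to the script are forwarded to main.py unchanged, e.g. (illustrative; --batch_size is a standard Deformable DETR argument):

    bash configs/two_stage/deformable-detr-hybrid-branch/36eps/swin/decay0.05_swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh --batch_size 2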
-------------------------------------------------------------------------------- /configs/two_stage/deformable-detr-hybrid-branch/36eps/swin/drop_path0.5_swin_large_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/36eps/swin/drop_path0.5_swin_large_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --dim_feedforward 2048 \ 13 | --epochs 36 \ 14 | --lr_drop 30 \ 15 | --num_queries_one2one 300 \ 16 | --num_queries_one2many 1500 \ 17 | --k_one2many 6 \ 18 | --lambda_one2many 1.0 \ 19 | --dropout 0.0 \ 20 | --mixed_selection \ 21 | --look_forward_twice \ 22 | --backbone swin_large \ 23 | --pretrained_backbone_path /mnt/pretrained_backbone/swin_large_patch4_window7_224_22k.pth \ 24 | --drop_path_rate 0.5 \ 25 | ${PY_ARGS} 26 | -------------------------------------------------------------------------------- /configs/two_stage/deformable-detr-hybrid-branch/36eps/swin/drop_path0.5_swin_large_hybrid_branch_lambda1_group6_t1500_n900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/36eps/swin/drop_path0.5_swin_large_hybrid_branch_lambda1_group6_t1500_n900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --dim_feedforward 2048 \ 13 | --epochs 36 \ 14 | --lr_drop 30 \ 15 | --num_queries_one2one 900 \ 16 | --num_queries_one2many 1500 \ 17 | --k_one2many 6 \ 18 | --lambda_one2many 1.0 \ 19 | --dropout 0.0 \ 20 | --mixed_selection \ 21 | --look_forward_twice \ 22 | --backbone swin_large \ 23 | --pretrained_backbone_path /mnt/pretrained_backbone/swin_large_patch4_window7_224_22k.pth \ 24 | --drop_path_rate 0.5 \ 25 | ${PY_ARGS} 26 | -------------------------------------------------------------------------------- /configs/two_stage/deformable-detr-hybrid-branch/36eps/swin/decay0.05_drop_path0.5_swin_large_hybrid_branch_lambda1_group6_t1500_n900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/two_stage/deformable-detr-hybrid-branch/36eps/swin/decay0.05_drop_path0.5_swin_large_hybrid_branch_lambda1_group6_t1500_n900_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --dim_feedforward 2048 \ 13 | --epochs 36 \ 14 | --lr_drop 30 \ 15 | --num_queries_one2one 900 \ 16 | --num_queries_one2many 1500 \ 17 | --k_one2many 6 \ 18 | --lambda_one2many 1.0 \ 19 | --dropout 0.0 \ 20 | --mixed_selection \ 21 | --look_forward_twice \ 22 | --backbone swin_large \ 23 | --pretrained_backbone_path /mnt/pretrained_backbone/swin_large_patch4_window7_224_22k.pth \ 24 | --drop_path_rate 0.5 \ 25 | --weight_decay 0.05 \ 26 | ${PY_ARGS} 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 HDETR-group (Yuhui Yuan, Ding Jia, Haodi He, Xiaopei Wu, Haojun Yu) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 |
-------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector<at::Tensor> 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /tools/run_dist_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # -------------------------------------------------------------------------------------------------------------------------- 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # -------------------------------------------------------------------------------------------------------------------------- 7 | # Modified from https://github.com/open-mmlab/mmdetection/blob/3b53fe15d87860c6941f3dda63c0f27422da6266/tools/slurm_train.sh 8 | # -------------------------------------------------------------------------------------------------------------------------- 9 | 10 | set -x 11 | 12 | PARTITION=$1 13 | JOB_NAME=$2 14 | GPUS=$3 15 | RUN_COMMAND=${@:4} 16 | if [ $GPUS -lt 8 ]; then 17 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 18 | else 19 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 20 | fi 21 | CPUS_PER_TASK=${CPUS_PER_TASK:-4} 22 | SRUN_ARGS=${SRUN_ARGS:-""} 23 | 24 | srun -p ${PARTITION} \ 25 | --job-name=${JOB_NAME} \ 26 | --gres=gpu:${GPUS_PER_NODE} \ 27 | --ntasks=${GPUS} \ 28 | --ntasks-per-node=${GPUS_PER_NODE} \ 29 | --cpus-per-task=${CPUS_PER_TASK} \ 30 | --kill-on-bad-exit=1 \ 31 | ${SRUN_ARGS} \ 32 | ${RUN_COMMAND} 33 | 34 |
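run_dist_slurm.sh mirrors the local launcher for Slurm clusters; per the upstream Deformable DETR convention, a typical invocation looks like this (partition and job name are placeholders):

    GPUS_PER_NODE=8 ./tools/run_dist_slurm.sh <partition> hdetr_job 8 ./configs/two_stage/deformable-detr-hybrid-branch/12eps/swin/swin_tiny_hybrid_branch_lambda1_group6_t1500_dp0_mqs_lft_deformable_detr_plus_iterative_bbox_refinement_plus_plus_two_stage.sh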
-------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | 13 | #include <ATen/ATen.h> 14 | #include <ATen/cuda/CUDAContext.h> 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implemented on the CPU"); 27 | } 28 | 29 | std::vector<at::Tensor> 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 |
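The two CPU entry points above deliberately just raise, so the compiled extension is CUDA-only. For CPU-side sanity checks, the pure-PyTorch reference ms_deform_attn_core_pytorch (defined in models/ops/functions/ms_deform_attn_func.py, listed later) can stand in; a minimal sketch mirroring the shapes used in models/ops/test.py (illustrative; assumes it is run from models/ops):

    # Illustrative CPU check against the pure-PyTorch reference path.
    import torch
    from functions.ms_deform_attn_func import ms_deform_attn_core_pytorch

    N, M, D, Lq, L, P = 1, 2, 2, 2, 2, 2
    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)
    S = int(shapes.prod(1).sum())
    value = torch.rand(N, S, M, D) * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2)
    attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    out = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
    print(out.shape)  # expected: (N, Lq, M * D)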
-------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # H-DETR 3 | # Copyright (c) 2022 Peking University & Microsoft Research Asia. All Rights Reserved. 4 | # Licensed under the MIT-style license found in the LICENSE file in the root directory 5 | # ------------------------------------------------------------------------ 6 | # Deformable DETR 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 9 | # ------------------------------------------------------------------------ 10 | # Modified from DETR (https://github.com/facebookresearch/detr) 11 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 12 | # ------------------------------------------------------------------------ 13 | 14 | import torch.utils.data 15 | from .torchvision_datasets import CocoDetection 16 | 17 | from .coco import build as build_coco 18 | 19 | 20 | def get_coco_api_from_dataset(dataset): 21 | for _ in range(10): 22 | # if isinstance(dataset, torchvision.datasets.CocoDetection): 23 | # break 24 | if isinstance(dataset, torch.utils.data.Subset): 25 | dataset = dataset.dataset 26 | if isinstance(dataset, CocoDetection): 27 | return dataset.coco 28 | 29 | 30 | def build_dataset(image_set, args, eval_in_training_set=False): 31 | if args.dataset_file == "coco": 32 | return build_coco(image_set, args, eval_in_training_set) 33 | if args.dataset_file == "coco_panoptic": 34 | # to avoid making panopticapi required for coco 35 | from .coco_panoptic import build as build_coco_panoptic 36 | 37 | return build_coco_panoptic(image_set, args) 38 | raise ValueError(f"dataset {args.dataset_file} not supported") 39 | -------------------------------------------------------------------------------- /models/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector<at::Tensor> 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /datasets/panoptic_eval.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import json 11 | import os 12 | 13 | import util.misc as utils 14 | 15 | try: 16 | from panopticapi.evaluation import pq_compute 17 | except ImportError: 18 | pass 19 | 20 | 21 | class PanopticEvaluator(object): 22 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): 23 | self.gt_json = ann_file 24 | self.gt_folder = ann_folder 25 | if utils.is_main_process(): 26 | if not os.path.exists(output_dir): 27 | os.mkdir(output_dir) 28 | self.output_dir = output_dir 29 | self.predictions = [] 30 | 31 | def update(self, predictions): 32 | for p in predictions: 33 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: 34 | f.write(p.pop("png_string")) 35 | 36 | self.predictions += predictions 37 | 38 | def synchronize_between_processes(self): 39 | all_predictions = utils.all_gather(self.predictions) 40 | merged_predictions = [] 41 | for p in all_predictions: 42 | merged_predictions += p 43 | self.predictions = merged_predictions 44 | 45 | def summarize(self): 46 | if utils.is_main_process(): 47 | json_data = {"annotations": self.predictions} 48 | predictions_json = os.path.join(self.output_dir, "predictions.json") 49 | with open(predictions_json, "w") as f: 50 | f.write(json.dumps(json_data)) 51 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) 52 | return None 53 | -------------------------------------------------------------------------------- /models/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('CUDA is not available') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 |
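setup.py builds the MultiScaleDeformableAttention extension and, as the NotImplementedError branch shows, requires a CUDA-capable environment. The usual build-and-verify sequence, following the upstream Deformable DETR instructions (a make.sh wrapper may also exist in models/ops):

    cd models/ops
    python setup.py build install
    python test.py   # optional forward/gradient checks; requires a GPU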
9 | """ 10 | import os 11 | import time 12 | import argparse 13 | 14 | import torch 15 | 16 | from main import get_args_parser as get_main_args_parser 17 | from models import build_model 18 | from datasets import build_dataset 19 | from util.misc import nested_tensor_from_tensor_list 20 | 21 | 22 | def get_benckmark_arg_parser(): 23 | parser = argparse.ArgumentParser("Benchmark inference speed of Deformable DETR.") 24 | parser.add_argument( 25 | "--num_iters", type=int, default=300, help="total iters to benchmark speed" 26 | ) 27 | parser.add_argument( 28 | "--warm_iters", 29 | type=int, 30 | default=5, 31 | help="ignore first several iters that are very slow", 32 | ) 33 | parser.add_argument( 34 | "--batch_size", type=int, default=1, help="batch size in inference" 35 | ) 36 | parser.add_argument("--resume", type=str, help="load the pre-trained checkpoint") 37 | return parser 38 | 39 | 40 | @torch.no_grad() 41 | def measure_average_inference_time(model, inputs, num_iters=100, warm_iters=5): 42 | ts = [] 43 | for iter_ in range(num_iters): 44 | torch.cuda.synchronize() 45 | t_ = time.perf_counter() 46 | model(inputs) 47 | torch.cuda.synchronize() 48 | t = time.perf_counter() - t_ 49 | if iter_ >= warm_iters: 50 | ts.append(t) 51 | print(ts) 52 | return sum(ts) / len(ts) 53 | 54 | 55 | def benchmark(): 56 | args, _ = get_benckmark_arg_parser().parse_known_args() 57 | main_args = get_main_args_parser().parse_args(_) 58 | assert ( 59 | args.warm_iters < args.num_iters and args.num_iters > 0 and args.warm_iters >= 0 60 | ) 61 | assert args.batch_size > 0 62 | assert args.resume is None or os.path.exists(args.resume) 63 | dataset = build_dataset("val", main_args) 64 | model, _, _ = build_model(main_args) 65 | model.cuda() 66 | model.eval() 67 | if args.resume is not None: 68 | ckpt = torch.load(args.resume, map_location=lambda storage, loc: storage) 69 | model.load_state_dict(ckpt["model"]) 70 | inputs = nested_tensor_from_tensor_list( 71 | [dataset.__getitem__(0)[0].cuda() for _ in range(args.batch_size)] 72 | ) 73 | t = measure_average_inference_time(model, inputs, args.num_iters, args.warm_iters) 74 | return 1.0 / t * args.batch_size 75 | 76 | 77 | if __name__ == "__main__": 78 | fps = benchmark() 79 | print(f"Inference Speed: {fps:.1f} FPS") 80 | 81 | -------------------------------------------------------------------------------- /mmcv_custom/runner/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Open-MMLab. All rights reserved. 2 | import os.path as osp 3 | import time 4 | from tempfile import TemporaryDirectory 5 | 6 | import torch 7 | from torch.optim import Optimizer 8 | 9 | import mmcv 10 | from mmcv.parallel import is_module_wrapper 11 | from mmcv.runner.checkpoint import weights_to_cpu, get_state_dict 12 | 13 | try: 14 | import apex 15 | except: 16 | print("apex is not installed") 17 | 18 | 19 | def save_checkpoint(model, filename, optimizer=None, meta=None): 20 | """Save checkpoint to file. 21 | 22 | The checkpoint will have 4 fields: ``meta``, ``state_dict`` and 23 | ``optimizer``, ``amp``. By default ``meta`` will contain version 24 | and time info. 25 | 26 | Args: 27 | model (Module): Module whose params are to be saved. 28 | filename (str): Checkpoint filename. 29 | optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. 30 | meta (dict, optional): Metadata to be saved in checkpoint. 
31 | """ 32 | if meta is None: 33 | meta = {} 34 | elif not isinstance(meta, dict): 35 | raise TypeError(f"meta must be a dict or None, but got {type(meta)}") 36 | meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) 37 | 38 | if is_module_wrapper(model): 39 | model = model.module 40 | 41 | if hasattr(model, "CLASSES") and model.CLASSES is not None: 42 | # save class name to the meta 43 | meta.update(CLASSES=model.CLASSES) 44 | 45 | checkpoint = {"meta": meta, "state_dict": weights_to_cpu(get_state_dict(model))} 46 | # save optimizer state dict in the checkpoint 47 | if isinstance(optimizer, Optimizer): 48 | checkpoint["optimizer"] = optimizer.state_dict() 49 | elif isinstance(optimizer, dict): 50 | checkpoint["optimizer"] = {} 51 | for name, optim in optimizer.items(): 52 | checkpoint["optimizer"][name] = optim.state_dict() 53 | 54 | # save amp state dict in the checkpoint 55 | checkpoint["amp"] = apex.amp.state_dict() 56 | 57 | if filename.startswith("pavi://"): 58 | try: 59 | from pavi import modelcloud 60 | from pavi.exception import NodeNotFoundError 61 | except ImportError: 62 | raise ImportError("Please install pavi to load checkpoint from modelcloud.") 63 | model_path = filename[7:] 64 | root = modelcloud.Folder() 65 | model_dir, model_name = osp.split(model_path) 66 | try: 67 | model = modelcloud.get(model_dir) 68 | except NodeNotFoundError: 69 | model = root.create_training_model(model_dir) 70 | with TemporaryDirectory() as tmp_dir: 71 | checkpoint_file = osp.join(tmp_dir, model_name) 72 | with open(checkpoint_file, "wb") as f: 73 | torch.save(checkpoint, f) 74 | f.flush() 75 | model.create_file(checkpoint_file, name=model_name) 76 | else: 77 | mmcv.mkdir_or_exist(osp.dirname(filename)) 78 | # immediately flush buffer 79 | with open(filename, "wb") as f: 80 | torch.save(checkpoint, f) 81 | f.flush() 82 | -------------------------------------------------------------------------------- /datasets/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | import torch 8 | 9 | def to_cuda(samples, targets, device): 10 | samples = samples.to(device, non_blocking=True) 11 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 12 | return samples, targets 13 | 14 | class data_prefetcher(): 15 | def __init__(self, loader, device, prefetch=True): 16 | self.loader = iter(loader) 17 | self.prefetch = prefetch 18 | self.device = device 19 | if prefetch: 20 | self.stream = torch.cuda.Stream() 21 | self.preload() 22 | 23 | def preload(self): 24 | try: 25 | self.next_samples, self.next_targets = next(self.loader) 26 | except StopIteration: 27 | self.next_samples = None 28 | self.next_targets = None 29 | return 30 | # if record_stream() doesn't work, another option is to make sure device inputs are created 31 | # on the main stream. 
32 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 33 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 34 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 35 | # at the time we start copying to next_*: 36 | # self.stream.wait_stream(torch.cuda.current_stream()) 37 | with torch.cuda.stream(self.stream): 38 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 39 | # more code for the alternative if record_stream() doesn't work: 40 | # copy_ will record the use of the pinned source tensor in this side stream. 41 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 42 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 43 | # self.next_input = self.next_input_gpu 44 | # self.next_target = self.next_target_gpu 45 | 46 | # With Amp, it isn't necessary to manually convert data to half. 47 | # if args.fp16: 48 | # self.next_input = self.next_input.half() 49 | # else: 50 | 51 | def next(self): 52 | if self.prefetch: 53 | torch.cuda.current_stream().wait_stream(self.stream) 54 | samples = self.next_samples 55 | targets = self.next_targets 56 | if samples is not None: 57 | samples.record_stream(torch.cuda.current_stream()) 58 | if targets is not None: 59 | for t in targets: 60 | for k, v in t.items(): 61 | v.record_stream(torch.cuda.current_stream()) 62 | self.preload() 63 | else: 64 | try: 65 | samples, targets = next(self.loader) 66 | samples, targets = to_cuda(samples, targets, self.device) 67 | except StopIteration: 68 | samples = None 69 | targets = None 70 | return samples, targets 71 | -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Utilities for bounding box manipulation and GIoU. 
12 | """ 13 | import torch 14 | from torchvision.ops.boxes import box_area 15 | 16 | 17 | def box_cxcywh_to_xyxy(x): 18 | x_c, y_c, w, h = x.unbind(-1) 19 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] 20 | return torch.stack(b, dim=-1) 21 | 22 | 23 | def box_xyxy_to_cxcywh(x): 24 | x0, y0, x1, y1 = x.unbind(-1) 25 | b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] 26 | return torch.stack(b, dim=-1) 27 | 28 | 29 | # modified from torchvision to also return the union 30 | def box_iou(boxes1, boxes2): 31 | area1 = box_area(boxes1) 32 | area2 = box_area(boxes2) 33 | 34 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 35 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 36 | 37 | wh = (rb - lt).clamp(min=0) # [N,M,2] 38 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 39 | 40 | union = area1[:, None] + area2 - inter 41 | 42 | iou = inter / union 43 | return iou, union 44 | 45 | 46 | def generalized_box_iou(boxes1, boxes2): 47 | """ 48 | Generalized IoU from https://giou.stanford.edu/ 49 | 50 | The boxes should be in [x0, y0, x1, y1] format 51 | 52 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 53 | and M = len(boxes2) 54 | """ 55 | # degenerate boxes gives inf / nan results 56 | # so do an early check 57 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 58 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 59 | iou, union = box_iou(boxes1, boxes2) 60 | 61 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 62 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 63 | 64 | wh = (rb - lt).clamp(min=0) # [N,M,2] 65 | area = wh[:, :, 0] * wh[:, :, 1] 66 | 67 | return iou - (area - union) / area 68 | 69 | 70 | def masks_to_boxes(masks): 71 | """Compute the bounding boxes around the provided masks 72 | 73 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 74 | 75 | Returns a [N, 4] tensors, with the boxes in xyxy format 76 | """ 77 | if masks.numel() == 0: 78 | return torch.zeros((0, 4), device=masks.device) 79 | 80 | h, w = masks.shape[-2:] 81 | 82 | y = torch.arange(0, h, dtype=torch.float) 83 | x = torch.arange(0, w, dtype=torch.float) 84 | y, x = torch.meshgrid(y, x) 85 | 86 | x_mask = masks * x.unsqueeze(0) 87 | x_max = x_mask.flatten(1).max(-1)[0] 88 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 89 | 90 | y_mask = masks * y.unsqueeze(0) 91 | y_max = y_mask.flatten(1).max(-1)[0] 92 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 93 | 94 | return torch.stack([x_min, y_min, x_max, y_max], 1) 95 | -------------------------------------------------------------------------------- /datasets/torchvision_datasets/coco.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from torchvision 7 | # ------------------------------------------------------------------------ 8 | 9 | """ 10 | Copy-Paste from torchvision, but add utility of caching images on memory 11 | """ 12 | from torchvision.datasets.vision import VisionDataset 13 | from PIL import Image 14 | import os 15 | import os.path 16 | import tqdm 17 | from io import BytesIO 18 | 19 | 20 | class CocoDetection(VisionDataset): 21 | """`MS Coco Detection <https://cocodataset.org/#detection-2016>`_ Dataset. 22 | Args: 23 | root (string): Root directory where images are downloaded to. 24 | annFile (string): Path to json annotation file. 25 | transform (callable, optional): A function/transform that takes in a PIL image 26 | and returns a transformed version. E.g., ``transforms.ToTensor`` 27 | target_transform (callable, optional): A function/transform that takes in the 28 | target and transforms it. 29 | transforms (callable, optional): A function/transform that takes input sample and its target as entry 30 | and returns a transformed version. 31 | """ 32 | 33 | def __init__(self, root, annFile, transform=None, target_transform=None, transforms=None, 34 | cache_mode=False, local_rank=0, local_size=1): 35 | super(CocoDetection, self).__init__(root, transforms, transform, target_transform) 36 | from pycocotools.coco import COCO 37 | self.coco = COCO(annFile) 38 | self.ids = list(sorted(self.coco.imgs.keys())) 39 | self.cache_mode = cache_mode 40 | self.local_rank = local_rank 41 | self.local_size = local_size 42 | if cache_mode: 43 | self.cache = {} 44 | self.cache_images() 45 | 46 | def cache_images(self): 47 | self.cache = {} 48 | for index, img_id in zip(tqdm.trange(len(self.ids)), self.ids): 49 | if index % self.local_size != self.local_rank: 50 | continue 51 | path = self.coco.loadImgs(img_id)[0]['file_name'] 52 | with open(os.path.join(self.root, path), 'rb') as f: 53 | self.cache[path] = f.read() 54 | 55 | def get_image(self, path): 56 | if self.cache_mode: 57 | if path not in self.cache.keys(): 58 | with open(os.path.join(self.root, path), 'rb') as f: 59 | self.cache[path] = f.read() 60 | return Image.open(BytesIO(self.cache[path])).convert('RGB') 61 | return Image.open(os.path.join(self.root, path)).convert('RGB') 62 | 63 | def __getitem__(self, index): 64 | """ 65 | Args: 66 | index (int): Index 67 | Returns: 68 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. 69 | """ 70 | coco = self.coco 71 | img_id = self.ids[index] 72 | ann_ids = coco.getAnnIds(imgIds=img_id) 73 | target = coco.loadAnns(ann_ids) 74 | 75 | path = coco.loadImgs(img_id)[0]['file_name'] 76 | 77 | img = self.get_image(path) 78 | if self.transforms is not None: 79 | img, target = self.transforms(img, target) 80 | 81 | return img, target 82 | 83 | def __len__(self): 84 | return len(self.ids) 85 |
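A minimal usage sketch for the caching dataset above (paths are placeholders; cache_mode trades startup time and RAM for faster repeated reads, and local_rank/local_size shard the cache across data-loading ranks):

    # Illustrative: build the dataset with in-memory image caching enabled.
    from datasets.torchvision_datasets import CocoDetection

    dataset = CocoDetection(
        root="data/coco/train2017",                                 # placeholder path
        annFile="data/coco/annotations/instances_train2017.json",  # placeholder path
        cache_mode=True,   # read each image once, keep the raw bytes in RAM
        local_rank=0,      # this process's shard of the cache ...
        local_size=1,      # ... out of local_size loader ranks
    )
    img, target = dataset[0]  # PIL image and list of COCO annotation dicts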
-------------------------------------------------------------------------------- /mmcv_custom/runner/epoch_based_runner.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Open-MMLab. All rights reserved. 2 | import os.path as osp 3 | import platform 4 | import shutil 5 | 6 | import torch 7 | from torch.optim import Optimizer 8 | 9 | import mmcv 10 | from mmcv.runner import RUNNERS, EpochBasedRunner 11 | from .checkpoint import save_checkpoint 12 | 13 | try: 14 | import apex 15 | except ImportError: 16 | print("apex is not installed") 17 | 18 | 19 | @RUNNERS.register_module() 20 | class EpochBasedRunnerAmp(EpochBasedRunner): 21 | """Epoch-based Runner with AMP support. 22 | 23 | This runner trains models epoch by epoch. 24 | """ 25 | 26 | def save_checkpoint( 27 | self, 28 | out_dir, 29 | filename_tmpl="epoch_{}.pth", 30 | save_optimizer=True, 31 | meta=None, 32 | create_symlink=True, 33 | ): 34 | """Save the checkpoint. 35 | 36 | Args: 37 | out_dir (str): The directory that checkpoints are saved. 38 | filename_tmpl (str, optional): The checkpoint filename template, 39 | which contains a placeholder for the epoch number. 40 | Defaults to 'epoch_{}.pth'. 41 | save_optimizer (bool, optional): Whether to save the optimizer to 42 | the checkpoint. Defaults to True. 43 | meta (dict, optional): The meta information to be saved in the 44 | checkpoint. Defaults to None. 45 | create_symlink (bool, optional): Whether to create a symlink 46 | "latest.pth" to point to the latest checkpoint. 47 | Defaults to True. 48 | """ 49 | if meta is None: 50 | meta = dict(epoch=self.epoch + 1, iter=self.iter) 51 | elif isinstance(meta, dict): 52 | meta.update(epoch=self.epoch + 1, iter=self.iter) 53 | else: 54 | raise TypeError(f"meta should be a dict or None, but got {type(meta)}") 55 | if self.meta is not None: 56 | meta.update(self.meta) 57 | 58 | filename = filename_tmpl.format(self.epoch + 1) 59 | filepath = osp.join(out_dir, filename) 60 | optimizer = self.optimizer if save_optimizer else None 61 | save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta) 62 | # in some environments, `os.symlink` is not supported, you may need to 63 | # set `create_symlink` to False 64 | if create_symlink: 65 | dst_file = osp.join(out_dir, "latest.pth") 66 | if platform.system() != "Windows": 67 | mmcv.symlink(filename, dst_file) 68 | else: 69 | shutil.copy(filepath, dst_file) 70 | 71 | def resume(self, checkpoint, resume_optimizer=True, map_location="default"): 72 | if map_location == "default": 73 | if torch.cuda.is_available(): 74 | device_id = torch.cuda.current_device() 75 | checkpoint = self.load_checkpoint( 76 | checkpoint, 77 | map_location=lambda storage, loc: storage.cuda(device_id), 78 | ) 79 | else: 80 | checkpoint = self.load_checkpoint(checkpoint) 81 | else: 82 | checkpoint = self.load_checkpoint(checkpoint, map_location=map_location) 83 | 84 | self._epoch = checkpoint["meta"]["epoch"] 85 | self._iter = checkpoint["meta"]["iter"] 86 | if "optimizer" in checkpoint and resume_optimizer: 87 | if isinstance(self.optimizer, Optimizer): 88 | self.optimizer.load_state_dict(checkpoint["optimizer"]) 89 | elif isinstance(self.optimizer, dict): 90 | for k in self.optimizer.keys(): 91 | self.optimizer[k].load_state_dict(checkpoint["optimizer"][k]) 92 | else: 93 | raise TypeError( 94 | "Optimizer should be dict or torch.optim.Optimizer " 95 | f"but got {type(self.optimizer)}" 96 | ) 97 | 98 | if "amp" in checkpoint: 99 | apex.amp.load_state_dict(checkpoint["amp"]) 100 | self.logger.info("load amp state dict") 101 | 102 | self.logger.info("resumed epoch %d, iter %d", self.epoch, self.iter) 103 | 104 | -------------------------------------------------------------------------------- /models/ops/test.py:
-------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, 
P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Various positional encodings for the transformer. 12 | """ 13 | import math 14 | import torch 15 | from torch import nn 16 | 17 | from util.misc import NestedTensor 18 | 19 | 20 | class PositionEmbeddingSine(nn.Module): 21 | """ 22 | This is a more standard version of the position embedding, very similar to the one 23 | used by the Attention is all you need paper, generalized to work on images. 
24 | """ 25 | 26 | def __init__( 27 | self, num_pos_feats=64, temperature=10000, normalize=False, scale=None 28 | ): 29 | super().__init__() 30 | self.num_pos_feats = num_pos_feats 31 | self.temperature = temperature 32 | self.normalize = normalize 33 | if scale is not None and normalize is False: 34 | raise ValueError("normalize should be True if scale is passed") 35 | if scale is None: 36 | scale = 2 * math.pi 37 | self.scale = scale 38 | 39 | def forward(self, tensor_list: NestedTensor): 40 | x = tensor_list.tensors 41 | mask = tensor_list.mask 42 | assert mask is not None 43 | not_mask = ~mask 44 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 45 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 46 | if self.normalize: 47 | eps = 1e-6 48 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 49 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 50 | 51 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 52 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 53 | 54 | pos_x = x_embed[:, :, :, None] / dim_t 55 | pos_y = y_embed[:, :, :, None] / dim_t 56 | pos_x = torch.stack( 57 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 58 | ).flatten(3) 59 | pos_y = torch.stack( 60 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 61 | ).flatten(3) 62 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 63 | return pos 64 | 65 | 66 | class PositionEmbeddingLearned(nn.Module): 67 | """ 68 | Absolute pos embedding, learned. 69 | """ 70 | 71 | def __init__(self, num_pos_feats=256): 72 | super().__init__() 73 | self.row_embed = nn.Embedding(50, num_pos_feats) 74 | self.col_embed = nn.Embedding(50, num_pos_feats) 75 | self.reset_parameters() 76 | 77 | def reset_parameters(self): 78 | nn.init.uniform_(self.row_embed.weight) 79 | nn.init.uniform_(self.col_embed.weight) 80 | 81 | def forward(self, tensor_list: NestedTensor): 82 | x = tensor_list.tensors 83 | h, w = x.shape[-2:] 84 | i = torch.arange(w, device=x.device) 85 | j = torch.arange(h, device=x.device) 86 | x_emb = self.col_embed(i) 87 | y_emb = self.row_embed(j) 88 | pos = ( 89 | torch.cat( 90 | [ 91 | x_emb.unsqueeze(0).repeat(h, 1, 1), 92 | y_emb.unsqueeze(1).repeat(1, w, 1), 93 | ], 94 | dim=-1, 95 | ) 96 | .permute(2, 0, 1) 97 | .unsqueeze(0) 98 | .repeat(x.shape[0], 1, 1, 1) 99 | ) 100 | return pos 101 | 102 | 103 | def build_position_encoding(args): 104 | N_steps = args.hidden_dim // 2 105 | if args.position_embedding in ("v2", "sine"): 106 | # TODO find a better way of exposing other arguments 107 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 108 | elif args.position_embedding in ("v3", "learned"): 109 | position_embedding = PositionEmbeddingLearned(N_steps) 110 | else: 111 | raise ValueError(f"not supported {args.position_embedding}") 112 | 113 | return position_embedding 114 | -------------------------------------------------------------------------------- /datasets/coco_panoptic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import json 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import torch 15 | from PIL import Image 16 | 17 | from panopticapi.utils import rgb2id 18 | from util.box_ops import masks_to_boxes 19 | 20 | from .coco import make_coco_transforms 21 | 22 | 23 | class CocoPanoptic: 24 | def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True): 25 | with open(ann_file, 'r') as f: 26 | self.coco = json.load(f) 27 | 28 | # sort 'images' field so that they are aligned with 'annotations' 29 | # i.e., in alphabetical order 30 | self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id']) 31 | # sanity check 32 | if "annotations" in self.coco: 33 | for img, ann in zip(self.coco['images'], self.coco['annotations']): 34 | assert img['file_name'][:-4] == ann['file_name'][:-4] 35 | 36 | self.img_folder = img_folder 37 | self.ann_folder = ann_folder 38 | self.ann_file = ann_file 39 | self.transforms = transforms 40 | self.return_masks = return_masks 41 | 42 | def __getitem__(self, idx): 43 | ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx] 44 | img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg') 45 | ann_path = Path(self.ann_folder) / ann_info['file_name'] 46 | 47 | img = Image.open(img_path).convert('RGB') 48 | w, h = img.size 49 | if "segments_info" in ann_info: 50 | masks = np.asarray(Image.open(ann_path), dtype=np.uint32) 51 | masks = rgb2id(masks) 52 | 53 | ids = np.array([ann['id'] for ann in ann_info['segments_info']]) 54 | masks = masks == ids[:, None, None] 55 | 56 | masks = torch.as_tensor(masks, dtype=torch.uint8) 57 | labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64) 58 | 59 | target = {} 60 | target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]]) 61 | if self.return_masks: 62 | target['masks'] = masks 63 | target['labels'] = labels 64 | 65 | target["boxes"] = masks_to_boxes(masks) 66 | 67 | target['size'] = torch.as_tensor([int(h), int(w)]) 68 | target['orig_size'] = torch.as_tensor([int(h), int(w)]) 69 | if "segments_info" in ann_info: 70 | for name in ['iscrowd', 'area']: 71 | target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']]) 72 | 73 | if self.transforms is not None: 74 | img, target = self.transforms(img, target) 75 | 76 | return img, target 77 | 78 | def __len__(self): 79 | return len(self.coco['images']) 80 | 81 | def get_height_and_width(self, idx): 82 | img_info = self.coco['images'][idx] 83 | height = img_info['height'] 84 | width = img_info['width'] 85 | return height, width 86 | 87 | 88 | def build(image_set, args): 89 | img_folder_root = Path(args.coco_path) 90 | ann_folder_root = Path(args.coco_panoptic_path) 91 | assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist' 92 | assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist' 93 | mode = 'panoptic' 94 | PATHS = { 95 | "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'), 96 | "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'), 97 | } 98 | 99 | img_folder, ann_file = PATHS[image_set] 100 | img_folder_path = img_folder_root / img_folder 101 | ann_folder = ann_folder_root / f'{mode}_{img_folder}' 102 | ann_file = ann_folder_root / ann_file 103 | 104 | 
dataset = CocoPanoptic(img_folder_path, ann_folder, ann_file, 105 | transforms=make_coco_transforms(image_set), return_masks=args.masks) 106 | 107 | return dataset 108 | -------------------------------------------------------------------------------- /models/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # H-DETR 3 | # Copyright (c) 2022 Peking University & Microsoft Research Asia. All Rights Reserved. 4 | # Licensed under the MIT-style license found in the LICENSE file in the root directory 5 | # ------------------------------------------------------------------------------------------------ 6 | # Deformable DETR 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 9 | # ------------------------------------------------------------------------------------------------ 10 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 11 | # ------------------------------------------------------------------------------------------------ 12 | 13 | from __future__ import absolute_import 14 | from __future__ import print_function 15 | from __future__ import division 16 | 17 | import torch 18 | import torch.nn.functional as F 19 | from torch.autograd import Function 20 | from torch.autograd.function import once_differentiable 21 | 22 | import MultiScaleDeformableAttention as MSDA 23 | 24 | 25 | class MSDeformAttnFunction(Function): 26 | @staticmethod 27 | @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) 28 | def forward( 29 | ctx, 30 | value, 31 | value_spatial_shapes, 32 | value_level_start_index, 33 | sampling_locations, 34 | attention_weights, 35 | im2col_step, 36 | ): 37 | ctx.im2col_step = im2col_step 38 | output = MSDA.ms_deform_attn_forward( 39 | value, 40 | value_spatial_shapes, 41 | value_level_start_index, 42 | sampling_locations, 43 | attention_weights, 44 | ctx.im2col_step, 45 | ) 46 | ctx.save_for_backward( 47 | value, 48 | value_spatial_shapes, 49 | value_level_start_index, 50 | sampling_locations, 51 | attention_weights, 52 | ) 53 | return output 54 | 55 | @staticmethod 56 | @once_differentiable 57 | @torch.cuda.amp.custom_bwd 58 | def backward(ctx, grad_output): 59 | ( 60 | value, 61 | value_spatial_shapes, 62 | value_level_start_index, 63 | sampling_locations, 64 | attention_weights, 65 | ) = ctx.saved_tensors 66 | grad_value, grad_sampling_loc, grad_attn_weight = MSDA.ms_deform_attn_backward( 67 | value, 68 | value_spatial_shapes, 69 | value_level_start_index, 70 | sampling_locations, 71 | attention_weights, 72 | grad_output, 73 | ctx.im2col_step, 74 | ) 75 | 76 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 77 | 78 | 79 | def ms_deform_attn_core_pytorch( 80 | value, value_spatial_shapes, sampling_locations, attention_weights 81 | ): 82 | # for debug and test only, 83 | # need to use cuda version instead 84 | N_, S_, M_, D_ = value.shape 85 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 86 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 87 | sampling_grids = 2 * sampling_locations - 1 88 | sampling_value_list = [] 89 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 90 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 91 | value_l_ = ( 92 | value_list[lid_].flatten(2).transpose(1, 2).reshape(N_ * M_, 
D_, H_, W_) 93 | ) 94 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 95 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 96 | # N_*M_, D_, Lq_, P_ 97 | sampling_value_l_ = F.grid_sample( 98 | value_l_, 99 | sampling_grid_l_, 100 | mode="bilinear", 101 | padding_mode="zeros", 102 | align_corners=False, 103 | ) 104 | sampling_value_list.append(sampling_value_l_) 105 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 106 | attention_weights = attention_weights.transpose(1, 2).reshape( 107 | N_ * M_, 1, Lq_, L_ * P_ 108 | ) 109 | output = ( 110 | (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) 111 | .sum(-1) 112 | .view(N_, M_ * D_, Lq_) 113 | ) 114 | return output.transpose(1, 2).contiguous() 115 | -------------------------------------------------------------------------------- /util/plot_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Plotting utilities to visualize training logs. 12 | """ 13 | import torch 14 | import pandas as pd 15 | import seaborn as sns 16 | import matplotlib.pyplot as plt 17 | 18 | from pathlib import Path, PurePath 19 | 20 | 21 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): 22 | ''' 23 | Function to plot specific fields from training log(s). Plots both training and test results. 24 | 25 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file 26 | - fields = which results to plot from each log file - plots both training and test for each field. 27 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots 28 | - log_name = optional, name of log file if different than default 'log.txt'. 29 | 30 | :: Outputs - matplotlib plots of results in fields, color coded for each log file. 31 | - solid lines are training results, dashed lines are test results. 
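    :: Example (illustrative output dirs; each must contain a 'log.txt') -
         plot_logs([Path('outputs/exp1'), Path('outputs/exp2')],
                   fields=('loss', 'class_error', 'mAP'), ewm_col=5)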
32 | 
33 |     '''
34 |     func_name = "plot_utils.py::plot_logs"
35 | 
36 |     # verify logs is a list of Paths (list[Path]) or a single pathlib Path;
37 |     # convert a single Path to a list to avoid a 'not iterable' error
38 | 
39 |     if not isinstance(logs, list):
40 |         if isinstance(logs, PurePath):
41 |             logs = [logs]
42 |             print(f"{func_name} info: logs param expects a list argument, converted to list[Path].")
43 |         else:
44 |             raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \
45 |             Expect list[Path] or single Path obj, received {type(logs)}")
46 | 
47 |     # verify valid dir(s) and that every item in the list is a Path object
48 |     for i, dir in enumerate(logs):
49 |         if not isinstance(dir, PurePath):
50 |             raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}")
51 |         if dir.exists():
52 |             continue
53 |         raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}")
54 | 
55 |     # load log file(s) and plot
56 |     dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs]
57 | 
58 |     fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5))
59 | 
60 |     for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))):
61 |         for j, field in enumerate(fields):
62 |             if field == 'mAP':
63 |                 coco_eval = pd.DataFrame([v[1] for v in df.test_coco_eval.dropna().values]).ewm(com=ewm_col).mean()
64 |                 axs[j].plot(coco_eval, c=color)
65 |             else:
66 |                 df.interpolate().ewm(com=ewm_col).mean().plot(
67 |                     y=[f'train_{field}', f'test_{field}'],
68 |                     ax=axs[j],
69 |                     color=[color] * 2,
70 |                     style=['-', '--']
71 |                 )
72 |     for ax, field in zip(axs, fields):
73 |         ax.legend([Path(p).name for p in logs])
74 |         ax.set_title(field)
75 | 
76 | 
77 | def plot_precision_recall(files, naming_scheme='iter'):
78 |     if naming_scheme == 'exp_id':
79 |         # name becomes exp_id
80 |         names = [f.parts[-3] for f in files]
81 |     elif naming_scheme == 'iter':
82 |         names = [f.stem for f in files]
83 |     else:
84 |         raise ValueError(f'not supported {naming_scheme}')
85 |     fig, axs = plt.subplots(ncols=2, figsize=(16, 5))
86 |     for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names):
87 |         data = torch.load(f)
88 |         # precision is n_iou, n_points, n_cat, n_area, max_det
89 |         precision = data['precision']
90 |         recall = data['params'].recThrs
91 |         scores = data['scores']
92 |         # take precision for all classes, all areas and 100 detections
93 |         precision = precision[0, :, :, 0, -1].mean(1)
94 |         scores = scores[0, :, :, 0, -1].mean(1)
95 |         prec = precision.mean()
96 |         rec = data['recall'][0, :, 0, -1].mean()
97 |         print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' +
98 |               f'score={scores.mean():0.3f}, ' +
99 |               f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}'
100 |               )
101 |         axs[0].plot(recall, precision, c=color)
102 |         axs[1].plot(recall, scores, c=color)
103 | 
104 |     axs[0].set_title('Precision / Recall')
105 |     axs[0].legend(names)
106 |     axs[1].set_title('Scores / Recall')
107 |     axs[1].legend(names)
108 |     return fig, axs
109 | 
110 | 
111 | 
112 | 
-------------------------------------------------------------------------------- /models/matcher.py: --------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | # Deformable DETR
3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Modules to compute the matching cost and solve the corresponding LSAP. 12 | """ 13 | import torch 14 | from scipy.optimize import linear_sum_assignment 15 | from torch import nn 16 | 17 | from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou 18 | 19 | 20 | class HungarianMatcher(nn.Module): 21 | """This class computes an assignment between the targets and the predictions of the network 22 | 23 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 24 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 25 | while the others are un-matched (and thus treated as non-objects). 26 | """ 27 | 28 | def __init__( 29 | self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1 30 | ): 31 | """Creates the matcher 32 | 33 | Params: 34 | cost_class: This is the relative weight of the classification error in the matching cost 35 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 36 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 37 | """ 38 | super().__init__() 39 | self.cost_class = cost_class 40 | self.cost_bbox = cost_bbox 41 | self.cost_giou = cost_giou 42 | assert ( 43 | cost_class != 0 or cost_bbox != 0 or cost_giou != 0 44 | ), "all costs cant be 0" 45 | 46 | def forward(self, outputs, targets): 47 | """ Performs the matching 48 | 49 | Params: 50 | outputs: This is a dict that contains at least these entries: 51 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 52 | "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates 53 | 54 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 55 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth 56 | objects in the target) containing the class labels 57 | "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates 58 | 59 | Returns: 60 | A list of size batch_size, containing tuples of (index_i, index_j) where: 61 | - index_i is the indices of the selected predictions (in order) 62 | - index_j is the indices of the corresponding selected targets (in order) 63 | For each batch element, it holds: 64 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 65 | """ 66 | with torch.no_grad(): 67 | bs, num_queries = outputs["pred_logits"].shape[:2] 68 | 69 | # We flatten to compute the cost matrices in a batch 70 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() 71 | out_bbox = outputs["pred_boxes"].flatten( 72 | 0, 1 73 | ) # [batch_size * num_queries, 4] 74 | 75 | # Also concat the target labels and boxes 76 | tgt_ids = torch.cat([v["labels"] for v in targets]) 77 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 78 | 79 | # Compute the classification cost. 
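            # This mirrors the sigmoid focal loss used at training time
            # (alpha = 0.25, gamma = 2): for every (query, target-class) pair,
            # pos_cost_class is the focal term paid for predicting that class
            # and neg_cost_class the term paid for not predicting it; their
            # difference fills the classification block of the cost matrix.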
80 | alpha = 0.25 81 | gamma = 2.0 82 | neg_cost_class = ( 83 | (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 84 | ) 85 | pos_cost_class = ( 86 | alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 87 | ) 88 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 89 | 90 | # Compute the L1 cost between boxes 91 | cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) 92 | 93 | # Compute the giou cost betwen boxes 94 | cost_giou = -generalized_box_iou( 95 | box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox) 96 | ) 97 | 98 | # Final cost matrix 99 | C = ( 100 | self.cost_bbox * cost_bbox 101 | + self.cost_class * cost_class 102 | + self.cost_giou * cost_giou 103 | ) 104 | C = C.view(bs, num_queries, -1).cpu() 105 | 106 | sizes = [len(v["boxes"]) for v in targets] 107 | indices = [ 108 | linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1)) 109 | ] 110 | return [ 111 | ( 112 | torch.as_tensor(i, dtype=torch.int64), 113 | torch.as_tensor(j, dtype=torch.int64), 114 | ) 115 | for i, j in indices 116 | ] 117 | 118 | 119 | def build_matcher(args): 120 | return HungarianMatcher( 121 | cost_class=args.set_cost_class, 122 | cost_bbox=args.set_cost_bbox, 123 | cost_giou=args.set_cost_giou, 124 | ) 125 | -------------------------------------------------------------------------------- /datasets/samplers.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from codes in torch.utils.data.distributed 7 | # ------------------------------------------------------------------------ 8 | 9 | import os 10 | import math 11 | import torch 12 | import torch.distributed as dist 13 | from torch.utils.data.sampler import Sampler 14 | 15 | 16 | class DistributedSampler(Sampler): 17 | """Sampler that restricts data loading to a subset of the dataset. 18 | It is especially useful in conjunction with 19 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 20 | process can pass a DistributedSampler instance as a DataLoader sampler, 21 | and load a subset of the original dataset that is exclusive to it. 22 | .. note:: 23 | Dataset is assumed to be of constant size. 24 | Arguments: 25 | dataset: Dataset used for sampling. 26 | num_replicas (optional): Number of processes participating in 27 | distributed training. 28 | rank (optional): Rank of the current process within num_replicas. 
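        shuffle (optional): Whether to reshuffle the indices every epoch;
            the shuffle is deterministic, seeded by the epoch given to
            ``set_epoch``.
        Note: ``local_rank`` and ``local_size`` are accepted for signature
            compatibility with ``NodeDistributedSampler`` below but are not
            used by this sampler.
        Example (sketch; ``dataset`` and ``num_epochs`` are assumed to be
        defined by the caller)::
            sampler = DistributedSampler(dataset, shuffle=True)
            loader = torch.utils.data.DataLoader(dataset, sampler=sampler)
            for epoch in range(num_epochs):
                sampler.set_epoch(epoch)
                for batch in loader:
                    ...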
29 | """ 30 | 31 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 32 | if num_replicas is None: 33 | if not dist.is_available(): 34 | raise RuntimeError("Requires distributed package to be available") 35 | num_replicas = dist.get_world_size() 36 | if rank is None: 37 | if not dist.is_available(): 38 | raise RuntimeError("Requires distributed package to be available") 39 | rank = dist.get_rank() 40 | self.dataset = dataset 41 | self.num_replicas = num_replicas 42 | self.rank = rank 43 | self.epoch = 0 44 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 45 | self.total_size = self.num_samples * self.num_replicas 46 | self.shuffle = shuffle 47 | 48 | def __iter__(self): 49 | if self.shuffle: 50 | # deterministically shuffle based on epoch 51 | g = torch.Generator() 52 | g.manual_seed(self.epoch) 53 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 54 | else: 55 | indices = torch.arange(len(self.dataset)).tolist() 56 | 57 | # add extra samples to make it evenly divisible 58 | indices += indices[: (self.total_size - len(indices))] 59 | assert len(indices) == self.total_size 60 | 61 | # subsample 62 | offset = self.num_samples * self.rank 63 | indices = indices[offset : offset + self.num_samples] 64 | assert len(indices) == self.num_samples 65 | 66 | return iter(indices) 67 | 68 | def __len__(self): 69 | return self.num_samples 70 | 71 | def set_epoch(self, epoch): 72 | self.epoch = epoch 73 | 74 | 75 | class NodeDistributedSampler(Sampler): 76 | """Sampler that restricts data loading to a subset of the dataset. 77 | It is especially useful in conjunction with 78 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 79 | process can pass a DistributedSampler instance as a DataLoader sampler, 80 | and load a subset of the original dataset that is exclusive to it. 81 | .. note:: 82 | Dataset is assumed to be of constant size. 83 | Arguments: 84 | dataset: Dataset used for sampling. 85 | num_replicas (optional): Number of processes participating in 86 | distributed training. 87 | rank (optional): Rank of the current process within num_replicas. 
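        local_rank (optional): Rank of the current process on its node;
            read from the ``LOCAL_RANK`` environment variable when omitted.
        local_size (optional): Number of processes per node; read from the
            ``LOCAL_SIZE`` environment variable when omitted. Each process
            only ever draws indices ``i`` with ``i % local_size == local_rank``,
            so every process iterates over its own fixed, node-local slice of
            the dataset (useful when samples are cached per process).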
88 | """ 89 | 90 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 91 | if num_replicas is None: 92 | if not dist.is_available(): 93 | raise RuntimeError("Requires distributed package to be available") 94 | num_replicas = dist.get_world_size() 95 | if rank is None: 96 | if not dist.is_available(): 97 | raise RuntimeError("Requires distributed package to be available") 98 | rank = dist.get_rank() 99 | if local_rank is None: 100 | local_rank = int(os.environ.get('LOCAL_RANK', 0)) 101 | if local_size is None: 102 | local_size = int(os.environ.get('LOCAL_SIZE', 1)) 103 | self.dataset = dataset 104 | self.shuffle = shuffle 105 | self.num_replicas = num_replicas 106 | self.num_parts = local_size 107 | self.rank = rank 108 | self.local_rank = local_rank 109 | self.epoch = 0 110 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 111 | self.total_size = self.num_samples * self.num_replicas 112 | 113 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts 114 | 115 | def __iter__(self): 116 | if self.shuffle: 117 | # deterministically shuffle based on epoch 118 | g = torch.Generator() 119 | g.manual_seed(self.epoch) 120 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 121 | else: 122 | indices = torch.arange(len(self.dataset)).tolist() 123 | indices = [i for i in indices if i % self.num_parts == self.local_rank] 124 | 125 | # add extra samples to make it evenly divisible 126 | indices += indices[:(self.total_size_parts - len(indices))] 127 | assert len(indices) == self.total_size_parts 128 | 129 | # subsample 130 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts] 131 | assert len(indices) == self.num_samples 132 | 133 | return iter(indices) 134 | 135 | def __len__(self): 136 | return self.num_samples 137 | 138 | def set_epoch(self, epoch): 139 | self.epoch = epoch 140 | -------------------------------------------------------------------------------- /datasets/coco.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # H-DETR 3 | # Copyright (c) 2022 Peking University & Microsoft Research Asia. All Rights Reserved. 4 | # Licensed under the MIT-style license found in the LICENSE file in the root directory 5 | # ------------------------------------------------------------------------ 6 | # Deformable DETR 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 9 | # ------------------------------------------------------------------------ 10 | # Modified from DETR (https://github.com/facebookresearch/detr) 11 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 12 | # ------------------------------------------------------------------------ 13 | 14 | """ 15 | COCO dataset which returns image_id for evaluation. 
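After ConvertCocoPolysToMask (below), each target dict carries "boxes"
(xyxy, in absolute pixels at that stage), "labels", "image_id", "area",
"iscrowd", "orig_size" and "size", plus "masks" and "keypoints" when present.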
16 | 17 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 18 | """ 19 | from pathlib import Path 20 | 21 | import torch 22 | import torch.utils.data 23 | from pycocotools import mask as coco_mask 24 | 25 | from .torchvision_datasets import CocoDetection as TvCocoDetection 26 | from util.misc import get_local_rank, get_local_size 27 | import datasets.transforms as T 28 | 29 | 30 | class CocoDetection(TvCocoDetection): 31 | def __init__( 32 | self, 33 | img_folder, 34 | ann_file, 35 | transforms, 36 | return_masks, 37 | cache_mode=False, 38 | local_rank=0, 39 | local_size=1, 40 | ): 41 | super(CocoDetection, self).__init__( 42 | img_folder, 43 | ann_file, 44 | cache_mode=cache_mode, 45 | local_rank=local_rank, 46 | local_size=local_size, 47 | ) 48 | self._transforms = transforms 49 | self.prepare = ConvertCocoPolysToMask(return_masks) 50 | 51 | def __getitem__(self, idx): 52 | img, target = super(CocoDetection, self).__getitem__(idx) 53 | image_id = self.ids[idx] 54 | target = {"image_id": image_id, "annotations": target} 55 | img, target = self.prepare(img, target) 56 | if self._transforms is not None: 57 | img, target = self._transforms(img, target) 58 | return img, target 59 | 60 | 61 | def convert_coco_poly_to_mask(segmentations, height, width): 62 | masks = [] 63 | for polygons in segmentations: 64 | rles = coco_mask.frPyObjects(polygons, height, width) 65 | mask = coco_mask.decode(rles) 66 | if len(mask.shape) < 3: 67 | mask = mask[..., None] 68 | mask = torch.as_tensor(mask, dtype=torch.uint8) 69 | mask = mask.any(dim=2) 70 | masks.append(mask) 71 | if masks: 72 | masks = torch.stack(masks, dim=0) 73 | else: 74 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 75 | return masks 76 | 77 | 78 | class ConvertCocoPolysToMask(object): 79 | def __init__(self, return_masks=False): 80 | self.return_masks = return_masks 81 | 82 | def __call__(self, image, target): 83 | w, h = image.size 84 | 85 | image_id = target["image_id"] 86 | image_id = torch.tensor([image_id]) 87 | 88 | anno = target["annotations"] 89 | 90 | anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] 91 | 92 | boxes = [obj["bbox"] for obj in anno] 93 | # guard against no boxes via resizing 94 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 95 | boxes[:, 2:] += boxes[:, :2] 96 | boxes[:, 0::2].clamp_(min=0, max=w) 97 | boxes[:, 1::2].clamp_(min=0, max=h) 98 | 99 | classes = [obj["category_id"] for obj in anno] 100 | classes = torch.tensor(classes, dtype=torch.int64) 101 | 102 | if self.return_masks: 103 | segmentations = [obj["segmentation"] for obj in anno] 104 | masks = convert_coco_poly_to_mask(segmentations, h, w) 105 | 106 | keypoints = None 107 | if anno and "keypoints" in anno[0]: 108 | keypoints = [obj["keypoints"] for obj in anno] 109 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32) 110 | num_keypoints = keypoints.shape[0] 111 | if num_keypoints: 112 | keypoints = keypoints.view(num_keypoints, -1, 3) 113 | 114 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 115 | boxes = boxes[keep] 116 | classes = classes[keep] 117 | if self.return_masks: 118 | masks = masks[keep] 119 | if keypoints is not None: 120 | keypoints = keypoints[keep] 121 | 122 | target = {} 123 | target["boxes"] = boxes 124 | target["labels"] = classes 125 | if self.return_masks: 126 | target["masks"] = masks 127 | target["image_id"] = image_id 128 | if keypoints is not None: 129 | target["keypoints"] = keypoints 
130 | 131 | # for conversion to coco api 132 | area = torch.tensor([obj["area"] for obj in anno]) 133 | iscrowd = torch.tensor( 134 | [obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno] 135 | ) 136 | target["area"] = area[keep] 137 | target["iscrowd"] = iscrowd[keep] 138 | 139 | target["orig_size"] = torch.as_tensor([int(h), int(w)]) 140 | target["size"] = torch.as_tensor([int(h), int(w)]) 141 | 142 | return image, target 143 | 144 | 145 | def make_coco_transforms(image_set): 146 | 147 | normalize = T.Compose( 148 | [T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])] 149 | ) 150 | 151 | scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] 152 | 153 | if image_set == "train": 154 | return T.Compose( 155 | [ 156 | T.RandomHorizontalFlip(), 157 | T.RandomSelect( 158 | T.RandomResize(scales, max_size=1333), 159 | T.Compose( 160 | [ 161 | T.RandomResize([400, 500, 600]), 162 | T.RandomSizeCrop(384, 600), 163 | T.RandomResize(scales, max_size=1333), 164 | ] 165 | ), 166 | ), 167 | normalize, 168 | ] 169 | ) 170 | 171 | if image_set == "val": 172 | return T.Compose([T.RandomResize([800], max_size=1333), normalize,]) 173 | 174 | raise ValueError(f"unknown {image_set}") 175 | 176 | 177 | def build(image_set, args, eval_in_training_set): 178 | root = Path(args.coco_path) 179 | assert root.exists(), f"provided COCO path {root} does not exist" 180 | mode = "instances" 181 | PATHS = { 182 | "train": (root / "train2017", root / "annotations" / f"{mode}_train2017.json"), 183 | "val": (root / "val2017", root / "annotations" / f"{mode}_val2017.json"), 184 | } 185 | 186 | img_folder, ann_file = PATHS[image_set] 187 | if eval_in_training_set: 188 | image_set = "val" 189 | print("use validation dataset transforms") 190 | dataset = CocoDetection( 191 | img_folder, 192 | ann_file, 193 | transforms=make_coco_transforms(image_set), 194 | return_masks=args.masks, 195 | cache_mode=args.cache_mode, 196 | local_rank=get_local_rank(), 197 | local_size=get_local_size(), 198 | ) 199 | return dataset 200 | -------------------------------------------------------------------------------- /models/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # H-DETR 3 | # Copyright (c) 2022 Peking University & Microsoft Research Asia. All Rights Reserved. 4 | # Licensed under the MIT-style license found in the LICENSE file in the root directory 5 | # ------------------------------------------------------------------------------------------------ 6 | # Deformable DETR 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 9 | # ------------------------------------------------------------------------------------------------ 10 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 11 | # ------------------------------------------------------------------------------------------------ 12 | 13 | from __future__ import absolute_import 14 | from __future__ import print_function 15 | from __future__ import division 16 | 17 | import warnings 18 | import math 19 | 20 | import torch 21 | from torch import nn 22 | import torch.nn.functional as F 23 | from torch.nn.init import xavier_uniform_, constant_ 24 | 25 | from ..functions import MSDeformAttnFunction 26 | 27 | 28 | def _is_power_of_2(n): 29 | if (not isinstance(n, int)) or (n < 0): 30 | raise ValueError( 31 | "invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)) 32 | ) 33 | return (n & (n - 1) == 0) and n != 0 34 | 35 | 36 | class MSDeformAttn(nn.Module): 37 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 38 | """ 39 | Multi-Scale Deformable Attention Module 40 | :param d_model hidden dimension 41 | :param n_levels number of feature levels 42 | :param n_heads number of attention heads 43 | :param n_points number of sampling points per attention head per feature level 44 | """ 45 | super().__init__() 46 | if d_model % n_heads != 0: 47 | raise ValueError( 48 | "d_model must be divisible by n_heads, but got {} and {}".format( 49 | d_model, n_heads 50 | ) 51 | ) 52 | _d_per_head = d_model // n_heads 53 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 54 | if not _is_power_of_2(_d_per_head): 55 | warnings.warn( 56 | "You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 57 | "which is more efficient in our CUDA implementation." 
58 | ) 59 | 60 | self.im2col_step = 64 61 | 62 | self.d_model = d_model 63 | self.n_levels = n_levels 64 | self.n_heads = n_heads 65 | self.n_points = n_points 66 | 67 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 68 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 69 | self.value_proj = nn.Linear(d_model, d_model) 70 | self.output_proj = nn.Linear(d_model, d_model) 71 | 72 | self._reset_parameters() 73 | 74 | def _reset_parameters(self): 75 | constant_(self.sampling_offsets.weight.data, 0.0) 76 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * ( 77 | 2.0 * math.pi / self.n_heads 78 | ) 79 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 80 | grid_init = ( 81 | (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) 82 | .view(self.n_heads, 1, 1, 2) 83 | .repeat(1, self.n_levels, self.n_points, 1) 84 | ) 85 | for i in range(self.n_points): 86 | grid_init[:, :, i, :] *= i + 1 87 | with torch.no_grad(): 88 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 89 | constant_(self.attention_weights.weight.data, 0.0) 90 | constant_(self.attention_weights.bias.data, 0.0) 91 | xavier_uniform_(self.value_proj.weight.data) 92 | constant_(self.value_proj.bias.data, 0.0) 93 | xavier_uniform_(self.output_proj.weight.data) 94 | constant_(self.output_proj.bias.data, 0.0) 95 | 96 | @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) 97 | def forward( 98 | self, 99 | query, 100 | reference_points, 101 | input_flatten, 102 | input_spatial_shapes, 103 | input_level_start_index, 104 | input_padding_mask=None, 105 | ): 106 | """ 107 | :param query (N, Length_{query}, C) 108 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 109 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 110 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 111 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 112 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 113 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 114 | 115 | :return output (N, Length_{query}, C) 116 | """ 117 | N, Len_q, _ = query.shape 118 | N, Len_in, _ = input_flatten.shape 119 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 120 | 121 | value = self.value_proj(input_flatten) 122 | if input_padding_mask is not None: 123 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 124 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 125 | sampling_offsets = self.sampling_offsets(query).view( 126 | N, Len_q, self.n_heads, self.n_levels, self.n_points, 2 127 | ) 128 | attention_weights = self.attention_weights(query).view( 129 | N, Len_q, self.n_heads, self.n_levels * self.n_points 130 | ) 131 | attention_weights = F.softmax(attention_weights, -1).view( 132 | N, Len_q, self.n_heads, self.n_levels, self.n_points 133 | ) 134 | # N, Len_q, n_heads, n_levels, n_points, 2 135 | if reference_points.shape[-1] == 2: 136 | offset_normalizer = torch.stack( 137 | [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1 138 | ) 139 | sampling_locations = ( 140 | reference_points[:, :, None, :, None, :] 141 | + sampling_offsets / offset_normalizer[None, None, None, :, 
None, :] 142 | ) 143 | elif reference_points.shape[-1] == 4: 144 | sampling_locations = ( 145 | reference_points[:, :, None, :, None, :2] 146 | + sampling_offsets 147 | / self.n_points 148 | * reference_points[:, :, None, :, None, 2:] 149 | * 0.5 150 | ) 151 | else: 152 | raise ValueError( 153 | "Last dim of reference_points must be 2 or 4, but get {} instead.".format( 154 | reference_points.shape[-1] 155 | ) 156 | ) 157 | output = MSDeformAttnFunction.apply( 158 | value, 159 | input_spatial_shapes, 160 | input_level_start_index, 161 | sampling_locations, 162 | attention_weights, 163 | self.im2col_step, 164 | ) 165 | output = self.output_proj(output) 166 | return output 167 | -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | #include "cuda/ms_deform_im2col_cuda.cuh" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | 20 | at::Tensor ms_deform_attn_cuda_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step) 27 | { 28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 33 | 34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 39 | 40 | const int batch = value.size(0); 41 | const int spatial_size = value.size(1); 42 | const int num_heads = value.size(2); 43 | const int channels = value.size(3); 44 | 45 | const int num_levels = spatial_shapes.size(0); 46 | 47 | const int num_query = sampling_loc.size(1); 48 | const int num_point = sampling_loc.size(4); 49 | 50 | const int im2col_step_ = std::min(batch, im2col_step); 51 | 52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 53 | 54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 55 | 56 | const int batch_n = im2col_step_; 57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 58 | auto per_value_size = spatial_size * 
num_heads * channels; 59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 60 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 61 | for (int n = 0; n < batch/im2col_step_; ++n) 62 | { 63 | auto columns = output_n.select(0, n); 64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 66 | value.data() + n * im2col_step_ * per_value_size, 67 | spatial_shapes.data(), 68 | level_start_index.data(), 69 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 70 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 72 | columns.data()); 73 | 74 | })); 75 | } 76 | 77 | output = output.view({batch, num_query, num_heads*channels}); 78 | 79 | return output; 80 | } 81 | 82 | 83 | std::vector ms_deform_attn_cuda_backward( 84 | const at::Tensor &value, 85 | const at::Tensor &spatial_shapes, 86 | const at::Tensor &level_start_index, 87 | const at::Tensor &sampling_loc, 88 | const at::Tensor &attn_weight, 89 | const at::Tensor &grad_output, 90 | const int im2col_step) 91 | { 92 | 93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 99 | 100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 106 | 107 | const int batch = value.size(0); 108 | const int spatial_size = value.size(1); 109 | const int num_heads = value.size(2); 110 | const int channels = value.size(3); 111 | 112 | const int num_levels = spatial_shapes.size(0); 113 | 114 | const int num_query = sampling_loc.size(1); 115 | const int num_point = sampling_loc.size(4); 116 | 117 | const int im2col_step_ = std::min(batch, im2col_step); 118 | 119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 120 | 121 | auto grad_value = at::zeros_like(value); 122 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 123 | auto grad_attn_weight = at::zeros_like(attn_weight); 124 | 125 | const int batch_n = im2col_step_; 126 | auto per_value_size = spatial_size * num_heads * channels; 127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 130 | 131 | for (int n = 0; n < batch/im2col_step_; ++n) 132 | { 133 | auto grad_output_g = grad_output_n.select(0, n); 134 | AT_DISPATCH_FLOATING_TYPES(value.type(), 
"ms_deform_attn_backward_cuda", ([&] { 135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 136 | grad_output_g.data(), 137 | value.data() + n * im2col_step_ * per_value_size, 138 | spatial_shapes.data(), 139 | level_start_index.data(), 140 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 141 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 143 | grad_value.data() + n * im2col_step_ * per_value_size, 144 | grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 145 | grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); 146 | 147 | })); 148 | } 149 | 150 | return { 151 | grad_value, grad_sampling_loc, grad_attn_weight 152 | }; 153 | } -------------------------------------------------------------------------------- /tools/launch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------------------------------------------- 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # -------------------------------------------------------------------------------------------------------------------------- 6 | # Modified from https://github.com/pytorch/pytorch/blob/173f224570017b4b1a3a1a13d0bff280a54d9cd9/torch/distributed/launch.py 7 | # -------------------------------------------------------------------------------------------------------------------------- 8 | 9 | r""" 10 | `torch.distributed.launch` is a module that spawns up multiple distributed 11 | training processes on each of the training nodes. 12 | The utility can be used for single-node distributed training, in which one or 13 | more processes per node will be spawned. The utility can be used for either 14 | CPU training or GPU training. If the utility is used for GPU training, 15 | each distributed process will be operating on a single GPU. This can achieve 16 | well-improved single-node training performance. It can also be used in 17 | multi-node distributed training, by spawning up multiple processes on each node 18 | for well-improved multi-node distributed training performance as well. 19 | This will especially be benefitial for systems with multiple Infiniband 20 | interfaces that have direct-GPU support, since all of them can be utilized for 21 | aggregated communication bandwidth. 22 | In both cases of single-node distributed training or multi-node distributed 23 | training, this utility will launch the given number of processes per node 24 | (``--nproc_per_node``). If used for GPU training, this number needs to be less 25 | or euqal to the number of GPUs on the current system (``nproc_per_node``), 26 | and each process will be operating on a single GPU from *GPU 0 to 27 | GPU (nproc_per_node - 1)*. 28 | **How to use this module:** 29 | 1. Single-Node multi-process distributed training 30 | :: 31 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 32 | YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other 33 | arguments of your training script) 34 | 2. Multi-Node multi-process distributed training: (e.g. 
two nodes)
35 | Node 1: *(IP: 192.168.1.1, and has a free port: 1234)*
36 | ::
37 |     >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
38 |                --nnodes=2 --node_rank=0 --master_addr="192.168.1.1"
39 |                --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
40 |                and all other arguments of your training script)
41 | Node 2:
42 | ::
43 |     >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
44 |                --nnodes=2 --node_rank=1 --master_addr="192.168.1.1"
45 |                --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
46 |                and all other arguments of your training script)
47 | 3. To look up what optional arguments this module offers:
48 | ::
49 |     >>> python -m torch.distributed.launch --help
50 | **Important Notices:**
51 | 1. This utility and multi-process distributed (single-node or
52 | multi-node) GPU training currently only achieves the best performance using
53 | the NCCL distributed backend. Thus the NCCL backend is the recommended backend
54 | to use for GPU training.
55 | 2. In your training program, you must parse the command-line argument:
56 | ``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by this module.
57 | If your training program uses GPUs, you should ensure that your code only
58 | runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by:
59 | Parsing the local_rank argument
60 | ::
61 |     >>> import argparse
62 |     >>> parser = argparse.ArgumentParser()
63 |     >>> parser.add_argument("--local_rank", type=int)
64 |     >>> args = parser.parse_args()
65 | Set your device to local rank using either
66 | ::
67 |     >>> torch.cuda.set_device(args.local_rank)  # before your code runs
68 | or
69 | ::
70 |     >>> with torch.cuda.device(args.local_rank):
71 |     >>>     # your code to run
72 | 3. In your training program, you are supposed to call the following function
73 | at the beginning to start the distributed backend. You need to make sure that
74 | the init_method uses ``env://``, which is the only ``init_method`` supported
75 | by this module.
76 | ::
77 |     torch.distributed.init_process_group(backend='YOUR BACKEND',
78 |                                          init_method='env://')
79 | 4. In your training program, you can either use regular distributed functions
80 | or use the :func:`torch.nn.parallel.DistributedDataParallel` module. If your
81 | training program uses GPUs for training and you would like to use the
82 | :func:`torch.nn.parallel.DistributedDataParallel` module,
83 | here is how to configure it.
84 | ::
85 |     model = torch.nn.parallel.DistributedDataParallel(model,
86 |                                                       device_ids=[args.local_rank],
87 |                                                       output_device=args.local_rank)
88 | Please ensure that the ``device_ids`` argument is set to the only GPU device id
89 | that your code will be operating on. This is generally the local rank of the
90 | process. In other words, ``device_ids`` needs to be ``[args.local_rank]``,
91 | and ``output_device`` needs to be ``args.local_rank`` in order to use this
92 | utility.
93 | 5. Note that this trimmed-down launcher passes ``local_rank`` to the
94 | subprocesses only via the environment variable ``LOCAL_RANK`` (it does not
95 | append a ``--local_rank`` argument), so make sure your training program
96 | falls back to ``os.environ['LOCAL_RANK']`` when the flag is absent.
97 | 
98 | .. warning::
99 |     ``local_rank`` is NOT globally unique: it is only unique per process
100 |     on a machine. Thus, don't use it to decide if you should, e.g.,
101 |     write to a networked filesystem. See
102 |     https://github.com/pytorch/pytorch/issues/12042 for an example of
103 |     how things can go wrong if you don't do this correctly.
104 | """
105 | 
106 | 
107 | import sys
108 | import subprocess
109 | import os
110 | import socket
111 | from argparse import ArgumentParser, REMAINDER
112 | 
113 | import torch
114 | 
115 | 
116 | def parse_args():
117 |     """
118 |     Helper function parsing the command line options
119 |     @retval ArgumentParser
120 |     """
121 |     parser = ArgumentParser(
122 |         description="PyTorch distributed training launch "
123 |         "helper utility that will spawn up "
124 |         "multiple distributed processes"
125 |     )
126 | 
127 |     # Optional arguments for the launch helper
128 |     parser.add_argument(
129 |         "--nnodes",
130 |         type=int,
131 |         default=1,
132 |         help="The number of nodes to use for distributed " "training",
133 |     )
134 |     parser.add_argument(
135 |         "--node_rank",
136 |         type=int,
137 |         default=0,
138 |         help="The rank of the node for multi-node distributed " "training",
139 |     )
140 |     parser.add_argument(
141 |         "--nproc_per_node",
142 |         type=int,
143 |         default=1,
144 |         help="The number of processes to launch on each node, "
145 |         "for GPU training, this is recommended to be set "
146 |         "to the number of GPUs in your system so that "
147 |         "each process can be bound to a single GPU.",
148 |     )
149 |     parser.add_argument(
150 |         "--master_addr",
151 |         default="127.0.0.1",
152 |         type=str,
153 |         help="Master node (rank 0)'s address, should be either "
154 |         "the IP address or the hostname of node 0, for "
155 |         "single node multi-proc training, the "
156 |         "--master_addr can simply be 127.0.0.1",
157 |     )
158 |     parser.add_argument(
159 |         "--master_port",
160 |         default=29500,
161 |         type=int,
162 |         help="Master node (rank 0)'s free port that needs to "
163 |         "be used for communication during distributed "
164 |         "training",
165 |     )
166 | 
167 |     # positional
168 |     parser.add_argument(
169 |         "training_script",
170 |         type=str,
171 |         help="The full path to the single GPU training "
172 |         "program/script to be launched in parallel, "
173 |         "followed by all the arguments for the "
174 |         "training script",
175 |     )
176 | 
177 |     # rest from the training program
178 |     parser.add_argument("training_script_args", nargs=REMAINDER)
179 |     return parser.parse_args()
180 | 
181 | 
182 | def main():
183 |     args = parse_args()
184 | 
185 |     # world size in terms of number of processes
186 |     dist_world_size = args.nproc_per_node * args.nnodes
187 | 
188 |     # set PyTorch distributed related environmental variables
189 |     current_env = os.environ.copy()
190 |     current_env["MASTER_ADDR"] = args.master_addr
191 |     current_env["MASTER_PORT"] = str(args.master_port)
192 |     current_env["WORLD_SIZE"] = str(dist_world_size)
193 | 
194 |     processes = []
195 | 
196 |     for local_rank in range(0, args.nproc_per_node):
197 |         # each process's rank
198 |         dist_rank = args.nproc_per_node * args.node_rank + local_rank
199 |         current_env["RANK"] = str(dist_rank)
200 |         current_env["LOCAL_RANK"] = str(local_rank)
201 | 
202 |         cmd = [args.training_script] + args.training_script_args
203 | 
204 |         process = subprocess.Popen(cmd, env=current_env)
205 |         processes.append(process)
206 | 
207 |     for process in processes:
208 |         process.wait()
209 |         if process.returncode != 0:
210 |             raise subprocess.CalledProcessError(
211 |                 returncode=process.returncode, cmd=process.args
212 |             )
213 | 
214 | 
215 | if __name__ == "__main__":
216 |     main()
217 | 
-------------------------------------------------------------------------------- /datasets/transforms.py: 
-------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Transforms and data augmentation for both image + bbox. 12 | """ 13 | import random 14 | 15 | import PIL 16 | import torch 17 | import torchvision.transforms as T 18 | import torchvision.transforms.functional as F 19 | 20 | from util.box_ops import box_xyxy_to_cxcywh 21 | from util.misc import interpolate 22 | 23 | 24 | def crop(image, target, region): 25 | cropped_image = F.crop(image, *region) 26 | 27 | target = target.copy() 28 | i, j, h, w = region 29 | 30 | # should we do something wrt the original size? 31 | target["size"] = torch.tensor([h, w]) 32 | 33 | fields = ["labels", "area", "iscrowd"] 34 | 35 | if "boxes" in target: 36 | boxes = target["boxes"] 37 | max_size = torch.as_tensor([w, h], dtype=torch.float32) 38 | cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) 39 | cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) 40 | cropped_boxes = cropped_boxes.clamp(min=0) 41 | area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) 42 | target["boxes"] = cropped_boxes.reshape(-1, 4) 43 | target["area"] = area 44 | fields.append("boxes") 45 | 46 | if "masks" in target: 47 | # FIXME should we update the area here if there are no boxes? 
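        # masks are stored as (num_instances, H, W), so cropping the region
        # (top=i, left=j, height=h, width=w) is a plain slice over the two
        # spatial dimensions: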
48 | target["masks"] = target["masks"][:, i : i + h, j : j + w] 49 | fields.append("masks") 50 | 51 | # remove elements for which the boxes or masks that have zero area 52 | if "boxes" in target or "masks" in target: 53 | # favor boxes selection when defining which elements to keep 54 | # this is compatible with previous implementation 55 | if "boxes" in target: 56 | cropped_boxes = target["boxes"].reshape(-1, 2, 2) 57 | keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) 58 | else: 59 | keep = target["masks"].flatten(1).any(1) 60 | 61 | for field in fields: 62 | target[field] = target[field][keep] 63 | 64 | return cropped_image, target 65 | 66 | 67 | def hflip(image, target): 68 | flipped_image = F.hflip(image) 69 | 70 | w, h = image.size 71 | 72 | target = target.copy() 73 | if "boxes" in target: 74 | boxes = target["boxes"] 75 | boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor( 76 | [-1, 1, -1, 1] 77 | ) + torch.as_tensor([w, 0, w, 0]) 78 | target["boxes"] = boxes 79 | 80 | if "masks" in target: 81 | target["masks"] = target["masks"].flip(-1) 82 | 83 | return flipped_image, target 84 | 85 | 86 | def resize(image, target, size, max_size=None): 87 | # size can be min_size (scalar) or (w, h) tuple 88 | 89 | def get_size_with_aspect_ratio(image_size, size, max_size=None): 90 | w, h = image_size 91 | if max_size is not None: 92 | min_original_size = float(min((w, h))) 93 | max_original_size = float(max((w, h))) 94 | if max_original_size / min_original_size * size > max_size: 95 | size = int(round(max_size * min_original_size / max_original_size)) 96 | 97 | if (w <= h and w == size) or (h <= w and h == size): 98 | return (h, w) 99 | 100 | if w < h: 101 | ow = size 102 | oh = int(size * h / w) 103 | else: 104 | oh = size 105 | ow = int(size * w / h) 106 | 107 | return (oh, ow) 108 | 109 | def get_size(image_size, size, max_size=None): 110 | if isinstance(size, (list, tuple)): 111 | return size[::-1] 112 | else: 113 | return get_size_with_aspect_ratio(image_size, size, max_size) 114 | 115 | size = get_size(image.size, size, max_size) 116 | rescaled_image = F.resize(image, size) 117 | 118 | if target is None: 119 | return rescaled_image, None 120 | 121 | ratios = tuple( 122 | float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size) 123 | ) 124 | ratio_width, ratio_height = ratios 125 | 126 | target = target.copy() 127 | if "boxes" in target: 128 | boxes = target["boxes"] 129 | scaled_boxes = boxes * torch.as_tensor( 130 | [ratio_width, ratio_height, ratio_width, ratio_height] 131 | ) 132 | target["boxes"] = scaled_boxes 133 | 134 | if "area" in target: 135 | area = target["area"] 136 | scaled_area = area * (ratio_width * ratio_height) 137 | target["area"] = scaled_area 138 | 139 | h, w = size 140 | target["size"] = torch.tensor([h, w]) 141 | 142 | if "masks" in target: 143 | target["masks"] = ( 144 | interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] 145 | > 0.5 146 | ) 147 | 148 | return rescaled_image, target 149 | 150 | 151 | def pad(image, target, padding): 152 | # assumes that we only pad on the bottom right corners 153 | padded_image = F.pad(image, (0, 0, padding[0], padding[1])) 154 | if target is None: 155 | return padded_image, None 156 | target = target.copy() 157 | # should we do something wrt the original size? 
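    # PIL reports Image.size as (width, height) while targets store size as
    # (height, width), hence the [::-1] reversal below.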
158 |     target["size"] = torch.tensor(padded_image.size[::-1])
159 |     if "masks" in target:
160 |         target["masks"] = torch.nn.functional.pad(
161 |             target["masks"], (0, padding[0], 0, padding[1])
162 |         )
163 |     return padded_image, target
164 | 
165 | 
166 | class RandomCrop(object):
167 |     def __init__(self, size):
168 |         self.size = size
169 | 
170 |     def __call__(self, img, target):
171 |         region = T.RandomCrop.get_params(img, self.size)
172 |         return crop(img, target, region)
173 | 
174 | 
175 | class RandomSizeCrop(object):
176 |     def __init__(self, min_size: int, max_size: int):
177 |         self.min_size = min_size
178 |         self.max_size = max_size
179 | 
180 |     def __call__(self, img: PIL.Image.Image, target: dict):
181 |         w = random.randint(self.min_size, min(img.width, self.max_size))
182 |         h = random.randint(self.min_size, min(img.height, self.max_size))
183 |         region = T.RandomCrop.get_params(img, [h, w])
184 |         return crop(img, target, region)
185 | 
186 | 
187 | class CenterCrop(object):
188 |     def __init__(self, size):
189 |         self.size = size
190 | 
191 |     def __call__(self, img, target):
192 |         image_width, image_height = img.size
193 |         crop_height, crop_width = self.size
194 |         crop_top = int(round((image_height - crop_height) / 2.0))
195 |         crop_left = int(round((image_width - crop_width) / 2.0))
196 |         return crop(img, target, (crop_top, crop_left, crop_height, crop_width))
197 | 
198 | 
199 | class RandomHorizontalFlip(object):
200 |     def __init__(self, p=0.5):
201 |         self.p = p
202 | 
203 |     def __call__(self, img, target):
204 |         if random.random() < self.p:
205 |             return hflip(img, target)
206 |         return img, target
207 | 
208 | 
209 | class RandomResize(object):
210 |     def __init__(self, sizes, max_size=None):
211 |         assert isinstance(sizes, (list, tuple))
212 |         self.sizes = sizes
213 |         self.max_size = max_size
214 | 
215 |     def __call__(self, img, target=None):
216 |         size = random.choice(self.sizes)
217 |         return resize(img, target, size, self.max_size)
218 | 
219 | 
220 | class RandomPad(object):
221 |     def __init__(self, max_pad):
222 |         self.max_pad = max_pad
223 | 
224 |     def __call__(self, img, target):
225 |         pad_x = random.randint(0, self.max_pad)
226 |         pad_y = random.randint(0, self.max_pad)
227 |         return pad(img, target, (pad_x, pad_y))
228 | 
229 | 
230 | class RandomSelect(object):
231 |     """
232 |     Randomly selects between transforms1 and transforms2,
233 |     with probability p for transforms1 and (1 - p) for transforms2
234 |     """
235 | 
236 |     def __init__(self, transforms1, transforms2, p=0.5):
237 |         self.transforms1 = transforms1
238 |         self.transforms2 = transforms2
239 |         self.p = p
240 | 
241 |     def __call__(self, img, target):
242 |         if random.random() < self.p:
243 |             return self.transforms1(img, target)
244 |         return self.transforms2(img, target)
245 | 
246 | 
247 | class ToTensor(object):
248 |     def __call__(self, img, target):
249 |         return F.to_tensor(img), target
250 | 
251 | 
252 | class RandomErasing(object):
253 |     def __init__(self, *args, **kwargs):
254 |         self.eraser = T.RandomErasing(*args, **kwargs)
255 | 
256 |     def __call__(self, img, target):
257 |         return self.eraser(img), target
258 | 
259 | 
260 | class Normalize(object):
261 |     def __init__(self, mean, std):
262 |         self.mean = mean
263 |         self.std = std
264 | 
265 |     def __call__(self, image, target=None):
266 |         image = F.normalize(image, mean=self.mean, std=self.std)
267 |         if target is None:
268 |             return image, None
269 |         target = target.copy()
270 |         h, w = image.shape[-2:]
271 |         if "boxes" in target:
272 |             boxes = target["boxes"]
273 |             boxes = box_xyxy_to_cxcywh(boxes)
274 | 
boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) 275 | target["boxes"] = boxes 276 | return image, target 277 | 278 | 279 | class Compose(object): 280 | def __init__(self, transforms): 281 | self.transforms = transforms 282 | 283 | def __call__(self, image, target): 284 | for t in self.transforms: 285 | image, target = t(image, target) 286 | return image, target 287 | 288 | def __repr__(self): 289 | format_string = self.__class__.__name__ + "(" 290 | for t in self.transforms: 291 | format_string += "\n" 292 | format_string += " {0}".format(t) 293 | format_string += "\n)" 294 | return format_string 295 | -------------------------------------------------------------------------------- /datasets/coco_eval.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | COCO evaluator that works in distributed mode. 12 | 13 | Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py 14 | The difference is that there is less copy-pasting from pycocotools 15 | at the end of the file, as python3 can suppress prints with contextlib 16 | """ 17 | import os 18 | import contextlib 19 | import copy 20 | import numpy as np 21 | import torch 22 | 23 | from pycocotools.cocoeval import COCOeval 24 | from pycocotools.coco import COCO 25 | import pycocotools.mask as mask_util 26 | 27 | from util.misc import all_gather 28 | 29 | 30 | class CocoEvaluator(object): 31 | def __init__(self, coco_gt, iou_types): 32 | assert isinstance(iou_types, (list, tuple)) 33 | coco_gt = copy.deepcopy(coco_gt) 34 | self.coco_gt = coco_gt 35 | 36 | self.iou_types = iou_types 37 | self.coco_eval = {} 38 | for iou_type in iou_types: 39 | self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) 40 | 41 | self.img_ids = [] 42 | self.eval_imgs = {k: [] for k in iou_types} 43 | 44 | def update(self, predictions): 45 | img_ids = list(np.unique(list(predictions.keys()))) 46 | self.img_ids.extend(img_ids) 47 | 48 | for iou_type in self.iou_types: 49 | results = self.prepare(predictions, iou_type) 50 | 51 | # suppress pycocotools prints 52 | with open(os.devnull, "w") as devnull: 53 | with contextlib.redirect_stdout(devnull): 54 | coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() 55 | coco_eval = self.coco_eval[iou_type] 56 | 57 | coco_eval.cocoDt = coco_dt 58 | coco_eval.params.imgIds = list(img_ids) 59 | img_ids, eval_imgs = evaluate(coco_eval) 60 | 61 | self.eval_imgs[iou_type].append(eval_imgs) 62 | 63 | def synchronize_between_processes(self): 64 | for iou_type in self.iou_types: 65 | self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) 66 | create_common_coco_eval( 67 | self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type] 68 | ) 69 | 70 | def accumulate(self): 71 | for coco_eval in self.coco_eval.values(): 72 | coco_eval.accumulate() 73 | 74 | def summarize(self): 75 | for iou_type, coco_eval in self.coco_eval.items(): 76 | print("IoU metric: 
{}".format(iou_type)) 77 | coco_eval.summarize() 78 | 79 | def prepare(self, predictions, iou_type): 80 | if iou_type == "bbox": 81 | return self.prepare_for_coco_detection(predictions) 82 | elif iou_type == "segm": 83 | return self.prepare_for_coco_segmentation(predictions) 84 | elif iou_type == "keypoints": 85 | return self.prepare_for_coco_keypoint(predictions) 86 | else: 87 | raise ValueError("Unknown iou type {}".format(iou_type)) 88 | 89 | def prepare_for_coco_detection(self, predictions): 90 | coco_results = [] 91 | for original_id, prediction in predictions.items(): 92 | if len(prediction) == 0: 93 | continue 94 | 95 | boxes = prediction["boxes"] 96 | boxes = convert_to_xywh(boxes).tolist() 97 | scores = prediction["scores"].tolist() 98 | labels = prediction["labels"].tolist() 99 | 100 | coco_results.extend( 101 | [ 102 | { 103 | "image_id": original_id, 104 | "category_id": labels[k], 105 | "bbox": box, 106 | "score": scores[k], 107 | } 108 | for k, box in enumerate(boxes) 109 | ] 110 | ) 111 | return coco_results 112 | 113 | def prepare_for_coco_segmentation(self, predictions): 114 | coco_results = [] 115 | for original_id, prediction in predictions.items(): 116 | if len(prediction) == 0: 117 | continue 118 | 119 | scores = prediction["scores"] 120 | labels = prediction["labels"] 121 | masks = prediction["masks"] 122 | 123 | masks = masks > 0.5 124 | 125 | scores = prediction["scores"].tolist() 126 | labels = prediction["labels"].tolist() 127 | 128 | rles = [ 129 | mask_util.encode( 130 | np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F") 131 | )[0] 132 | for mask in masks 133 | ] 134 | for rle in rles: 135 | rle["counts"] = rle["counts"].decode("utf-8") 136 | 137 | coco_results.extend( 138 | [ 139 | { 140 | "image_id": original_id, 141 | "category_id": labels[k], 142 | "segmentation": rle, 143 | "score": scores[k], 144 | } 145 | for k, rle in enumerate(rles) 146 | ] 147 | ) 148 | return coco_results 149 | 150 | def prepare_for_coco_keypoint(self, predictions): 151 | coco_results = [] 152 | for original_id, prediction in predictions.items(): 153 | if len(prediction) == 0: 154 | continue 155 | 156 | boxes = prediction["boxes"] 157 | boxes = convert_to_xywh(boxes).tolist() 158 | scores = prediction["scores"].tolist() 159 | labels = prediction["labels"].tolist() 160 | keypoints = prediction["keypoints"] 161 | keypoints = keypoints.flatten(start_dim=1).tolist() 162 | 163 | coco_results.extend( 164 | [ 165 | { 166 | "image_id": original_id, 167 | "category_id": labels[k], 168 | "keypoints": keypoint, 169 | "score": scores[k], 170 | } 171 | for k, keypoint in enumerate(keypoints) 172 | ] 173 | ) 174 | return coco_results 175 | 176 | 177 | def convert_to_xywh(boxes): 178 | xmin, ymin, xmax, ymax = boxes.unbind(1) 179 | return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) 180 | 181 | 182 | def merge(img_ids, eval_imgs): 183 | all_img_ids = all_gather(img_ids) 184 | all_eval_imgs = all_gather(eval_imgs) 185 | 186 | merged_img_ids = [] 187 | for p in all_img_ids: 188 | merged_img_ids.extend(p) 189 | 190 | merged_eval_imgs = [] 191 | for p in all_eval_imgs: 192 | merged_eval_imgs.append(p) 193 | 194 | merged_img_ids = np.array(merged_img_ids) 195 | merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) 196 | 197 | # keep only unique (and in sorted order) images 198 | merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) 199 | merged_eval_imgs = merged_eval_imgs[..., idx] 200 | 201 | return merged_img_ids, merged_eval_imgs 202 | 203 | 204 | def 
create_common_coco_eval(coco_eval, img_ids, eval_imgs): 205 | img_ids, eval_imgs = merge(img_ids, eval_imgs) 206 | img_ids = list(img_ids) 207 | eval_imgs = list(eval_imgs.flatten()) 208 | 209 | coco_eval.evalImgs = eval_imgs 210 | coco_eval.params.imgIds = img_ids 211 | coco_eval._paramsEval = copy.deepcopy(coco_eval.params) 212 | 213 | 214 | ################################################################# 215 | # From pycocotools, just removed the prints and fixed 216 | # a Python3 bug about unicode not defined 217 | ################################################################# 218 | 219 | 220 | def evaluate(self): 221 | """ 222 | Run per image evaluation on given images and return the results (a list of dicts) 223 | :return: p.imgIds, evalImgs (this modified copy returns them instead of storing them in self.evalImgs) 224 | """ 225 | # tic = time.time() 226 | # print('Running per image evaluation...') 227 | p = self.params 228 | # add backward compatibility if useSegm is specified in params 229 | if p.useSegm is not None: 230 | p.iouType = "segm" if p.useSegm == 1 else "bbox" 231 | print( 232 | "useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType) 233 | ) 234 | # print('Evaluate annotation type *{}*'.format(p.iouType)) 235 | p.imgIds = list(np.unique(p.imgIds)) 236 | if p.useCats: 237 | p.catIds = list(np.unique(p.catIds)) 238 | p.maxDets = sorted(p.maxDets) 239 | self.params = p 240 | 241 | self._prepare() 242 | # loop through images, area range, max detection number 243 | catIds = p.catIds if p.useCats else [-1] 244 | 245 | if p.iouType == "segm" or p.iouType == "bbox": 246 | computeIoU = self.computeIoU 247 | elif p.iouType == "keypoints": 248 | computeIoU = self.computeOks 249 | self.ious = { 250 | (imgId, catId): computeIoU(imgId, catId) 251 | for imgId in p.imgIds 252 | for catId in catIds 253 | } 254 | 255 | evaluateImg = self.evaluateImg 256 | maxDet = p.maxDets[-1] 257 | evalImgs = [ 258 | evaluateImg(imgId, catId, areaRng, maxDet) 259 | for catId in catIds 260 | for areaRng in p.areaRng 261 | for imgId in p.imgIds 262 | ] 263 | # this is NOT in the pycocotools code, but could be done outside 264 | evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) 265 | self._paramsEval = copy.deepcopy(self.params) 266 | # toc = time.time() 267 | # print('DONE (t={:0.2f}s).'.format(toc-tic)) 268 | return p.imgIds, evalImgs 269 | 270 | 271 | ################################################################# 272 | # end of straight copy from pycocotools, just removing the prints 273 | ################################################################# 274 | -------------------------------------------------------------------------------- /models/backbone.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # H-DETR 3 | # Copyright (c) 2022 Peking University & Microsoft Research Asia. All Rights Reserved. 4 | # Licensed under the MIT-style license found in the LICENSE file in the root directory 5 | # ------------------------------------------------------------------------ 6 | # Deformable DETR 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 9 | # ------------------------------------------------------------------------ 10 | # Modified from DETR (https://github.com/facebookresearch/detr) 11 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 12 | # ------------------------------------------------------------------------ 13 | 14 | """ 15 | Backbone modules. 16 | """ 17 | from collections import OrderedDict 18 | 19 | import torch 20 | import torch.nn.functional as F 21 | import torchvision 22 | from torch import nn 23 | from torchvision.models._utils import IntermediateLayerGetter 24 | from typing import Dict, List 25 | 26 | from util.misc import NestedTensor, is_main_process 27 | 28 | from .position_encoding import build_position_encoding 29 | from .swin_transformer import SwinTransformer 30 | 31 | 32 | class FrozenBatchNorm2d(torch.nn.Module): 33 | """ 34 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 35 | 36 | Copy-paste from torchvision.misc.ops with added eps before rsqrt, 37 | without which any model other than torchvision.models.resnet[18,34,50,101] 38 | produces nans. 39 | """ 40 | 41 | def __init__(self, n, eps=1e-5): 42 | super(FrozenBatchNorm2d, self).__init__() 43 | self.register_buffer("weight", torch.ones(n)) 44 | self.register_buffer("bias", torch.zeros(n)) 45 | self.register_buffer("running_mean", torch.zeros(n)) 46 | self.register_buffer("running_var", torch.ones(n)) 47 | self.eps = eps 48 | 49 | def _load_from_state_dict( 50 | self, 51 | state_dict, 52 | prefix, 53 | local_metadata, 54 | strict, 55 | missing_keys, 56 | unexpected_keys, 57 | error_msgs, 58 | ): 59 | num_batches_tracked_key = prefix + "num_batches_tracked" 60 | if num_batches_tracked_key in state_dict: 61 | del state_dict[num_batches_tracked_key] 62 | 63 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 64 | state_dict, 65 | prefix, 66 | local_metadata, 67 | strict, 68 | missing_keys, 69 | unexpected_keys, 70 | error_msgs, 71 | ) 72 | 73 | def forward(self, x): 74 | # move reshapes to the beginning 75 | # to make it fuser-friendly 76 | w = self.weight.reshape(1, -1, 1, 1) 77 | b = self.bias.reshape(1, -1, 1, 1) 78 | rv = self.running_var.reshape(1, -1, 1, 1) 79 | rm = self.running_mean.reshape(1, -1, 1, 1) 80 | eps = self.eps 81 | scale = w * (rv + eps).rsqrt() 82 | bias = b - rm * scale 83 | return x * scale + bias 84 | 85 | 86 | class BackboneBase(nn.Module): 87 | def __init__( 88 | self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool 89 | ): 90 | super().__init__() 91 | for name, parameter in backbone.named_parameters(): 92 | if ( 93 | not train_backbone 94 | or "layer2" not in name 95 | and "layer3" not in name 96 | and "layer4" not in name 97 | ): 98 | parameter.requires_grad_(False) 99 | if return_interm_layers: 100 | # return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} 101 | return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"} 102 | self.strides = [8, 16, 32] 103 | self.num_channels = [512, 1024, 2048] 104 | else: 105 | return_layers = {"layer4": "0"} 106 | self.strides = [32] 107 | self.num_channels = [2048] 108 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 109 | 110 | def forward(self, tensor_list: NestedTensor): 111 | xs = self.body(tensor_list.tensors) 112 | out: Dict[str, NestedTensor] = {} 113 | for name, x in xs.items(): 114 | m = tensor_list.mask 115 | assert m is not None 116 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 117 | out[name] = NestedTensor(x, mask) 118 | return out 119 | 120 | 121 | class Backbone(BackboneBase): 122 | """ResNet backbone with frozen BatchNorm.""" 123 | 124 | def __init__( 125 | self, 126 | name: str, 127 | train_backbone: bool, 
128 | return_interm_layers: bool, 129 | dilation: bool, 130 | ): 131 | norm_layer = FrozenBatchNorm2d 132 | backbone = getattr(torchvision.models, name)( 133 | replace_stride_with_dilation=[False, False, dilation], 134 | pretrained=is_main_process(), 135 | norm_layer=norm_layer, 136 | ) 137 | assert name not in ("resnet18", "resnet34"), "number of channels are hard coded" 138 | super().__init__(backbone, train_backbone, return_interm_layers) 139 | if dilation: 140 | self.strides[-1] = self.strides[-1] // 2 141 | 142 | 143 | class TransformerBackbone(nn.Module): 144 | def __init__( 145 | self, backbone: str, train_backbone: bool, return_interm_layers: bool, args 146 | ): 147 | super().__init__() 148 | out_indices = (1, 2, 3) 149 | if backbone == "swin_tiny": 150 | backbone = SwinTransformer( 151 | embed_dim=96, 152 | depths=[2, 2, 6, 2], 153 | num_heads=[3, 6, 12, 24], 154 | window_size=7, 155 | ape=False, 156 | drop_path_rate=args.drop_path_rate, 157 | patch_norm=True, 158 | use_checkpoint=True, 159 | out_indices=out_indices, 160 | ) 161 | embed_dim = 96 162 | backbone.init_weights(args.pretrained_backbone_path) 163 | elif backbone == "swin_small": 164 | backbone = SwinTransformer( 165 | embed_dim=96, 166 | depths=[2, 2, 18, 2], 167 | num_heads=[3, 6, 12, 24], 168 | window_size=7, 169 | ape=False, 170 | drop_path_rate=args.drop_path_rate, 171 | patch_norm=True, 172 | use_checkpoint=True, 173 | out_indices=out_indices, 174 | ) 175 | embed_dim = 96 176 | backbone.init_weights(args.pretrained_backbone_path) 177 | elif backbone == "swin_large": 178 | backbone = SwinTransformer( 179 | embed_dim=192, 180 | depths=[2, 2, 18, 2], 181 | num_heads=[6, 12, 24, 48], 182 | window_size=7, 183 | ape=False, 184 | drop_path_rate=args.drop_path_rate, 185 | patch_norm=True, 186 | use_checkpoint=True, 187 | out_indices=out_indices, 188 | ) 189 | embed_dim = 192 190 | backbone.init_weights(args.pretrained_backbone_path) 191 | elif backbone == "swin_large_window12": 192 | backbone = SwinTransformer( 193 | pretrain_img_size=384, 194 | embed_dim=192, 195 | depths=[2, 2, 18, 2], 196 | num_heads=[6, 12, 24, 48], 197 | window_size=12, 198 | ape=False, 199 | drop_path_rate=args.drop_path_rate, 200 | patch_norm=True, 201 | use_checkpoint=True, 202 | out_indices=out_indices, 203 | ) 204 | embed_dim = 192 205 | backbone.init_weights(args.pretrained_backbone_path) 206 | else: 207 | raise NotImplementedError 208 | 209 | for name, parameter in backbone.named_parameters(): 210 | # TODO: freeze some layers? 
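# note: unlike BackboneBase above, which can leave layers 2-4 trainable, this branch currently freezes either all Swin parameters or none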
211 | if not train_backbone: 212 | parameter.requires_grad_(False) 213 | 214 | if return_interm_layers: 215 | 216 | self.strides = [8, 16, 32] 217 | self.num_channels = [ 218 | embed_dim * 2, 219 | embed_dim * 4, 220 | embed_dim * 8, 221 | ] 222 | else: 223 | self.strides = [32] 224 | self.num_channels = [embed_dim * 8] 225 | 226 | self.body = backbone 227 | 228 | def forward(self, tensor_list: NestedTensor): 229 | xs = self.body(tensor_list.tensors) 230 | 231 | out: Dict[str, NestedTensor] = {} 232 | for name, x in xs.items(): 233 | m = tensor_list.mask 234 | assert m is not None 235 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 236 | out[name] = NestedTensor(x, mask) 237 | return out 238 | 239 | 240 | class Joiner(nn.Sequential): 241 | def __init__(self, backbone, position_embedding): 242 | super().__init__(backbone, position_embedding) 243 | self.strides = backbone.strides 244 | self.num_channels = backbone.num_channels 245 | 246 | def forward(self, tensor_list: NestedTensor): 247 | xs = self[0](tensor_list) 248 | out: List[NestedTensor] = [] 249 | pos = [] 250 | for name, x in sorted(xs.items()): 251 | out.append(x) 252 | 253 | # position encoding 254 | for x in out: 255 | pos.append(self[1](x).to(x.tensors.dtype)) 256 | 257 | return out, pos 258 | 259 | 260 | def build_backbone(args): 261 | position_embedding = build_position_encoding(args) 262 | train_backbone = args.lr_backbone > 0 263 | return_interm_layers = args.masks or (args.num_feature_levels > 1) 264 | if "resnet" in args.backbone: 265 | backbone = Backbone( 266 | args.backbone, train_backbone, return_interm_layers, args.dilation, 267 | ) 268 | else: 269 | backbone = TransformerBackbone( 270 | args.backbone, train_backbone, return_interm_layers, args 271 | ) 272 | model = Joiner(backbone, position_embedding) 273 | return model 274 | -------------------------------------------------------------------------------- /engine.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # H-DETR 3 | # Copyright (c) 2022 Peking University & Microsoft Research Asia. All Rights Reserved. 4 | # Licensed under the MIT-style license found in the LICENSE file in the root directory 5 | # ------------------------------------------------------------------------ 6 | # Deformable DETR 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 9 | # ------------------------------------------------------------------------ 10 | # Modified from DETR (https://github.com/facebookresearch/detr) 11 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 12 | # ------------------------------------------------------------------------ 13 | 14 | """ 15 | Train and eval functions used in main.py 16 | """ 17 | import math 18 | import os 19 | import sys 20 | from typing import Iterable 21 | import copy 22 | 23 | import wandb 24 | import torch 25 | import util.misc as utils 26 | from datasets.coco_eval import CocoEvaluator 27 | from datasets.panoptic_eval import PanopticEvaluator 28 | from datasets.data_prefetcher import data_prefetcher 29 | 30 | scaler = torch.cuda.amp.GradScaler() 31 | 32 | 33 | def train_hybrid(outputs, targets, k_one2many, criterion, lambda_one2many): 34 | # one-to-one-loss 35 | loss_dict = criterion(outputs, targets) 36 | multi_targets = copy.deepcopy(targets) 37 | # repeat the targets 38 | for target in multi_targets: 39 | target["boxes"] = target["boxes"].repeat(k_one2many, 1) 40 | target["labels"] = target["labels"].repeat(k_one2many) 41 | 42 | outputs_one2many = dict() 43 | outputs_one2many["pred_logits"] = outputs["pred_logits_one2many"] 44 | outputs_one2many["pred_boxes"] = outputs["pred_boxes_one2many"] 45 | outputs_one2many["aux_outputs"] = outputs["aux_outputs_one2many"] 46 | 47 | # one-to-many loss 48 | loss_dict_one2many = criterion(outputs_one2many, multi_targets) 49 | for key, value in loss_dict_one2many.items(): 50 | if key + "_one2many" in loss_dict.keys(): 51 | loss_dict[key + "_one2many"] += value * lambda_one2many 52 | else: 53 | loss_dict[key + "_one2many"] = value * lambda_one2many 54 | return loss_dict 55 | 56 | 57 | def train_one_epoch( 58 | model: torch.nn.Module, 59 | criterion: torch.nn.Module, 60 | data_loader: Iterable, 61 | optimizer: torch.optim.Optimizer, 62 | device: torch.device, 63 | epoch: int, 64 | max_norm: float = 0, 65 | k_one2many=1, 66 | lambda_one2many=1.0, 67 | use_wandb=False, 68 | use_fp16=False, 69 | ): 70 | model.train() 71 | criterion.train() 72 | metric_logger = utils.MetricLogger(delimiter=" ") 73 | metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}")) 74 | metric_logger.add_meter( 75 | "class_error", utils.SmoothedValue(window_size=1, fmt="{value:.2f}") 76 | ) 77 | metric_logger.add_meter( 78 | "grad_norm", utils.SmoothedValue(window_size=1, fmt="{value:.2f}") 79 | ) 80 | header = "Epoch: [{}]".format(epoch) 81 | print_freq = 10 82 | 83 | prefetcher = data_prefetcher(data_loader, device, prefetch=True) 84 | samples, targets = prefetcher.next() 85 | 86 | # for samples, targets in metric_logger.log_every(data_loader, print_freq, header): 87 | for _ in metric_logger.log_every(range(len(data_loader)), print_freq, header): 88 | with torch.cuda.amp.autocast() if use_fp16 else torch.cuda.amp.autocast( 89 | enabled=False 90 | ): 91 | if use_fp16: 92 | optimizer.zero_grad() 93 | outputs = model(samples) 94 | 95 | if k_one2many > 0: 96 | loss_dict = train_hybrid( 97 | outputs, targets, k_one2many, criterion, lambda_one2many 98 | ) 99 | else: 100 | loss_dict = criterion(outputs, targets) 101 | weight_dict = criterion.weight_dict 102 | losses = sum( 103 | loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict 104 | ) 105 | 106 | # reduce losses over all GPUs for logging purposes 107 | loss_dict_reduced = utils.reduce_dict(loss_dict) 108 | loss_dict_reduced_unscaled = { 109 | f"{k}_unscaled": v for k, v in loss_dict_reduced.items() 110 | } 111 | loss_dict_reduced_scaled = { 112 | k: v * weight_dict[k] 113 | for k, v in loss_dict_reduced.items() 114 | if k in weight_dict 115 | } 116 | losses_reduced_scaled = 
sum(loss_dict_reduced_scaled.values()) 117 | 118 | loss_value = losses_reduced_scaled.item() 119 | 120 | if not math.isfinite(loss_value): 121 | print("Loss is {}, stopping training".format(loss_value)) 122 | print(loss_dict_reduced) 123 | sys.exit(1) 124 | 125 | if use_fp16: 126 | scaler.scale(losses).backward() 127 | scaler.unscale_(optimizer) 128 | else: 129 | optimizer.zero_grad() 130 | losses.backward() 131 | if max_norm > 0: 132 | grad_total_norm = torch.nn.utils.clip_grad_norm_( 133 | model.parameters(), max_norm 134 | ) 135 | else: 136 | grad_total_norm = utils.get_total_grad_norm(model.parameters(), max_norm) 137 | 138 | if use_fp16: 139 | scaler.step(optimizer) 140 | scaler.update() 141 | else: 142 | optimizer.step() 143 | 144 | metric_logger.update( 145 | loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled 146 | ) 147 | metric_logger.update(class_error=loss_dict_reduced["class_error"]) 148 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 149 | metric_logger.update(grad_norm=grad_total_norm) 150 | 151 | samples, targets = prefetcher.next() 152 | 153 | if use_wandb: 154 | try: 155 | wandb.log(loss_dict) 156 | except Exception: 157 | pass 158 | # gather the stats from all processes 159 | metric_logger.synchronize_between_processes() 160 | print("Averaged stats:", metric_logger) 161 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} 162 | 163 | 164 | @torch.no_grad() 165 | def evaluate( 166 | model, 167 | criterion, 168 | postprocessors, 169 | data_loader, 170 | base_ds, 171 | device, 172 | output_dir, 173 | use_wandb=False, 174 | ): 175 | # disable the one-to-many branch queries 176 | # save them first 177 | save_num_queries = model.module.num_queries 178 | save_two_stage_num_proposals = model.module.transformer.two_stage_num_proposals 179 | model.module.num_queries = model.module.num_queries_one2one 180 | model.module.transformer.two_stage_num_proposals = model.module.num_queries 181 | 182 | model.eval() 183 | criterion.eval() 184 | 185 | metric_logger = utils.MetricLogger(delimiter=" ") 186 | metric_logger.add_meter( 187 | "class_error", utils.SmoothedValue(window_size=1, fmt="{value:.2f}") 188 | ) 189 | header = "Test:" 190 | 191 | iou_types = tuple(k for k in ("segm", "bbox") if k in postprocessors.keys()) 192 | coco_evaluator = CocoEvaluator(base_ds, iou_types) 193 | # coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75] 194 | 195 | panoptic_evaluator = None 196 | if "panoptic" in postprocessors.keys(): 197 | panoptic_evaluator = PanopticEvaluator( 198 | data_loader.dataset.ann_file, 199 | data_loader.dataset.ann_folder, 200 | output_dir=os.path.join(output_dir, "panoptic_eval"), 201 | ) 202 | 203 | for samples, targets in metric_logger.log_every(data_loader, 10, header): 204 | samples = samples.to(device) 205 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 206 | 207 | outputs = model(samples) 208 | loss_dict = criterion(outputs, targets) 209 | weight_dict = criterion.weight_dict 210 | 211 | # reduce losses over all GPUs for logging purposes 212 | loss_dict_reduced = utils.reduce_dict(loss_dict) 213 | loss_dict_reduced_scaled = { 214 | k: v * weight_dict[k] 215 | for k, v in loss_dict_reduced.items() 216 | if k in weight_dict 217 | } 218 | loss_dict_reduced_unscaled = { 219 | f"{k}_unscaled": v for k, v in loss_dict_reduced.items() 220 | } 221 | metric_logger.update( 222 | loss=sum(loss_dict_reduced_scaled.values()), 223 | **loss_dict_reduced_scaled, 224 | **loss_dict_reduced_unscaled, 
225 | ) 226 | metric_logger.update(class_error=loss_dict_reduced["class_error"]) 227 | 228 | orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) 229 | results = postprocessors["bbox"](outputs, orig_target_sizes) 230 | if "segm" in postprocessors.keys(): 231 | target_sizes = torch.stack([t["size"] for t in targets], dim=0) 232 | results = postprocessors["segm"]( 233 | results, outputs, orig_target_sizes, target_sizes 234 | ) 235 | res = { 236 | target["image_id"].item(): output 237 | for target, output in zip(targets, results) 238 | } 239 | if coco_evaluator is not None: 240 | coco_evaluator.update(res) 241 | 242 | if panoptic_evaluator is not None: 243 | res_pano = postprocessors["panoptic"]( 244 | outputs, target_sizes, orig_target_sizes 245 | ) 246 | for i, target in enumerate(targets): 247 | image_id = target["image_id"].item() 248 | file_name = f"{image_id:012d}.png" 249 | res_pano[i]["image_id"] = image_id 250 | res_pano[i]["file_name"] = file_name 251 | 252 | panoptic_evaluator.update(res_pano) 253 | 254 | # gather the stats from all processes 255 | metric_logger.synchronize_between_processes() 256 | print("Averaged stats:", metric_logger) 257 | if coco_evaluator is not None: 258 | coco_evaluator.synchronize_between_processes() 259 | if panoptic_evaluator is not None: 260 | panoptic_evaluator.synchronize_between_processes() 261 | 262 | # accumulate predictions from all images 263 | if coco_evaluator is not None: 264 | coco_evaluator.accumulate() 265 | coco_evaluator.summarize() 266 | panoptic_res = None 267 | if panoptic_evaluator is not None: 268 | panoptic_res = panoptic_evaluator.summarize() 269 | stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} 270 | if coco_evaluator is not None: 271 | if "bbox" in postprocessors.keys(): 272 | stats["coco_eval_bbox"] = coco_evaluator.coco_eval["bbox"].stats.tolist() 273 | if "segm" in postprocessors.keys(): 274 | stats["coco_eval_masks"] = coco_evaluator.coco_eval["segm"].stats.tolist() 275 | if panoptic_res is not None: 276 | stats["PQ_all"] = panoptic_res["All"] 277 | stats["PQ_th"] = panoptic_res["Things"] 278 | stats["PQ_st"] = panoptic_res["Stuff"] 279 | if use_wandb: 280 | try: 281 | wandb.log({"AP": stats["coco_eval_bbox"][0]}) 282 | wandb.log(stats) 283 | except Exception: 284 | pass 285 | 286 | # recover the model parameters for the next training epoch 287 | model.module.num_queries = save_num_queries 288 | model.module.transformer.two_stage_num_proposals = save_two_stage_num_proposals 289 | return stats, coco_evaluator 290 | -------------------------------------------------------------------------------- /models/segmentation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | This file provides the definition of the convolutional heads used to predict masks, as well as the losses 12 | """ 13 | import io 14 | from collections import defaultdict 15 | 16 | import torch 17 | import torch.nn as nn 18 | import torch.nn.functional as F 19 | from PIL import Image 20 | 21 | import util.box_ops as box_ops 22 | from util.misc import NestedTensor, interpolate, nested_tensor_from_tensor_list 23 | 24 | try: 25 | from panopticapi.utils import id2rgb, rgb2id 26 | except ImportError: 27 | pass 28 | 29 | 30 | class DETRsegm(nn.Module): 31 | def __init__(self, detr, freeze_detr=False): 32 | super().__init__() 33 | self.detr = detr 34 | 35 | if freeze_detr: 36 | for p in self.parameters(): 37 | p.requires_grad_(False) 38 | 39 | hidden_dim, nheads = detr.transformer.d_model, detr.transformer.nhead 40 | self.bbox_attention = MHAttentionMap(hidden_dim, hidden_dim, nheads, dropout=0) 41 | self.mask_head = MaskHeadSmallConv( 42 | hidden_dim + nheads, [1024, 512, 256], hidden_dim 43 | ) 44 | 45 | def forward(self, samples: NestedTensor): 46 | if not isinstance(samples, NestedTensor): 47 | samples = nested_tensor_from_tensor_list(samples) 48 | features, pos = self.detr.backbone(samples) 49 | 50 | bs = features[-1].tensors.shape[0] 51 | 52 | src, mask = features[-1].decompose() 53 | src_proj = self.detr.input_proj(src) 54 | hs, memory = self.detr.transformer( 55 | src_proj, mask, self.detr.query_embed.weight, pos[-1] 56 | ) 57 | 58 | outputs_class = self.detr.class_embed(hs) 59 | outputs_coord = self.detr.bbox_embed(hs).sigmoid() 60 | out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]} 61 | if self.detr.aux_loss: 62 | out["aux_outputs"] = [ 63 | {"pred_logits": a, "pred_boxes": b} 64 | for a, b in zip(outputs_class[:-1], outputs_coord[:-1]) 65 | ] 66 | 67 | # FIXME h_boxes takes the last one computed, keep this in mind 68 | bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask) 69 | 70 | seg_masks = self.mask_head( 71 | src_proj, 72 | bbox_mask, 73 | [features[2].tensors, features[1].tensors, features[0].tensors], 74 | ) 75 | outputs_seg_masks = seg_masks.view( 76 | bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1] 77 | ) 78 | 79 | out["pred_masks"] = outputs_seg_masks 80 | return out 81 | 82 | 83 | class MaskHeadSmallConv(nn.Module): 84 | """ 85 | Simple convolutional head, using group norm. 
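The head takes the projected backbone features concatenated with the per-query attention maps, then progressively reduces the channel width while mixing in adapted FPN features at each scale.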
86 | Upsampling is done using an FPN approach 87 | """ 88 | 89 | def __init__(self, dim, fpn_dims, context_dim): 90 | super().__init__() 91 | 92 | inter_dims = [ 93 | dim, 94 | context_dim // 2, 95 | context_dim // 4, 96 | context_dim // 8, 97 | context_dim // 16, 98 | context_dim // 64, 99 | ] 100 | self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1) 101 | self.gn1 = torch.nn.GroupNorm(8, dim) 102 | self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1) 103 | self.gn2 = torch.nn.GroupNorm(8, inter_dims[1]) 104 | self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) 105 | self.gn3 = torch.nn.GroupNorm(8, inter_dims[2]) 106 | self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) 107 | self.gn4 = torch.nn.GroupNorm(8, inter_dims[3]) 108 | self.lay5 = torch.nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) 109 | self.gn5 = torch.nn.GroupNorm(8, inter_dims[4]) 110 | self.out_lay = torch.nn.Conv2d(inter_dims[4], 1, 3, padding=1) 111 | 112 | self.dim = dim 113 | 114 | self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1) 115 | self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1) 116 | self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1) 117 | 118 | for m in self.modules(): 119 | if isinstance(m, nn.Conv2d): 120 | nn.init.kaiming_uniform_(m.weight, a=1) 121 | nn.init.constant_(m.bias, 0) 122 | 123 | def forward(self, x, bbox_mask, fpns): 124 | def expand(tensor, length): 125 | return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) 126 | 127 | x = torch.cat([expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) 128 | 129 | x = self.lay1(x) 130 | x = self.gn1(x) 131 | x = F.relu(x) 132 | x = self.lay2(x) 133 | x = self.gn2(x) 134 | x = F.relu(x) 135 | 136 | cur_fpn = self.adapter1(fpns[0]) 137 | if cur_fpn.size(0) != x.size(0): 138 | cur_fpn = expand(cur_fpn, x.size(0) / cur_fpn.size(0)) 139 | x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") 140 | x = self.lay3(x) 141 | x = self.gn3(x) 142 | x = F.relu(x) 143 | 144 | cur_fpn = self.adapter2(fpns[1]) 145 | if cur_fpn.size(0) != x.size(0): 146 | cur_fpn = expand(cur_fpn, x.size(0) / cur_fpn.size(0)) 147 | x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") 148 | x = self.lay4(x) 149 | x = self.gn4(x) 150 | x = F.relu(x) 151 | 152 | cur_fpn = self.adapter3(fpns[2]) 153 | if cur_fpn.size(0) != x.size(0): 154 | cur_fpn = expand(cur_fpn, x.size(0) / cur_fpn.size(0)) 155 | x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") 156 | x = self.lay5(x) 157 | x = self.gn5(x) 158 | x = F.relu(x) 159 | 160 | x = self.out_lay(x) 161 | return x 162 | 163 | 164 | class MHAttentionMap(nn.Module): 165 | """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" 166 | 167 | def __init__(self, query_dim, hidden_dim, num_heads, dropout=0, bias=True): 168 | super().__init__() 169 | self.num_heads = num_heads 170 | self.hidden_dim = hidden_dim 171 | self.dropout = nn.Dropout(dropout) 172 | 173 | self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) 174 | self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) 175 | 176 | nn.init.zeros_(self.k_linear.bias) 177 | nn.init.zeros_(self.q_linear.bias) 178 | nn.init.xavier_uniform_(self.k_linear.weight) 179 | nn.init.xavier_uniform_(self.q_linear.weight) 180 | self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 181 | 182 | def forward(self, q, k, mask=None): 183 | q = self.q_linear(q) 184 | k = 
F.conv2d( 185 | k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias 186 | ) 187 | qh = q.view( 188 | q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads 189 | ) 190 | kh = k.view( 191 | k.shape[0], 192 | self.num_heads, 193 | self.hidden_dim // self.num_heads, 194 | k.shape[-2], 195 | k.shape[-1], 196 | ) 197 | weights = torch.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) 198 | 199 | if mask is not None: 200 | weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) 201 | weights = F.softmax(weights.flatten(2), dim=-1).view_as(weights) 202 | weights = self.dropout(weights) 203 | return weights 204 | 205 | 206 | def dice_loss(inputs, targets, num_boxes): 207 | """ 208 | Compute the DICE loss, similar to generalized IOU for masks 209 | Args: 210 | inputs: A float tensor of arbitrary shape. 211 | The predictions for each example. 212 | targets: A float tensor with the same shape as inputs. Stores the binary 213 | classification label for each element in inputs 214 | (0 for the negative class and 1 for the positive class). 215 | """ 216 | inputs = inputs.sigmoid() 217 | inputs = inputs.flatten(1) 218 | numerator = 2 * (inputs * targets).sum(1) 219 | denominator = inputs.sum(-1) + targets.sum(-1) 220 | loss = 1 - (numerator + 1) / (denominator + 1) 221 | return loss.sum() / num_boxes 222 | 223 | 224 | def sigmoid_focal_loss( 225 | inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2 226 | ): 227 | """ 228 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 229 | Args: 230 | inputs: A float tensor of arbitrary shape. 231 | The predictions for each example. 232 | targets: A float tensor with the same shape as inputs. Stores the binary 233 | classification label for each element in inputs 234 | (0 for the negative class and 1 for the positive class). 235 | alpha: (optional) Weighting factor in range (0,1) to balance 236 | positive vs negative examples. Default = 0.25 (a negative value disables weighting). 237 | gamma: Exponent of the modulating factor (1 - p_t) to 238 | balance easy vs hard examples. 
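For example, with gamma = 2 a well-classified element with p_t = 0.9 keeps only (1 - 0.9) ** 2 = 1% of its cross-entropy loss, which focuses training on hard examples.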
239 | Returns: 240 | Loss tensor 241 | """ 242 | prob = inputs.sigmoid() 243 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 244 | p_t = prob * targets + (1 - prob) * (1 - targets) 245 | loss = ce_loss * ((1 - p_t) ** gamma) 246 | 247 | if alpha >= 0: 248 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 249 | loss = alpha_t * loss 250 | 251 | return loss.mean(1).sum() / num_boxes 252 | 253 | 254 | class PostProcessSegm(nn.Module): 255 | def __init__(self, threshold=0.5): 256 | super().__init__() 257 | self.threshold = threshold 258 | 259 | @torch.no_grad() 260 | def forward(self, results, outputs, orig_target_sizes, max_target_sizes): 261 | assert len(orig_target_sizes) == len(max_target_sizes) 262 | max_h, max_w = max_target_sizes.max(0)[0].tolist() 263 | outputs_masks = outputs["pred_masks"].squeeze(2) 264 | outputs_masks = F.interpolate( 265 | outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False 266 | ) 267 | outputs_masks = (outputs_masks.sigmoid() > self.threshold).cpu() 268 | 269 | for i, (cur_mask, t, tt) in enumerate( 270 | zip(outputs_masks, max_target_sizes, orig_target_sizes) 271 | ): 272 | img_h, img_w = t[0], t[1] 273 | results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) 274 | results[i]["masks"] = F.interpolate( 275 | results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" 276 | ).byte() 277 | 278 | return results 279 | 280 | 281 | class PostProcessPanoptic(nn.Module): 282 | """This class converts the output of the model to the final panoptic result, in the format expected by the 283 | coco panoptic API """ 284 | 285 | def __init__(self, is_thing_map, threshold=0.85): 286 | """ 287 | Parameters: 288 | is_thing_map: This is a dict whose keys are the class ids, and the values a boolean indicating whether 289 | the class is a thing (True) or a stuff (False) class 290 | threshold: confidence threshold: segments with confidence lower than this will be deleted 291 | """ 292 | super().__init__() 293 | self.threshold = threshold 294 | self.is_thing_map = is_thing_map 295 | 296 | def forward(self, outputs, processed_sizes, target_sizes=None): 297 | """ This function computes the panoptic prediction from the model's predictions. 298 | Parameters: 299 | outputs: This is a dict coming directly from the model. See the model doc for the content. 300 | processed_sizes: This is a list of tuples (or torch tensors) of sizes of the images that were passed to the 301 | model, i.e. the size after data augmentation but before batching. 302 | target_sizes: This is a list of tuples (or torch tensors) corresponding to the requested final size 303 | of each prediction. 
If left to None, it will default to the processed_sizes 304 | """ 305 | if target_sizes is None: 306 | target_sizes = processed_sizes 307 | assert len(processed_sizes) == len(target_sizes) 308 | out_logits, raw_masks, raw_boxes = ( 309 | outputs["pred_logits"], 310 | outputs["pred_masks"], 311 | outputs["pred_boxes"], 312 | ) 313 | assert len(out_logits) == len(raw_masks) == len(target_sizes) 314 | preds = [] 315 | 316 | def to_tuple(tup): 317 | if isinstance(tup, tuple): 318 | return tup 319 | return tuple(tup.cpu().tolist()) 320 | 321 | for cur_logits, cur_masks, cur_boxes, size, target_size in zip( 322 | out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes 323 | ): 324 | # we filter empty queries and detections below the threshold 325 | scores, labels = cur_logits.softmax(-1).max(-1) 326 | keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & ( 327 | scores > self.threshold 328 | ) 329 | cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) 330 | cur_scores = cur_scores[keep] 331 | cur_classes = cur_classes[keep] 332 | cur_masks = cur_masks[keep] 333 | cur_masks = interpolate( 334 | cur_masks[None], to_tuple(size), mode="bilinear" 335 | ).squeeze(0) 336 | cur_boxes = box_ops.box_cxcywh_to_xyxy(cur_boxes[keep]) 337 | 338 | h, w = cur_masks.shape[-2:] 339 | assert len(cur_boxes) == len(cur_classes) 340 | 341 | # It may be that we have several predicted masks for the same stuff class. 342 | # In the following, we track the list of mask ids for each stuff class (they are merged later on) 343 | cur_masks = cur_masks.flatten(1) 344 | stuff_equiv_classes = defaultdict(lambda: []) 345 | for k, label in enumerate(cur_classes): 346 | if not self.is_thing_map[label.item()]: 347 | stuff_equiv_classes[label.item()].append(k) 348 | 349 | def get_ids_area(masks, scores, dedup=False): 350 | # This helper function creates the final panoptic segmentation image 351 | # It also returns the area of the masks that appear on the image 352 | 353 | m_id = masks.transpose(0, 1).softmax(-1) 354 | 355 | if m_id.shape[-1] == 0: 356 | # We didn't detect any mask :( 357 | m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) 358 | else: 359 | m_id = m_id.argmax(-1).view(h, w) 360 | 361 | if dedup: 362 | # Merge the masks corresponding to the same stuff class 363 | for equiv in stuff_equiv_classes.values(): 364 | if len(equiv) > 1: 365 | for eq_id in equiv: 366 | m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) 367 | 368 | final_h, final_w = to_tuple(target_size) 369 | 370 | seg_img = Image.fromarray(id2rgb(m_id.view(h, w).cpu().numpy())) 371 | seg_img = seg_img.resize( 372 | size=(final_w, final_h), resample=Image.NEAREST 373 | ) 374 | 375 | np_seg_img = ( 376 | torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) 377 | .view(final_h, final_w, 3) 378 | .numpy() 379 | ) 380 | m_id = torch.from_numpy(rgb2id(np_seg_img)) 381 | 382 | area = [] 383 | for i in range(len(scores)): 384 | area.append(m_id.eq(i).sum().item()) 385 | return area, seg_img 386 | 387 | area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) 388 | if cur_classes.numel() > 0: 389 | # We now filter empty masks as long as we find some 390 | while True: 391 | filtered_small = torch.as_tensor( 392 | [area[i] <= 4 for i, c in enumerate(cur_classes)], 393 | dtype=torch.bool, 394 | device=keep.device, 395 | ) 396 | if filtered_small.any().item(): 397 | cur_scores = cur_scores[~filtered_small] 398 | cur_classes = cur_classes[~filtered_small] 399 | cur_masks = cur_masks[~filtered_small] 400 | area, seg_img = 
get_ids_area(cur_masks, cur_scores) 401 | else: 402 | break 403 | 404 | else: 405 | cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device) 406 | 407 | segments_info = [] 408 | for i, a in enumerate(area): 409 | cat = cur_classes[i].item() 410 | segments_info.append( 411 | { 412 | "id": i, 413 | "isthing": self.is_thing_map[cat], 414 | "category_id": cat, 415 | "area": a, 416 | } 417 | ) 418 | del cur_classes 419 | 420 | with io.BytesIO() as out: 421 | seg_img.save(out, format="PNG") 422 | predictions = { 423 | "png_string": out.getvalue(), 424 | "segments_info": segments_info, 425 | } 426 | preds.append(predictions) 427 | return preds 428 | --------------------------------------------------------------------------------