├── requirements.txt ├── figs ├── architecture.png └── dam_creation.png ├── configs ├── r50_deformable_detr.sh ├── swint_deformable_detr.sh ├── r50_efficient_detr.sh ├── swint_efficient_detr.sh ├── r50_sparse_detr_rho_0.1.sh ├── r50_sparse_detr_rho_0.2.sh ├── r50_sparse_detr_rho_0.3.sh ├── swint_sparse_detr_rho_0.1.sh ├── swint_sparse_detr_rho_0.2.sh └── swint_sparse_detr_rho_0.3.sh ├── models ├── swin_transformer │ ├── configs │ │ ├── swin_large_patch4_window7_224.yaml │ │ ├── swin_tiny_patch4_window7_224.yaml │ │ ├── swin_base_patch4_window7_224.yaml │ │ ├── swin_small_patch4_window7_224.yaml │ │ └── default.yaml │ ├── __init__.py │ ├── build.py │ └── config.py ├── ops │ ├── make.sh │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ ├── modules │ │ ├── __init__.py │ │ └── ms_deform_attn.py │ ├── src │ │ ├── vision.cpp │ │ ├── cuda │ │ │ ├── ms_deform_attn_cuda.h │ │ │ └── ms_deform_attn_cuda.cu │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.h │ │ │ └── ms_deform_attn_cpu.cpp │ │ └── ms_deform_attn.h │ ├── setup.py │ └── test.py ├── __init__.py ├── position_encoding.py ├── matcher.py ├── backbone.py └── segmentation.py ├── datasets ├── torchvision_datasets │ ├── __init__.py │ └── coco.py ├── __init__.py ├── panoptic_eval.py ├── data_prefetcher.py ├── coco_panoptic.py ├── samplers.py ├── coco.py ├── transforms.py └── coco_eval.py ├── util ├── __init__.py ├── box_ops.py ├── dam.py ├── plot_utils.py └── benchmark.py ├── tools ├── run_dist_launch.sh └── launch.py ├── NOTICE ├── engine.py ├── README.md └── LICENSE /requirements.txt: -------------------------------------------------------------------------------- 1 | pycocotools 2 | tqdm 3 | scipy 4 | timm 5 | fvcore 6 | tensorboard 7 | -------------------------------------------------------------------------------- /figs/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakaobrain/sparse-detr/HEAD/figs/architecture.png -------------------------------------------------------------------------------- /figs/dam_creation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakaobrain/sparse-detr/HEAD/figs/dam_creation.png -------------------------------------------------------------------------------- /configs/r50_deformable_detr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/r50_deformable_detr 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | ${PY_ARGS} 11 | -------------------------------------------------------------------------------- /configs/swint_deformable_detr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/swint_deformable_detr 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --backbone swin-t \ 11 | ${PY_ARGS} 12 | -------------------------------------------------------------------------------- /models/swin_transformer/configs/swin_large_patch4_window7_224.yaml: -------------------------------------------------------------------------------- 1 | BASE: ['default.yaml'] 2 | MODEL: 3 | TYPE: swin 4 | NAME: swin_large_patch4_window7_224 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [ 2, 2, 18, 2 ] 8 | NUM_HEADS: [ 6, 12, 24, 48 ] 9 | WINDOW_SIZE: 7 10 | 
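Note: each Swin config above and below inherits shared settings from default.yaml through its BASE key. The following is a minimal sketch of how such a config can be resolved, assuming PyYAML is installed; it mirrors the load_config_yaml/_update_dict helpers that appear later in models/swin_transformer/build.py, and the resolve_config/_merge names here are illustrative only.

# Minimal sketch (assumes PyYAML); mirrors load_config_yaml/_update_dict in models/swin_transformer/build.py.
from collections import OrderedDict, abc
import os
import yaml

def _merge(dst, src):
    # Recursively overlay src onto dst so that child configs override BASE defaults.
    for k, v in src.items():
        dst[k] = _merge(dst.get(k, {}), v) if isinstance(v, abc.Mapping) else v
    return dst

def resolve_config(cfg_file, cfg=None):
    cfg = OrderedDict() if cfg is None else cfg
    with open(cfg_file) as f:
        src = yaml.load(f, Loader=yaml.FullLoader)
    # Resolve parent configs listed under BASE first, relative to this file's directory.
    for base in src.setdefault('BASE', ['']):
        if base:
            resolve_config(os.path.join(os.path.dirname(cfg_file), base), cfg)
    return _merge(cfg, src)

# Illustrative usage:
# cfg = resolve_config('models/swin_transformer/configs/swin_large_patch4_window7_224.yaml')
# cfg['MODEL']['SWIN']['EMBED_DIM']  # -> 192; keys not set here fall back to default.yaml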
-------------------------------------------------------------------------------- /models/swin_transformer/configs/swin_tiny_patch4_window7_224.yaml: -------------------------------------------------------------------------------- 1 | BASE: ['default.yaml'] 2 | MODEL: 3 | TYPE: swin 4 | NAME: swin_tiny_patch4_window7_224 5 | DROP_PATH_RATE: 0.2 6 | SWIN: 7 | EMBED_DIM: 96 8 | DEPTHS: [ 2, 2, 6, 2 ] 9 | NUM_HEADS: [ 3, 6, 12, 24 ] 10 | WINDOW_SIZE: 7 11 | -------------------------------------------------------------------------------- /models/swin_transformer/configs/swin_base_patch4_window7_224.yaml: -------------------------------------------------------------------------------- 1 | BASE: ['default.yaml'] 2 | MODEL: 3 | TYPE: swin 4 | NAME: swin_base_patch4_window7_224 5 | DROP_PATH_RATE: 0.5 6 | SWIN: 7 | EMBED_DIM: 128 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 4, 8, 16, 32 ] 10 | WINDOW_SIZE: 7 11 | -------------------------------------------------------------------------------- /models/swin_transformer/configs/swin_small_patch4_window7_224.yaml: -------------------------------------------------------------------------------- 1 | BASE: ['default.yaml'] 2 | MODEL: 3 | TYPE: swin 4 | NAME: swin_small_patch4_window7_224 5 | DROP_PATH_RATE: 0.3 6 | SWIN: 7 | EMBED_DIM: 96 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 3, 6, 12, 24 ] 10 | WINDOW_SIZE: 7 11 | -------------------------------------------------------------------------------- /configs/r50_efficient_detr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/r50_efficient_detr 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --eff_query_init \ 13 | --eff_specific_head \ 14 | ${PY_ARGS} 15 | -------------------------------------------------------------------------------- /configs/swint_efficient_detr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/swint_efficient_detr 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --backbone swin-t \ 11 | --with_box_refine \ 12 | --two_stage \ 13 | --eff_query_init \ 14 | --eff_specific_head \ 15 | ${PY_ARGS} 16 | -------------------------------------------------------------------------------- /configs/r50_sparse_detr_rho_0.1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/r50_sparse_detr_0.1 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --eff_query_init \ 13 | --eff_specific_head \ 14 | --rho 0.1 \ 15 | --use_enc_aux_loss \ 16 | ${PY_ARGS} 17 | -------------------------------------------------------------------------------- /configs/r50_sparse_detr_rho_0.2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/r50_sparse_detr_0.2 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --eff_query_init \ 13 | --eff_specific_head \ 14 | --rho 0.2 \ 15 | --use_enc_aux_loss \ 16 | ${PY_ARGS} 17 | -------------------------------------------------------------------------------- /configs/r50_sparse_detr_rho_0.3.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/r50_sparse_detr_0.3 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --eff_query_init \ 13 | --eff_specific_head \ 14 | --rho 0.3 \ 15 | --use_enc_aux_loss \ 16 | ${PY_ARGS} 17 | -------------------------------------------------------------------------------- /datasets/torchvision_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | from .coco import CocoDetection 8 | -------------------------------------------------------------------------------- /models/swin_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | 7 | 8 | from .build import build_model 9 | -------------------------------------------------------------------------------- /configs/swint_sparse_detr_rho_0.1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/swint_sparse_detr_0.1 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --backbone swin-t \ 11 | --with_box_refine \ 12 | --two_stage \ 13 | --eff_query_init \ 14 | --eff_specific_head \ 15 | --rho 0.1 \ 16 | --use_enc_aux_loss \ 17 | ${PY_ARGS} 18 | -------------------------------------------------------------------------------- /configs/swint_sparse_detr_rho_0.2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/swint_sparse_detr_0.2 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --backbone swin-t \ 11 | --with_box_refine \ 12 | --two_stage \ 13 | --eff_query_init \ 14 | --eff_specific_head \ 15 | --rho 0.2 \ 16 | --use_enc_aux_loss \ 17 | ${PY_ARGS} 18 | -------------------------------------------------------------------------------- /configs/swint_sparse_detr_rho_0.3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/swint_sparse_detr_0.3 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --backbone swin-t \ 11 | --with_box_refine \ 12 | --two_stage \ 13 | --eff_query_init \ 14 | --eff_specific_head \ 15 | --rho 0.3 \ 16 | --use_enc_aux_loss \ 17 | ${PY_ARGS} 18 | -------------------------------------------------------------------------------- /models/swin_transformer/configs/default.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_SIZE: 224 3 | TRAIN: 4 | USE_CHECKPOINT: false 5 | MODEL: 6 | SWIN: 7 | APE: false 8 | DEPTHS: [2, 2, 6, 2] 9 | EMBED_DIM: 96 10 | IN_CHANS: 3 11 | 
MLP_RATIO: 4.0 12 | NUM_HEADS: [3, 6, 12, 24] 13 | PATCH_NORM: true 14 | PATCH_SIZE: 4 15 | QKV_BIAS: true 16 | QK_SCALE: null 17 | WINDOW_SIZE: 7 18 | DROP_RATE: 0.0 19 | DROP_PATH_RATE: 0.1 20 | NUM_CLASSES: 1000 21 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | -------------------------------------------------------------------------------- /models/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | python setup.py build install 11 | -------------------------------------------------------------------------------- /models/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | -------------------------------------------------------------------------------- /models/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Deformable DETR 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
8 | # ------------------------------------------------------------------------------------------------ 9 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 10 | # ------------------------------------------------------------------------------------------------ 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /models/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------ 9 | # Modified from DETR (https://github.com/facebookresearch/detr) 10 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 11 | # ------------------------------------------------------------------------------------ 12 | 13 | 14 | from .deformable_detr import build 15 | 16 | 17 | def build_model(args): 18 | return build(args) 19 | 20 | -------------------------------------------------------------------------------- /tools/run_dist_launch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | 8 | set -x 9 | 10 | GPUS=$1 11 | RUN_COMMAND=${@:2} 12 | if [ $GPUS -lt 8 ]; then 13 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 14 | else 15 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 16 | fi 17 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 18 | MASTER_PORT=${MASTER_PORT:-"29500"} 19 | NODE_RANK=${NODE_RANK:-0} 20 | 21 | let "NNODES=GPUS/GPUS_PER_NODE" 22 | 23 | python ./tools/launch.py \ 24 | --nnodes ${NNODES} \ 25 | --node_rank ${NODE_RANK} \ 26 | --master_addr ${MASTER_ADDR} \ 27 | --master_port ${MASTER_PORT} \ 28 | --nproc_per_node ${GPUS_PER_NODE} \ 29 | ${RUN_COMMAND} -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector<at::Tensor> 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | 13 | #include <ATen/ATen.h> 14 | #include <ATen/cuda/CUDAContext.h> 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implemented on the CPU"); 27 | } 28 | 29 | std::vector<at::Tensor> 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates.
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import torch.utils.data 11 | from .torchvision_datasets import CocoDetection 12 | 13 | from .coco import build as build_coco 14 | 15 | 16 | def get_coco_api_from_dataset(dataset): 17 | for _ in range(10): 18 | # if isinstance(dataset, torchvision.datasets.CocoDetection): 19 | # break 20 | if isinstance(dataset, torch.utils.data.Subset): 21 | dataset = dataset.dataset 22 | if isinstance(dataset, CocoDetection): 23 | return dataset.coco 24 | 25 | 26 | def build_dataset(image_set, args): 27 | if args.dataset_file == 'coco': 28 | return build_coco(image_set, args) 29 | if args.dataset_file == 'coco_panoptic': 30 | # to avoid making panopticapi required for coco 31 | from .coco_panoptic import build as build_coco_panoptic 32 | return build_coco_panoptic(image_set, args) 33 | raise ValueError(f'dataset {args.dataset_file} not supported') 34 | -------------------------------------------------------------------------------- /models/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector<at::Tensor> 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /datasets/panoptic_eval.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import json 11 | import os 12 | 13 | import util.misc as utils 14 | 15 | try: 16 | from panopticapi.evaluation import pq_compute 17 | except ImportError: 18 | pass 19 | 20 | 21 | class PanopticEvaluator(object): 22 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): 23 | self.gt_json = ann_file 24 | self.gt_folder = ann_folder 25 | if utils.is_main_process(): 26 | if not os.path.exists(output_dir): 27 | os.mkdir(output_dir) 28 | self.output_dir = output_dir 29 | self.predictions = [] 30 | 31 | def update(self, predictions): 32 | for p in predictions: 33 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: 34 | f.write(p.pop("png_string")) 35 | 36 | self.predictions += predictions 37 | 38 | def synchronize_between_processes(self): 39 | all_predictions = utils.all_gather(self.predictions) 40 | merged_predictions = [] 41 | for p in all_predictions: 42 | merged_predictions += p 43 | self.predictions = merged_predictions 44 | 45 | def summarize(self): 46 | if utils.is_main_process(): 47 | json_data = {"annotations": self.predictions} 48 | predictions_json = os.path.join(self.output_dir, "predictions.json") 49 | with open(predictions_json, "w") as f: 50 | f.write(json.dumps(json_data)) 51 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) 52 | return None 53 | -------------------------------------------------------------------------------- /models/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('Cuda is not availabel') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /datasets/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | import torch 8 | 9 | def to_cuda(samples, targets, device): 10 | samples = samples.to(device, non_blocking=True) 11 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 12 | return samples, targets 13 | 14 | class data_prefetcher(): 15 | def __init__(self, loader, device, prefetch=True): 16 | self.loader = iter(loader) 17 | self.prefetch = prefetch 18 | self.device = device 19 | if prefetch: 20 | self.stream = torch.cuda.Stream() 21 | self.preload() 22 | 23 | def preload(self): 24 | try: 25 | self.next_samples, self.next_targets = next(self.loader) 26 | except StopIteration: 27 | self.next_samples = None 28 | self.next_targets = None 29 | return 30 | # if record_stream() doesn't work, another option is to make sure device inputs are created 31 | # on the main stream. 32 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 33 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 34 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 35 | # at the time we start copying to next_*: 36 | # self.stream.wait_stream(torch.cuda.current_stream()) 37 | with torch.cuda.stream(self.stream): 38 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 39 | # more code for the alternative if record_stream() doesn't work: 40 | # copy_ will record the use of the pinned source tensor in this side stream. 41 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 42 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 43 | # self.next_input = self.next_input_gpu 44 | # self.next_target = self.next_target_gpu 45 | 46 | # With Amp, it isn't necessary to manually convert data to half. 47 | # if args.fp16: 48 | # self.next_input = self.next_input.half() 49 | # else: 50 | 51 | def next(self): 52 | if self.prefetch: 53 | torch.cuda.current_stream().wait_stream(self.stream) 54 | samples = self.next_samples 55 | targets = self.next_targets 56 | if samples is not None: 57 | samples.record_stream(torch.cuda.current_stream()) 58 | if targets is not None: 59 | for t in targets: 60 | for k, v in t.items(): 61 | v.record_stream(torch.cuda.current_stream()) 62 | self.preload() 63 | else: 64 | try: 65 | samples, targets = next(self.loader) 66 | samples, targets = to_cuda(samples, targets, self.device) 67 | except StopIteration: 68 | samples = None 69 | targets = None 70 | return samples, targets 71 | -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Utilities for bounding box manipulation and GIoU. 
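Boxes are handled as (center_x, center_y, width, height) or (x0, y0, x1, y1) tensors; the helpers below convert between the two formats and compute pairwise IoU and generalized IoU.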
12 | """ 13 | import torch 14 | from torchvision.ops.boxes import box_area 15 | 16 | 17 | def box_cxcywh_to_xyxy(x): 18 | x_c, y_c, w, h = x.unbind(-1) 19 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 20 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 21 | return torch.stack(b, dim=-1) 22 | 23 | 24 | def box_xyxy_to_cxcywh(x): 25 | x0, y0, x1, y1 = x.unbind(-1) 26 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 27 | (x1 - x0), (y1 - y0)] 28 | return torch.stack(b, dim=-1) 29 | 30 | 31 | # modified from torchvision to also return the union 32 | def box_iou(boxes1, boxes2): 33 | area1 = box_area(boxes1) 34 | area2 = box_area(boxes2) 35 | 36 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 37 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 38 | 39 | wh = (rb - lt).clamp(min=0) # [N,M,2] 40 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 41 | 42 | union = area1[:, None] + area2 - inter 43 | 44 | iou = inter / union 45 | return iou, union 46 | 47 | 48 | def generalized_box_iou(boxes1, boxes2): 49 | """ 50 | Generalized IoU from https://giou.stanford.edu/ 51 | 52 | The boxes should be in [x0, y0, x1, y1] format 53 | 54 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 55 | and M = len(boxes2) 56 | """ 57 | # degenerate boxes gives inf / nan results 58 | # so do an early check 59 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 60 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 61 | iou, union = box_iou(boxes1, boxes2) 62 | 63 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 64 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 65 | 66 | wh = (rb - lt).clamp(min=0) # [N,M,2] 67 | area = wh[:, :, 0] * wh[:, :, 1] 68 | 69 | return iou - (area - union) / area 70 | 71 | 72 | def masks_to_boxes(masks): 73 | """Compute the bounding boxes around the provided masks 74 | 75 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 76 | 77 | Returns a [N, 4] tensors, with the boxes in xyxy format 78 | """ 79 | if masks.numel() == 0: 80 | return torch.zeros((0, 4), device=masks.device) 81 | 82 | h, w = masks.shape[-2:] 83 | 84 | y = torch.arange(0, h, dtype=torch.float) 85 | x = torch.arange(0, w, dtype=torch.float) 86 | y, x = torch.meshgrid(y, x) 87 | 88 | x_mask = (masks * x.unsqueeze(0)) 89 | x_max = x_mask.flatten(1).max(-1)[0] 90 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 91 | 92 | y_mask = (masks * y.unsqueeze(0)) 93 | y_max = y_mask.flatten(1).max(-1)[0] 94 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 95 | 96 | return torch.stack([x_min, y_min, x_max, y_max], 1) 97 | -------------------------------------------------------------------------------- /models/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | =============================================================================== 2 | Deformable DETR's Apache License 2.0 3 | =============================================================================== 4 | The overall structure of 
the code is based on the implementation in 5 | Deformable-DETR(https://github.com/fundamentalvision/Deformable-DETR). 6 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 7 | Copyright (c) 2020 SenseTime 8 | 9 | Licensed under the Apache License, Version 2.0 (the "License"); 10 | you may not use this file except in compliance with the License. 11 | You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | See the License for the specific language governing permissions and 19 | limitations under the License. 20 | 21 | =============================================================================== 22 | DETR's Apache License 2.0 23 | =============================================================================== 24 | Deformable DETR code is orginally built on the implementation in DETR 25 | (https://github.com/facebookresearch/detr). 26 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 27 | Copyright (c) 2020 Facebook, Inc 28 | 29 | Licensed under the Apache License, Version 2.0 (the "License"); 30 | you may not use this file except in compliance with the License. 31 | You may obtain a copy of the License at 32 | 33 | http://www.apache.org/licenses/LICENSE-2.0 34 | 35 | Unless required by applicable law or agreed to in writing, software 36 | distributed under the License is distributed on an "AS IS" BASIS, 37 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 38 | See the License for the specific language governing permissions and 39 | limitations under the License. 40 | 41 | 42 | =============================================================================== 43 | Swin Transformer' MIT License 44 | =============================================================================== 45 | The transformer backbone is based on the implementation in Swin Transformer 46 | (https://github.com/microsoft/Swin-Transformer). 47 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 48 | Copyright (c) 2021 Microsoft 49 | 50 | Permission is hereby granted, free of charge, to any person obtaining a copy 51 | of this software and associated documentation files (the "Software"), to deal 52 | in the Software without restriction, including without limitation the rights 53 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 54 | copies of the Software, and to permit persons to whom the Software is 55 | furnished to do so, subject to the following conditions: 56 | 57 | The above copyright notice and this permission notice shall be included in all 58 | copies or substantial portions of the Software. 59 | 60 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 61 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 62 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 63 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 64 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 65 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 66 | SOFTWARE. 
67 | -------------------------------------------------------------------------------- /datasets/torchvision_datasets/coco.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from torchvision 7 | # ------------------------------------------------------------------------ 8 | 9 | """ 10 | Copy-Paste from torchvision, but add utility of caching images on memory 11 | """ 12 | from torchvision.datasets.vision import VisionDataset 13 | from PIL import Image 14 | import os 15 | import os.path 16 | import tqdm 17 | from io import BytesIO 18 | 19 | 20 | class CocoDetection(VisionDataset): 21 | """`MS Coco Detection `_ Dataset. 22 | Args: 23 | root (string): Root directory where images are downloaded to. 24 | annFile (string): Path to json annotation file. 25 | transform (callable, optional): A function/transform that takes in an PIL image 26 | and returns a transformed version. E.g, ``transforms.ToTensor`` 27 | target_transform (callable, optional): A function/transform that takes in the 28 | target and transforms it. 29 | transforms (callable, optional): A function/transform that takes input sample and its target as entry 30 | and returns a transformed version. 31 | """ 32 | 33 | def __init__(self, root, annFile, transform=None, target_transform=None, transforms=None, 34 | cache_mode=False, local_rank=0, local_size=1): 35 | super(CocoDetection, self).__init__(root, transforms, transform, target_transform) 36 | from pycocotools.coco import COCO 37 | self.coco = COCO(annFile) 38 | self.ids = list(sorted(self.coco.imgs.keys())) 39 | self.cache_mode = cache_mode 40 | self.local_rank = local_rank 41 | self.local_size = local_size 42 | if cache_mode: 43 | self.cache = {} 44 | self.cache_images() 45 | 46 | def cache_images(self): 47 | self.cache = {} 48 | for index, img_id in zip(tqdm.trange(len(self.ids)), self.ids): 49 | if index % self.local_size != self.local_rank: 50 | continue 51 | path = self.coco.loadImgs(img_id)[0]['file_name'] 52 | with open(os.path.join(self.root, path), 'rb') as f: 53 | self.cache[path] = f.read() 54 | 55 | def get_image(self, path): 56 | if self.cache_mode: 57 | if path not in self.cache.keys(): 58 | with open(os.path.join(self.root, path), 'rb') as f: 59 | self.cache[path] = f.read() 60 | return Image.open(BytesIO(self.cache[path])).convert('RGB') 61 | return Image.open(os.path.join(self.root, path)).convert('RGB') 62 | 63 | def __getitem__(self, index): 64 | """ 65 | Args: 66 | index (int): Index 67 | Returns: 68 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. 
69 | """ 70 | coco = self.coco 71 | img_id = self.ids[index] 72 | ann_ids = coco.getAnnIds(imgIds=img_id) 73 | target = coco.loadAnns(ann_ids) 74 | 75 | path = coco.loadImgs(img_id)[0]['file_name'] 76 | 77 | img = self.get_image(path) 78 | if self.transforms is not None: 79 | img, target = self.transforms(img, target) 80 | 81 | return img, target 82 | 83 | def __len__(self): 84 | return len(self.ids) 85 | -------------------------------------------------------------------------------- /models/swin_transformer/build.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------ 6 | 7 | 8 | from collections import abc, OrderedDict 9 | import os 10 | import yaml 11 | 12 | from .swin_transformer import SwinTransformer 13 | from .config import Config 14 | 15 | import torch 16 | 17 | 18 | CONFIG_MAP = { 19 | "swin-t": "models/swin_transformer/configs/swin_tiny_patch4_window7_224.yaml", 20 | "swin-s": "models/swin_transformer/configs/swin_small_patch4_window7_224.yaml", 21 | "swin-b": "models/swin_transformer/configs/swin_base_patch4_window7_224.yaml", 22 | "swin-l": "models/swin_transformer/configs/swin_large_patch4_window7_224.yaml", 23 | } 24 | 25 | 26 | CHECKPOINT_MAP = { 27 | "swin-t": "/data/public/rw/team-autolearn/pretrainedmodels/swin/swin_tiny_patch4_window7_224.pth", 28 | } 29 | 30 | 31 | def build_model(name, out_indices, frozen_stages, pretrained): 32 | config_file = CONFIG_MAP[name] 33 | config = load_config_yaml(config_file) 34 | config = Config(config) 35 | config.freeze() 36 | 37 | model_type = config.MODEL.TYPE 38 | if model_type == 'swin': 39 | model = SwinTransformer(pretrain_img_size=config.DATA.IMG_SIZE, 40 | patch_size=config.MODEL.SWIN.PATCH_SIZE, 41 | in_chans=config.MODEL.SWIN.IN_CHANS, 42 | embed_dim=config.MODEL.SWIN.EMBED_DIM, 43 | depths=config.MODEL.SWIN.DEPTHS, 44 | num_heads=config.MODEL.SWIN.NUM_HEADS, 45 | window_size=config.MODEL.SWIN.WINDOW_SIZE, 46 | mlp_ratio=config.MODEL.SWIN.MLP_RATIO, 47 | qkv_bias=config.MODEL.SWIN.QKV_BIAS, 48 | qk_scale=config.MODEL.SWIN.QK_SCALE, 49 | drop_rate=config.MODEL.DROP_RATE, 50 | drop_path_rate=config.MODEL.DROP_PATH_RATE, 51 | ape=config.MODEL.SWIN.APE, 52 | patch_norm=config.MODEL.SWIN.PATCH_NORM, 53 | use_checkpoint=config.TRAIN.USE_CHECKPOINT, 54 | out_indices=out_indices, 55 | frozen_stages=frozen_stages) 56 | else: 57 | raise NotImplementedError(f"Unkown model: {model_type}") 58 | 59 | if pretrained: 60 | ckpt_path = CHECKPOINT_MAP[name] 61 | state_dict = torch.load(ckpt_path) 62 | model.load_state_dict(state_dict['model'], strict=False) 63 | 64 | return model 65 | 66 | 67 | def _update_dict(tar, src): 68 | """recursive dict update.""" 69 | for k, v in src.items(): 70 | if isinstance(v, abc.Mapping): 71 | tar[k] = _update_dict(tar.get(k, {}), v) 72 | else: 73 | tar[k] = v 74 | return tar 75 | 76 | 77 | def load_config_yaml(cfg_file, config=None): 78 | if config is None: 79 | config = OrderedDict() 80 | 81 | with open(cfg_file, 'r') as f: 82 | config_src = yaml.load(f, Loader=yaml.FullLoader) 83 | 84 | for cfg in config_src.setdefault('BASE', ['']): 85 | if cfg: 86 | load_config_yaml( 87 | os.path.join(os.path.dirname(cfg_file), cfg), config 88 | ) 89 | print('=> merge config 
from {}'.format(cfg_file)) 90 | _update_dict(config, config_src) 91 | return config 92 | -------------------------------------------------------------------------------- /util/dam.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | 7 | 8 | from pathlib import Path 9 | 10 | import numpy as np 11 | import torch 12 | import torch.nn.functional as F 13 | 14 | import matplotlib.pyplot as plt 15 | import matplotlib.patches as patches 16 | 17 | from util.box_ops import box_cxcywh_to_xyxy 18 | from util.misc import unwrap 19 | 20 | 21 | def idx_to_flat_grid(spatial_shapes, idx): 22 | flat_grid_shape = (idx.shape[0], int(torch.sum(spatial_shapes[..., 0] * spatial_shapes[..., 1]))) 23 | flat_grid = torch.zeros(flat_grid_shape, device=idx.device, dtype=torch.float32) 24 | flat_grid.scatter_(1, idx.to(torch.int64), 1) 25 | 26 | return flat_grid 27 | 28 | 29 | def attn_map_to_flat_grid(spatial_shapes, level_start_index, sampling_locations, attention_weights): 30 | # sampling_locations: [N, n_layers, Len_q, n_heads, n_levels, n_points, 2] 31 | # attention_weights: [N, n_layers, Len_q, n_heads, n_levels, n_points] 32 | N, n_layers, _, n_heads, *_ = sampling_locations.shape 33 | sampling_locations = sampling_locations.permute(0, 1, 3, 2, 5, 4, 6).flatten(0, 2).flatten(1, 2) 34 | # [N * n_layers * n_heads, Len_q * n_points, n_levels, 2] 35 | attention_weights = attention_weights.permute(0, 1, 3, 2, 5, 4).flatten(0, 2).flatten(1, 2) 36 | # [N * n_layers * n_heads, Len_q * n_points, n_levels] 37 | 38 | rev_spatial_shapes = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], dim=-1) # hw -> wh (xy) 39 | col_row_float = sampling_locations * rev_spatial_shapes 40 | 41 | col_row_ll = col_row_float.floor().to(torch.int64) 42 | zero = torch.zeros(*col_row_ll.shape[:-1], dtype=torch.int64, device=col_row_ll.device) 43 | one = torch.ones(*col_row_ll.shape[:-1], dtype=torch.int64, device=col_row_ll.device) 44 | col_row_lh = col_row_ll + torch.stack([zero, one], dim=-1) 45 | col_row_hl = col_row_ll + torch.stack([one, zero], dim=-1) 46 | col_row_hh = col_row_ll + 1 47 | 48 | margin_ll = (col_row_float - col_row_ll).prod(dim=-1) 49 | margin_lh = -(col_row_float - col_row_lh).prod(dim=-1) 50 | margin_hl = -(col_row_float - col_row_hl).prod(dim=-1) 51 | margin_hh = (col_row_float - col_row_hh).prod(dim=-1) 52 | 53 | flat_grid_shape = (attention_weights.shape[0], int(torch.sum(spatial_shapes[..., 0] * spatial_shapes[..., 1]))) 54 | flat_grid = torch.zeros(flat_grid_shape, dtype=torch.float32, device=attention_weights.device) 55 | 56 | zipped = [(col_row_ll, margin_hh), (col_row_lh, margin_hl), (col_row_hl, margin_lh), (col_row_hh, margin_ll)] 57 | for col_row, margin in zipped: 58 | valid_mask = torch.logical_and( 59 | torch.logical_and(col_row[..., 0] >= 0, col_row[..., 0] < rev_spatial_shapes[..., 0]), 60 | torch.logical_and(col_row[..., 1] >= 0, col_row[..., 1] < rev_spatial_shapes[..., 1]), 61 | ) 62 | idx = col_row[..., 1] * spatial_shapes[..., 1] + col_row[..., 0] + level_start_index 63 | idx = (idx * valid_mask).flatten(1, 2) 64 | weights = (attention_weights * valid_mask * margin).flatten(1) 65 | flat_grid.scatter_add_(1, idx, weights) 
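# At this point each sampling location's attention weight has been accumulated bilinearly
# into its four neighbouring cells of the flattened multi-scale grid; the margin_* terms
# computed above act as the bilinear interpolation coefficients. The totals are reshaped
# per layer and head below.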
66 | 67 | return flat_grid.reshape(N, n_layers, n_heads, -1) 68 | 69 | 70 | def compute_corr(flat_grid_topk, flat_grid_attn_map, spatial_shapes): 71 | if len(flat_grid_topk.shape) == 1: 72 | flat_grid_topk = flat_grid_topk.unsqueeze(0) 73 | flat_grid_attn_map = flat_grid_attn_map.unsqueeze(0) 74 | 75 | tot = flat_grid_attn_map.sum(-1) 76 | hit = (flat_grid_topk * flat_grid_attn_map).sum(-1) 77 | 78 | corr = [hit / tot] 79 | flat_grid_idx = 0 80 | 81 | for shape in spatial_shapes: 82 | level_range = np.arange(int(flat_grid_idx), int(flat_grid_idx + shape[0] * shape[1])) 83 | tot = (flat_grid_attn_map[:, level_range]).sum(-1) 84 | hit = (flat_grid_topk[:, level_range] * flat_grid_attn_map[:, level_range]).sum(-1) 85 | flat_grid_idx += shape[0] * shape[1] 86 | corr.append(hit / tot) 87 | return corr 88 | 89 | -------------------------------------------------------------------------------- /models/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= 
attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------ 9 | # Modified from DETR (https://github.com/facebookresearch/detr) 10 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 11 | # ------------------------------------------------------------------------------------ 12 | 13 | 14 | """ 15 | Various positional encodings for the transformer. 16 | """ 17 | import math 18 | import torch 19 | from torch import nn 20 | 21 | from util.misc import NestedTensor 22 | 23 | 24 | class PositionEmbeddingSine(nn.Module): 25 | """ 26 | This is a more standard version of the position embedding, very similar to the one 27 | used by the Attention is all you need paper, generalized to work on images. 
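Each (optionally normalized) y/x coordinate is expanded into num_pos_feats channels of interleaved sine and cosine terms with geometrically increasing wavelengths, i.e. pos[2i] = sin(coord / temperature**(2i / num_pos_feats)) and pos[2i+1] = cos(coord / temperature**(2i / num_pos_feats)); the y and x encodings are then concatenated along the channel dimension.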
28 | """ 29 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 30 | super().__init__() 31 | self.num_pos_feats = num_pos_feats 32 | self.temperature = temperature 33 | self.normalize = normalize 34 | if scale is not None and normalize is False: 35 | raise ValueError("normalize should be True if scale is passed") 36 | if scale is None: 37 | scale = 2 * math.pi 38 | self.scale = scale 39 | 40 | def forward(self, tensor_list: NestedTensor): 41 | x = tensor_list.tensors 42 | mask = tensor_list.mask 43 | assert mask is not None 44 | not_mask = ~mask 45 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 46 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 47 | if self.normalize: 48 | eps = 1e-6 49 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 50 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 51 | 52 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 53 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 54 | 55 | pos_x = x_embed[:, :, :, None] / dim_t 56 | pos_y = y_embed[:, :, :, None] / dim_t 57 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 58 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 59 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 60 | return pos 61 | 62 | 63 | class PositionEmbeddingLearned(nn.Module): 64 | """ 65 | Absolute pos embedding, learned. 66 | """ 67 | def __init__(self, num_pos_feats=256): 68 | super().__init__() 69 | self.row_embed = nn.Embedding(50, num_pos_feats) 70 | self.col_embed = nn.Embedding(50, num_pos_feats) 71 | self.reset_parameters() 72 | 73 | def reset_parameters(self): 74 | nn.init.uniform_(self.row_embed.weight) 75 | nn.init.uniform_(self.col_embed.weight) 76 | 77 | def forward(self, tensor_list: NestedTensor): 78 | x = tensor_list.tensors 79 | h, w = x.shape[-2:] 80 | i = torch.arange(w, device=x.device) 81 | j = torch.arange(h, device=x.device) 82 | x_emb = self.col_embed(i) 83 | y_emb = self.row_embed(j) 84 | pos = torch.cat([ 85 | x_emb.unsqueeze(0).repeat(h, 1, 1), 86 | y_emb.unsqueeze(1).repeat(1, w, 1), 87 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 88 | return pos 89 | 90 | 91 | def build_position_encoding(args): 92 | N_steps = args.hidden_dim // 2 93 | if args.position_embedding in ('v2', 'sine'): 94 | # TODO find a better way of exposing other arguments 95 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 96 | elif args.position_embedding in ('v3', 'learned'): 97 | position_embedding = PositionEmbeddingLearned(N_steps) 98 | else: 99 | raise ValueError(f"not supported {args.position_embedding}") 100 | 101 | return position_embedding 102 | -------------------------------------------------------------------------------- /datasets/coco_panoptic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import json 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import torch 15 | from PIL import Image 16 | 17 | from panopticapi.utils import rgb2id 18 | from util.box_ops import masks_to_boxes 19 | 20 | from .coco import make_coco_transforms 21 | 22 | 23 | class CocoPanoptic: 24 | def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True): 25 | with open(ann_file, 'r') as f: 26 | self.coco = json.load(f) 27 | 28 | # sort 'images' field so that they are aligned with 'annotations' 29 | # i.e., in alphabetical order 30 | self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id']) 31 | # sanity check 32 | if "annotations" in self.coco: 33 | for img, ann in zip(self.coco['images'], self.coco['annotations']): 34 | assert img['file_name'][:-4] == ann['file_name'][:-4] 35 | 36 | self.img_folder = img_folder 37 | self.ann_folder = ann_folder 38 | self.ann_file = ann_file 39 | self.transforms = transforms 40 | self.return_masks = return_masks 41 | 42 | def __getitem__(self, idx): 43 | ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx] 44 | img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg') 45 | ann_path = Path(self.ann_folder) / ann_info['file_name'] 46 | 47 | img = Image.open(img_path).convert('RGB') 48 | w, h = img.size 49 | if "segments_info" in ann_info: 50 | masks = np.asarray(Image.open(ann_path), dtype=np.uint32) 51 | masks = rgb2id(masks) 52 | 53 | ids = np.array([ann['id'] for ann in ann_info['segments_info']]) 54 | masks = masks == ids[:, None, None] 55 | 56 | masks = torch.as_tensor(masks, dtype=torch.uint8) 57 | labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64) 58 | 59 | target = {} 60 | target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]]) 61 | if self.return_masks: 62 | target['masks'] = masks 63 | target['labels'] = labels 64 | 65 | target["boxes"] = masks_to_boxes(masks) 66 | 67 | target['size'] = torch.as_tensor([int(h), int(w)]) 68 | target['orig_size'] = torch.as_tensor([int(h), int(w)]) 69 | if "segments_info" in ann_info: 70 | for name in ['iscrowd', 'area']: 71 | target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']]) 72 | 73 | if self.transforms is not None: 74 | img, target = self.transforms(img, target) 75 | 76 | return img, target 77 | 78 | def __len__(self): 79 | return len(self.coco['images']) 80 | 81 | def get_height_and_width(self, idx): 82 | img_info = self.coco['images'][idx] 83 | height = img_info['height'] 84 | width = img_info['width'] 85 | return height, width 86 | 87 | 88 | def build(image_set, args): 89 | img_folder_root = Path(args.coco_path) 90 | ann_folder_root = Path(args.coco_panoptic_path) 91 | assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist' 92 | assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist' 93 | mode = 'panoptic' 94 | PATHS = { 95 | "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'), 96 | "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'), 97 | } 98 | 99 | img_folder, ann_file = PATHS[image_set] 100 | img_folder_path = img_folder_root / img_folder 101 | ann_folder = ann_folder_root / f'{mode}_{img_folder}' 102 | ann_file = ann_folder_root / ann_file 103 | 104 | dataset = 
CocoPanoptic(img_folder_path, ann_folder, ann_file, 105 | transforms=make_coco_transforms(image_set), return_masks=args.masks) 106 | 107 | return dataset 108 | -------------------------------------------------------------------------------- /util/plot_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Plotting utilities to visualize training logs. 12 | """ 13 | import torch 14 | import pandas as pd 15 | import seaborn as sns 16 | import matplotlib.pyplot as plt 17 | 18 | from pathlib import Path, PurePath 19 | 20 | 21 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): 22 | ''' 23 | Function to plot specific fields from training log(s). Plots both training and test results. 24 | 25 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file 26 | - fields = which results to plot from each log file - plots both training and test for each field. 27 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots 28 | - log_name = optional, name of log file if different than default 'log.txt'. 29 | 30 | :: Outputs - matplotlib plots of results in fields, color coded for each log file. 31 | - solid lines are training results, dashed lines are test results. 
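:: Example - plot_logs([Path('exps/r50_sparse_detr_0.1'), Path('exps/r50_deformable_detr')]), assuming each of those output directories contains a 'log.txt' written during training (the directory names here are only placeholders for whatever --output_dir values were used).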
32 | 33 | ''' 34 | func_name = "plot_utils.py::plot_logs" 35 | 36 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, 37 | # convert single Path to list to avoid 'not iterable' error 38 | 39 | if not isinstance(logs, list): 40 | if isinstance(logs, PurePath): 41 | logs = [logs] 42 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") 43 | else: 44 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ 45 | Expect list[Path] or single Path obj, received {type(logs)}") 46 | 47 | # verify valid dir(s) and that every item in list is Path object 48 | for i, dir in enumerate(logs): 49 | if not isinstance(dir, PurePath): 50 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") 51 | if dir.exists(): 52 | continue 53 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") 54 | 55 | # load log file(s) and plot 56 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] 57 | 58 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) 59 | 60 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): 61 | for j, field in enumerate(fields): 62 | if field == 'mAP': 63 | coco_eval = pd.DataFrame(pd.np.stack(df.test_coco_eval.dropna().values)[:, 1]).ewm(com=ewm_col).mean() 64 | axs[j].plot(coco_eval, c=color) 65 | else: 66 | df.interpolate().ewm(com=ewm_col).mean().plot( 67 | y=[f'train_{field}', f'test_{field}'], 68 | ax=axs[j], 69 | color=[color] * 2, 70 | style=['-', '--'] 71 | ) 72 | for ax, field in zip(axs, fields): 73 | ax.legend([Path(p).name for p in logs]) 74 | ax.set_title(field) 75 | 76 | 77 | def plot_precision_recall(files, naming_scheme='iter'): 78 | if naming_scheme == 'exp_id': 79 | # name becomes exp_id 80 | names = [f.parts[-3] for f in files] 81 | elif naming_scheme == 'iter': 82 | names = [f.stem for f in files] 83 | else: 84 | raise ValueError(f'not supported {naming_scheme}') 85 | fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) 86 | for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): 87 | data = torch.load(f) 88 | # precision is n_iou, n_points, n_cat, n_area, max_det 89 | precision = data['precision'] 90 | recall = data['params'].recThrs 91 | scores = data['scores'] 92 | # take precision for all classes, all areas and 100 detections 93 | precision = precision[0, :, :, 0, -1].mean(1) 94 | scores = scores[0, :, :, 0, -1].mean(1) 95 | prec = precision.mean() 96 | rec = data['recall'][0, :, 0, -1].mean() 97 | print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + 98 | f'score={scores.mean():0.3f}, ' + 99 | f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' 100 | ) 101 | axs[0].plot(recall, precision, c=color) 102 | axs[1].plot(recall, scores, c=color) 103 | 104 | axs[0].set_title('Precision / Recall') 105 | axs[0].legend(names) 106 | axs[1].set_title('Scores / Recall') 107 | axs[1].legend(names) 108 | return fig, axs 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /util/benchmark.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | from typing import Any, Counter, DefaultDict, Tuple, Dict, Optional 4 | import warnings 5 | 6 | import numpy as np 7 | import torch 8 | from torch import nn 9 | import tqdm 10 | 11 | from util.misc import nested_tensor_from_tensor_list 12 | from fvcore.nn import FlopCountAnalysis 13 | from 
fvcore.nn.jit_handles import Handle 14 | 15 | 16 | @torch.no_grad() 17 | def measure_average_inference_time(model, inputs, num_iters=100, warm_iters=5): 18 | ts = [] 19 | # note that warm-up iters. are excluded from the total iters. 20 | for iter_ in tqdm.tqdm(range(warm_iters + num_iters)): 21 | torch.cuda.synchronize() 22 | t_ = time.perf_counter() 23 | model(inputs) 24 | torch.cuda.synchronize() 25 | t = time.perf_counter() - t_ 26 | if iter_ >= warm_iters: 27 | ts.append(t) 28 | return sum(ts) / len(ts) 29 | 30 | 31 | def python_ops_mode_for_deform_attn(model, ops_mode): 32 | def change_ops_mode(module): 33 | if hasattr(module, "python_ops_for_test"): 34 | module.python_ops_for_test = ops_mode 35 | model.apply(change_ops_mode) 36 | 37 | 38 | @torch.no_grad() 39 | def compute_fps(model, dataset, num_iters=300, warm_iters=5, batch_size=4): 40 | print(f"computing fps.. (num_iters={num_iters}, batch_size={batch_size}) " 41 | f"warm_iters={warm_iters}, batch_size={batch_size}]") 42 | assert num_iters > 0 and warm_iters >= 0 and batch_size > 0 43 | model.cuda() 44 | model.eval() 45 | inputs = nested_tensor_from_tensor_list( 46 | [dataset.__getitem__(0)[0].cuda() for _ in range(batch_size)]) 47 | t = measure_average_inference_time(model, inputs, num_iters, warm_iters) 48 | model.train() 49 | print(f"FPS: {1.0 / t * batch_size}") 50 | return 1.0 / t * batch_size 51 | 52 | 53 | @torch.no_grad() 54 | def compute_gflops(model, dataset, approximated=True): 55 | print(f"computing flops.. (approximated={approximated})") 56 | model.eval() 57 | python_ops_mode_for_deform_attn(model, True) 58 | if approximated: 59 | # use just a single image to approximate the full compuation 60 | # the size of the image was found heuristically 61 | images = [torch.randn((3, 850, 1040))] 62 | else: 63 | # full computation: get the first 100 images of COCO val2017 64 | images = [] 65 | for idx in range(100): 66 | img, _ = dataset[idx] 67 | images.append(img) 68 | 69 | gflops_list = [] 70 | imsize_list = [] 71 | 72 | for img in tqdm.tqdm(images): 73 | inputs = [img.cuda()] 74 | with warnings.catch_warnings(): 75 | warnings.filterwarnings("ignore", category=RuntimeWarning) 76 | res = flop_count_without_warnings(model, (inputs,), )[0] 77 | gflops = sum(res.values()) 78 | gflops_list.append(gflops) 79 | imsize_list.append(list(img.shape)) 80 | 81 | if approximated: 82 | print(f"The image size used for approximation: [3, 850, 1040]") 83 | else: 84 | print("Average image size of first 100 image of COCO val2017 : " 85 | f"{np.array(imsize_list).mean(0)}") 86 | 87 | print(f"GFLOPs : {np.array(gflops_list).mean()}") 88 | model.train() 89 | python_ops_mode_for_deform_attn(model, False) 90 | return gflops 91 | 92 | 93 | def flop_count_without_warnings( 94 | 95 | model: nn.Module, 96 | inputs: Tuple[Any, ...], 97 | supported_ops: Optional[Dict[str, Handle]] = None, 98 | ) -> Tuple[DefaultDict[str, float], Counter[str]]: 99 | """copied and modified from fvcore.nn.flop_count.py 100 | 101 | Given a model and an input to the model, compute the per-operator Gflops 102 | of the given model. 103 | Args: 104 | model (nn.Module): The model to compute flop counts. 105 | inputs (tuple): Inputs that are passed to `model` to count flops. 106 | Inputs need to be in a tuple. 107 | supported_ops (dict(str,Callable) or None) : provide additional 108 | handlers for extra ops, or overwrite the existing handlers for 109 | convolution and matmul and einsum. 
The key is operator name and the value 110 | is a function that takes (inputs, outputs) of the op. We count 111 | one Multiply-Add as one FLOP. 112 | Returns: 113 | tuple[defaultdict, Counter]: A dictionary that records the number of 114 | gflops for each operation and a Counter that records the number of 115 | unsupported operations. 116 | """ 117 | if supported_ops is None: 118 | supported_ops = {} 119 | flop_counter = FlopCountAnalysis(model, inputs).set_op_handle(**supported_ops) 120 | flop_counter.unsupported_ops_warnings(False) 121 | flop_counter.uncalled_modules_warnings(False) 122 | flop_counter.tracer_warnings("no_tracer_warning") 123 | giga_flops = defaultdict(float) 124 | for op, flop in flop_counter.by_operator().items(): 125 | giga_flops[op] = flop / 1e9 126 | return giga_flops, flop_counter.unsupported_ops() 127 | -------------------------------------------------------------------------------- /models/matcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------ 9 | # Modified from DETR (https://github.com/facebookresearch/detr) 10 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 11 | # ------------------------------------------------------------------------------------ 12 | 13 | 14 | """ 15 | Modules to compute the matching cost and solve the corresponding LSAP. 16 | """ 17 | import torch 18 | from scipy.optimize import linear_sum_assignment 19 | from torch import nn 20 | 21 | from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou 22 | 23 | 24 | class HungarianMatcher(nn.Module): 25 | """This class computes an assignment between the targets and the predictions of the network 26 | 27 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 28 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 29 | while the others are un-matched (and thus treated as non-objects). 
30 | """ 31 | 32 | def __init__(self, 33 | cost_class: float = 1, 34 | cost_bbox: float = 1, 35 | cost_giou: float = 1): 36 | """Creates the matcher 37 | 38 | Params: 39 | cost_class: This is the relative weight of the classification error in the matching cost 40 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 41 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 42 | """ 43 | super().__init__() 44 | self.cost_class = cost_class 45 | self.cost_bbox = cost_bbox 46 | self.cost_giou = cost_giou 47 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" 48 | 49 | def forward(self, outputs, targets): 50 | """ Performs the matching 51 | 52 | Params: 53 | outputs: This is a dict that contains at least these entries: 54 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 55 | "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates 56 | 57 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 58 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth 59 | objects in the target) containing the class labels 60 | "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates 61 | 62 | Returns: 63 | A list of size batch_size, containing tuples of (index_i, index_j) where: 64 | - index_i is the indices of the selected predictions (in order) 65 | - index_j is the indices of the corresponding selected targets (in order) 66 | For each batch element, it holds: 67 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 68 | """ 69 | with torch.no_grad(): 70 | bs, num_queries = outputs["pred_logits"].shape[:2] 71 | 72 | # We flatten to compute the cost matrices in a batch 73 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() 74 | out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] 75 | 76 | # Also concat the target labels and boxes 77 | tgt_ids = torch.cat([v["labels"] for v in targets]) 78 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 79 | 80 | # Compute the classification cost. 
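# The cost mirrors the sigmoid focal loss used at training time: with focusing parameter gamma and balancing factor alpha, matching a prediction with class probability p to a target class costs alpha * (1 - p)**gamma * (-log(p)) - (1 - alpha) * p**gamma * (-log(1 - p)), so confident correct predictions receive a strongly negative (favourable) cost.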
81 | alpha = 0.25 82 | gamma = 2.0 83 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 84 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 85 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 86 | 87 | # Compute the L1 cost between boxes 88 | cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) 89 | 90 | # Compute the giou cost betwen boxes 91 | cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), 92 | box_cxcywh_to_xyxy(tgt_bbox)) 93 | 94 | # Final cost matrix 95 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 96 | C = C.view(bs, num_queries, -1).cpu() 97 | 98 | sizes = [len(v["boxes"]) for v in targets] 99 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 100 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor([_j % size for _j in j], dtype=torch.int64)) 101 | for (i, j), size in zip(indices, sizes)] 102 | 103 | 104 | def build_matcher(args): 105 | return HungarianMatcher(cost_class=args.set_cost_class, 106 | cost_bbox=args.set_cost_bbox, 107 | cost_giou=args.set_cost_giou) 108 | -------------------------------------------------------------------------------- /datasets/samplers.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from codes in torch.utils.data.distributed 7 | # ------------------------------------------------------------------------ 8 | 9 | import os 10 | import math 11 | import torch 12 | import torch.distributed as dist 13 | from torch.utils.data.sampler import Sampler 14 | 15 | 16 | class DistributedSampler(Sampler): 17 | """Sampler that restricts data loading to a subset of the dataset. 18 | It is especially useful in conjunction with 19 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 20 | process can pass a DistributedSampler instance as a DataLoader sampler, 21 | and load a subset of the original dataset that is exclusive to it. 22 | .. note:: 23 | Dataset is assumed to be of constant size. 24 | Arguments: 25 | dataset: Dataset used for sampling. 26 | num_replicas (optional): Number of processes participating in 27 | distributed training. 28 | rank (optional): Rank of the current process within num_replicas. 
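shuffle (optional): If True (default), indices are shuffled deterministically per epoch (controlled via set_epoch). After padding the index list to a multiple of num_replicas, each rank takes a contiguous block of num_samples indices starting at num_samples * rank.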
29 | """ 30 | 31 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 32 | if num_replicas is None: 33 | if not dist.is_available(): 34 | raise RuntimeError("Requires distributed package to be available") 35 | num_replicas = dist.get_world_size() 36 | if rank is None: 37 | if not dist.is_available(): 38 | raise RuntimeError("Requires distributed package to be available") 39 | rank = dist.get_rank() 40 | self.dataset = dataset 41 | self.num_replicas = num_replicas 42 | self.rank = rank 43 | self.epoch = 0 44 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 45 | self.total_size = self.num_samples * self.num_replicas 46 | self.shuffle = shuffle 47 | 48 | def __iter__(self): 49 | if self.shuffle: 50 | # deterministically shuffle based on epoch 51 | g = torch.Generator() 52 | g.manual_seed(self.epoch) 53 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 54 | else: 55 | indices = torch.arange(len(self.dataset)).tolist() 56 | 57 | # add extra samples to make it evenly divisible 58 | indices += indices[: (self.total_size - len(indices))] 59 | assert len(indices) == self.total_size 60 | 61 | # subsample 62 | offset = self.num_samples * self.rank 63 | indices = indices[offset : offset + self.num_samples] 64 | assert len(indices) == self.num_samples 65 | 66 | return iter(indices) 67 | 68 | def __len__(self): 69 | return self.num_samples 70 | 71 | def set_epoch(self, epoch): 72 | self.epoch = epoch 73 | 74 | 75 | class NodeDistributedSampler(Sampler): 76 | """Sampler that restricts data loading to a subset of the dataset. 77 | It is especially useful in conjunction with 78 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 79 | process can pass a DistributedSampler instance as a DataLoader sampler, 80 | and load a subset of the original dataset that is exclusive to it. 81 | .. note:: 82 | Dataset is assumed to be of constant size. 83 | Arguments: 84 | dataset: Dataset used for sampling. 85 | num_replicas (optional): Number of processes participating in 86 | distributed training. 87 | rank (optional): Rank of the current process within num_replicas. 
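local_rank (optional): Index of the current process within its node; defaults to the LOCAL_RANK environment variable.
local_size (optional): Number of processes per node; defaults to the LOCAL_SIZE environment variable.
shuffle (optional): If True (default), indices are shuffled deterministically per epoch (controlled via set_epoch).
Indices are first restricted to those with index % local_size == local_rank (the shard this process would hold when the dataset is cached per local rank), and that shard is then split evenly across nodes.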
88 | """ 89 | 90 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 91 | if num_replicas is None: 92 | if not dist.is_available(): 93 | raise RuntimeError("Requires distributed package to be available") 94 | num_replicas = dist.get_world_size() 95 | if rank is None: 96 | if not dist.is_available(): 97 | raise RuntimeError("Requires distributed package to be available") 98 | rank = dist.get_rank() 99 | if local_rank is None: 100 | local_rank = int(os.environ.get('LOCAL_RANK', 0)) 101 | if local_size is None: 102 | local_size = int(os.environ.get('LOCAL_SIZE', 1)) 103 | self.dataset = dataset 104 | self.shuffle = shuffle 105 | self.num_replicas = num_replicas 106 | self.num_parts = local_size 107 | self.rank = rank 108 | self.local_rank = local_rank 109 | self.epoch = 0 110 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 111 | self.total_size = self.num_samples * self.num_replicas 112 | 113 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts 114 | 115 | def __iter__(self): 116 | if self.shuffle: 117 | # deterministically shuffle based on epoch 118 | g = torch.Generator() 119 | g.manual_seed(self.epoch) 120 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 121 | else: 122 | indices = torch.arange(len(self.dataset)).tolist() 123 | indices = [i for i in indices if i % self.num_parts == self.local_rank] 124 | 125 | # add extra samples to make it evenly divisible 126 | indices += indices[:(self.total_size_parts - len(indices))] 127 | assert len(indices) == self.total_size_parts 128 | 129 | # subsample 130 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts] 131 | assert len(indices) == self.num_samples 132 | 133 | return iter(indices) 134 | 135 | def __len__(self): 136 | return self.num_samples 137 | 138 | def set_epoch(self, epoch): 139 | self.epoch = epoch 140 | -------------------------------------------------------------------------------- /models/swin_transformer/config.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------ 6 | 7 | 8 | import collections 9 | from collections import OrderedDict 10 | from copy import deepcopy 11 | import logging 12 | from os.path import basename, splitext 13 | from pprint import pformat 14 | from types import SimpleNamespace 15 | import yaml 16 | 17 | 18 | class Config(SimpleNamespace): 19 | """Dictionary-based but also dot-accessible configuration object, which will 20 | rescue you from the messy brackets and quotation marks while accessing 21 | nested dictionaries. 22 | 23 | As the usage example below, a value can be easily assigned to a new field 24 | with hierarchies by using Python's usual assignment syntax. Due to the side 25 | effects of this feature, it is safe that the user call '.freeze()' before 26 | using the Config instance as a fixed configuration. Otherwise, even when 27 | a wanted attribute is called with an incorrect name, AttributeError will be 28 | silently ignored and returns an empty config, which could be resulting in 29 | unwanted consequences. 
30 | 31 | Usage: 32 | >>> cfg = Config() 33 | >>> cfg.foo = 1 34 | >>> cfg.bar.baz = 2 35 | >>> cfg['bar']['baz'] == cfg.bar.baz 36 | True 37 | >>> cfg.pprint() 38 | --- 39 | foo: 1 40 | bar: 41 | baz: 2 42 | ... 43 | >>> cfg.freeze() 44 | >>> cfg.new = 3 45 | RuntimeError: Can't set new attribute after being freezed! 46 | 47 | """ 48 | def __init__(self, _dict=None, **kwargs): 49 | super().__init__(**kwargs) 50 | self._freezed = False 51 | self._order = list() 52 | if _dict is not None: 53 | self._set_with_nested_dict(_dict) 54 | 55 | def _set_with_nested_dict(self, _dict): 56 | for key, value in _dict.items(): 57 | if isinstance(value, dict): 58 | self.__setattr__(key, Config(value)) 59 | else: 60 | self.__setattr__(key, value) 61 | self._order.append(key) 62 | 63 | @property 64 | def freezed(self): 65 | return self._freezed 66 | 67 | @classmethod 68 | def from_yaml(cls, yaml_file): 69 | """Initialize configuration with a YAML file.""" 70 | return cls(OrderedDict(yaml.load(open(yaml_file, "r"), 71 | Loader=yaml.FullLoader))) 72 | 73 | def __repr__(self): 74 | return 'Config' + self.to_dict().__repr__() 75 | 76 | def __getitem__(self, item): 77 | return self.__getattr__(item) 78 | 79 | def __getattr__(self, item): 80 | try: 81 | return self.__getattribute__(item) 82 | except AttributeError as e: 83 | if self._freezed: 84 | raise AttributeError(f"Can't find the field: {item}") from e 85 | else: 86 | # if there's no attribute with the given name, 87 | # make new one and assign an empty config. 88 | self.__setattr__(item, Config()) 89 | return self.__getattribute__(item) 90 | 91 | def __setattr__(self, item, value): 92 | if item != '_freezed' and self.__dict__['_freezed']: 93 | raise RuntimeError("Can't set new attribute after being freezed!") 94 | super().__setattr__(item, value) 95 | 96 | def __bool__(self): 97 | return len([k for k in self.to_dict().keys() 98 | if not k.startswith('_')]) > 0 99 | 100 | def __len__(self): 101 | return len(self.to_dict()) 102 | 103 | def __getstate__(self): 104 | return self.to_dict() 105 | 106 | def __setstate__(self, state): 107 | self._set_with_nested_dict(state) 108 | 109 | def __contains__(self, item): 110 | return self.to_dict().__contains__(item) 111 | 112 | def __deepcopy__(self, memodict={}): 113 | return Config(_dict=deepcopy(self.to_dict())) 114 | 115 | def __iter__(self): 116 | # for iterable unpacking 117 | return self.to_dict().__iter__() 118 | 119 | def pformat(self): 120 | return yaml.dump(self.to_dict(), indent=4, sort_keys=False, 121 | explicit_start=True, explicit_end=True) 122 | 123 | def pprint(self): 124 | return print(self.pformat()) 125 | 126 | def freeze(self): 127 | self._freezed = True 128 | for value in self.__dict__.values(): 129 | if isinstance(value, Config): 130 | value.freeze() 131 | 132 | return self 133 | 134 | def defrost(self): 135 | self._freezed = False 136 | for value in self.__dict__.values(): 137 | if isinstance(value, Config): 138 | value.defrost() 139 | return self 140 | 141 | def get(self, *args, **kwargs): 142 | return self.to_dict().get(*args, **kwargs) 143 | 144 | def keys(self): 145 | return self.to_dict().keys() 146 | 147 | def values(self): 148 | return self.to_dict().values() 149 | 150 | def items(self): 151 | return self.to_dict().items() 152 | 153 | def clone(self): 154 | return self.__deepcopy__() 155 | 156 | def update(self, dict_, delimiter='/'): 157 | for k, v in dict_.items(): 158 | self._update(k, v, delimiter) 159 | 160 | def _update(self, key, value, delimiter='/'): 161 | obj = self 162 | keys = 
key.split(delimiter) 163 | for k in keys[:-1]: 164 | obj = obj.__getattr__(k) 165 | obj.__setattr__(keys[-1], value) 166 | 167 | def to_dict(self): 168 | out_dict = OrderedDict() 169 | for key, value in self.__dict__.items(): 170 | if isinstance(value, Config): 171 | out_dict[key] = value.to_dict() 172 | else: 173 | if not key.startswith('_'): 174 | out_dict[key] = value 175 | return dict(out_dict) 176 | -------------------------------------------------------------------------------- /datasets/coco.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | COCO dataset which returns image_id for evaluation. 12 | 13 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 14 | """ 15 | from pathlib import Path 16 | 17 | import torch 18 | import torch.utils.data 19 | from pycocotools import mask as coco_mask 20 | 21 | from .torchvision_datasets import CocoDetection as TvCocoDetection 22 | from util.misc import get_local_rank, get_local_size 23 | import datasets.transforms as T 24 | 25 | 26 | class CocoDetection(TvCocoDetection): 27 | def __init__(self, img_folder, ann_file, transforms, return_masks, cache_mode=False, local_rank=0, local_size=1): 28 | super(CocoDetection, self).__init__(img_folder, ann_file, 29 | cache_mode=cache_mode, local_rank=local_rank, local_size=local_size) 30 | self._transforms = transforms 31 | self.prepare = ConvertCocoPolysToMask(return_masks) 32 | 33 | def __getitem__(self, idx): 34 | img, target = super(CocoDetection, self).__getitem__(idx) 35 | image_id = self.ids[idx] 36 | target = {'image_id': image_id, 'annotations': target} 37 | img, target = self.prepare(img, target) 38 | if self._transforms is not None: 39 | img, target = self._transforms(img, target) 40 | return img, target 41 | 42 | 43 | def convert_coco_poly_to_mask(segmentations, height, width): 44 | masks = [] 45 | for polygons in segmentations: 46 | rles = coco_mask.frPyObjects(polygons, height, width) 47 | mask = coco_mask.decode(rles) 48 | if len(mask.shape) < 3: 49 | mask = mask[..., None] 50 | mask = torch.as_tensor(mask, dtype=torch.uint8) 51 | mask = mask.any(dim=2) 52 | masks.append(mask) 53 | if masks: 54 | masks = torch.stack(masks, dim=0) 55 | else: 56 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 57 | return masks 58 | 59 | 60 | class ConvertCocoPolysToMask(object): 61 | def __init__(self, return_masks=False): 62 | self.return_masks = return_masks 63 | 64 | def __call__(self, image, target): 65 | w, h = image.size 66 | 67 | image_id = target["image_id"] 68 | image_id = torch.tensor([image_id]) 69 | 70 | anno = target["annotations"] 71 | 72 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] 73 | 74 | boxes = [obj["bbox"] for obj in anno] 75 | # guard against no boxes via resizing 76 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 77 | boxes[:, 2:] += boxes[:, :2] 78 | boxes[:, 0::2].clamp_(min=0, max=w) 79 | 
boxes[:, 1::2].clamp_(min=0, max=h) 80 | 81 | classes = [obj["category_id"] for obj in anno] 82 | classes = torch.tensor(classes, dtype=torch.int64) 83 | 84 | if self.return_masks: 85 | segmentations = [obj["segmentation"] for obj in anno] 86 | masks = convert_coco_poly_to_mask(segmentations, h, w) 87 | 88 | keypoints = None 89 | if anno and "keypoints" in anno[0]: 90 | keypoints = [obj["keypoints"] for obj in anno] 91 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32) 92 | num_keypoints = keypoints.shape[0] 93 | if num_keypoints: 94 | keypoints = keypoints.view(num_keypoints, -1, 3) 95 | 96 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 97 | boxes = boxes[keep] 98 | classes = classes[keep] 99 | if self.return_masks: 100 | masks = masks[keep] 101 | if keypoints is not None: 102 | keypoints = keypoints[keep] 103 | 104 | target = {} 105 | target["boxes"] = boxes 106 | target["labels"] = classes 107 | if self.return_masks: 108 | target["masks"] = masks 109 | target["image_id"] = image_id 110 | if keypoints is not None: 111 | target["keypoints"] = keypoints 112 | 113 | # for conversion to coco api 114 | area = torch.tensor([obj["area"] for obj in anno]) 115 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 116 | target["area"] = area[keep] 117 | target["iscrowd"] = iscrowd[keep] 118 | 119 | target["orig_size"] = torch.as_tensor([int(h), int(w)]) 120 | target["size"] = torch.as_tensor([int(h), int(w)]) 121 | 122 | return image, target 123 | 124 | 125 | def make_coco_transforms(image_set): 126 | 127 | normalize = T.Compose([ 128 | T.ToTensor(), 129 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 130 | ]) 131 | 132 | scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] 133 | 134 | if image_set == 'train': 135 | return T.Compose([ 136 | T.RandomHorizontalFlip(), 137 | T.RandomSelect( 138 | T.RandomResize(scales, max_size=1333), 139 | T.Compose([ 140 | T.RandomResize([400, 500, 600]), 141 | T.RandomSizeCrop(384, 600), 142 | T.RandomResize(scales, max_size=1333), 143 | ]) 144 | ), 145 | normalize, 146 | ]) 147 | 148 | if image_set == 'val': 149 | return T.Compose([ 150 | T.RandomResize([800], max_size=1333), 151 | normalize, 152 | ]) 153 | 154 | raise ValueError(f'unknown {image_set}') 155 | 156 | 157 | def build(image_set, args): 158 | root = Path(args.coco_path) 159 | assert root.exists(), f'provided COCO path {root} does not exist' 160 | mode = 'instances' 161 | PATHS = { 162 | "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'), 163 | "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'), 164 | } 165 | 166 | img_folder, ann_file = PATHS[image_set] 167 | dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, 168 | cache_mode=args.cache_mode, local_rank=get_local_rank(), local_size=get_local_size()) 169 | return dataset 170 | -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | #include "cuda/ms_deform_im2col_cuda.cuh" 13 | 14 | #include <ATen/ATen.h> 15 | #include <ATen/cuda/CUDAContext.h> 16 | #include <cuda.h> 17 | #include <cuda_runtime.h> 18 | 19 | 20 | at::Tensor ms_deform_attn_cuda_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step) 27 | { 28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 33 | 34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 39 | 40 | const int batch = value.size(0); 41 | const int spatial_size = value.size(1); 42 | const int num_heads = value.size(2); 43 | const int channels = value.size(3); 44 | 45 | const int num_levels = spatial_shapes.size(0); 46 | 47 | const int num_query = sampling_loc.size(1); 48 | const int num_point = sampling_loc.size(4); 49 | 50 | const int im2col_step_ = std::min(batch, im2col_step); 51 | 52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 53 | 54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 55 | 56 | const int batch_n = im2col_step_; 57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 58 | auto per_value_size = spatial_size * num_heads * channels; 59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 60 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 61 | for (int n = 0; n < batch/im2col_step_; ++n) 62 | { 63 | auto columns = output_n.select(0, n); 64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 66 | value.data<scalar_t>() + n * im2col_step_ * per_value_size, 67 | spatial_shapes.data<int64_t>(), 68 | level_start_index.data<int64_t>(), 69 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 70 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size, 71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 72 | columns.data<scalar_t>()); 73 | 74 | })); 75 | } 76 | 77 | output = output.view({batch, num_query, num_heads*channels}); 78 | 79 | return output; 80 | } 81 | 82 | 83 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 84 | const at::Tensor &value, 85 | const at::Tensor &spatial_shapes, 86 | const at::Tensor &level_start_index, 87 |
const at::Tensor &sampling_loc, 88 | const at::Tensor &attn_weight, 89 | const at::Tensor &grad_output, 90 | const int im2col_step) 91 | { 92 | 93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 99 | 100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 106 | 107 | const int batch = value.size(0); 108 | const int spatial_size = value.size(1); 109 | const int num_heads = value.size(2); 110 | const int channels = value.size(3); 111 | 112 | const int num_levels = spatial_shapes.size(0); 113 | 114 | const int num_query = sampling_loc.size(1); 115 | const int num_point = sampling_loc.size(4); 116 | 117 | const int im2col_step_ = std::min(batch, im2col_step); 118 | 119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 120 | 121 | auto grad_value = at::zeros_like(value); 122 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 123 | auto grad_attn_weight = at::zeros_like(attn_weight); 124 | 125 | const int batch_n = im2col_step_; 126 | auto per_value_size = spatial_size * num_heads * channels; 127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 130 | 131 | for (int n = 0; n < batch/im2col_step_; ++n) 132 | { 133 | auto grad_output_g = grad_output_n.select(0, n); 134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 136 | grad_output_g.data<scalar_t>(), 137 | value.data<scalar_t>() + n * im2col_step_ * per_value_size, 138 | spatial_shapes.data<int64_t>(), 139 | level_start_index.data<int64_t>(), 140 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 141 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size, 142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 143 | grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size, 144 | grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 145 | grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size); 146 | 147 | })); 148 | } 149 | 150 | return { 151 | grad_value, grad_sampling_loc, grad_attn_weight 152 | }; 153 | } -------------------------------------------------------------------------------- /models/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | #
Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Deformable DETR 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------------------ 9 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 10 | # ------------------------------------------------------------------------------------------------ 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction 25 | 26 | 27 | def _is_power_of_2(n): 28 | if (not isinstance(n, int)) or (n < 0): 29 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 30 | return (n & (n-1) == 0) and n != 0 31 | 32 | 33 | class MSDeformAttn(nn.Module): 34 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 35 | """ 36 | Multi-Scale Deformable Attention Module 37 | :param d_model hidden dimension 38 | :param n_levels number of feature levels 39 | :param n_heads number of attention heads 40 | :param n_points number of sampling points per attention head per feature level 41 | """ 42 | super().__init__() 43 | if d_model % n_heads != 0: 44 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 45 | _d_per_head = d_model // n_heads 46 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 47 | if not _is_power_of_2(_d_per_head): 48 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 49 | "which is more efficient in our CUDA implementation.") 50 | 51 | self.im2col_step = 64 52 | 53 | self.d_model = d_model 54 | self.n_levels = n_levels 55 | self.n_heads = n_heads 56 | self.n_points = n_points 57 | 58 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 59 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 60 | self.value_proj = nn.Linear(d_model, d_model) 61 | self.output_proj = nn.Linear(d_model, d_model) 62 | self.python_ops_for_test = False 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
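# Usage sketch (illustrative only; assumes the custom CUDA extension has been built via models/ops/make.sh, and uses toy shapes that satisfy the checks in forward() below):
#
#     attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4).cuda()
#     spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long).cuda()
#     level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
#     src = torch.rand(2, int(spatial_shapes.prod(1).sum()), 256).cuda()      # flattened multi-scale features
#     query = torch.rand(2, 100, 256).cuda()
#     reference_points = torch.rand(2, 100, 1, 2).repeat(1, 1, 2, 1).cuda()   # normalized (x, y) per level
#     output, sampling_locations, attention_weights = attn(
#         query, reference_points, src, spatial_shapes, level_start_index)
#     # output: (2, 100, 256), sampling_locations: (2, 100, 8, 2, 4, 2), attention_weights: (2, 100, 8, 2, 4)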
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | # N, Len_q, n_heads, n_levels, n_points, 2 106 | if reference_points.shape[-1] == 2: 107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 108 | sampling_locations = reference_points[:, :, None, :, None, :] \ 109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 110 | elif reference_points.shape[-1] == 4: 111 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 113 | else: 114 | raise ValueError( 115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 116 | if not self.python_ops_for_test: 117 | output = MSDeformAttnFunction.apply( 118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 119 | else: 120 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 121 | output = self.output_proj(output) 122 | return output, sampling_locations, attention_weights 123 | 124 | 125 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 126 | # for debug and test only, 127 | # need to use cuda version instead 128 | N_, S_, M_, D_ = value.shape 129 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 130 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 131 | sampling_grids = 2 * sampling_locations - 1 132 | sampling_value_list = [] 133 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 134 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 135 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 136 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 137 | sampling_grid_l_ = 
sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 138 | # N_*M_, D_, Lq_, P_ 139 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 140 | mode='bilinear', padding_mode='zeros', align_corners=False) 141 | sampling_value_list.append(sampling_value_l_) 142 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 143 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 144 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 145 | return output.transpose(1, 2).contiguous() 146 | -------------------------------------------------------------------------------- /engine.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------ 9 | # Modified from DETR (https://github.com/facebookresearch/detr) 10 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 11 | # ------------------------------------------------------------------------------------ 12 | 13 | 14 | """ 15 | Train and eval functions used in main.py 16 | """ 17 | import math 18 | import os 19 | import sys 20 | from typing import Iterable 21 | 22 | import torch 23 | import util.misc as utils 24 | from datasets.coco_eval import CocoEvaluator 25 | from datasets.panoptic_eval import PanopticEvaluator 26 | from datasets.data_prefetcher import data_prefetcher 27 | 28 | from util.misc import check_unused_parameters 29 | 30 | 31 | def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, 32 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 33 | device: torch.device, epoch: int, max_norm: float = 0, 34 | writer=None, total_iter=0): 35 | model.train() 36 | criterion.train() 37 | metric_logger = utils.MetricLogger(delimiter=" ") 38 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 39 | metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) 40 | metric_logger.add_meter('grad_norm', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) 41 | header = 'Epoch: [{}]'.format(epoch) 42 | print_freq = 10 43 | 44 | prefetcher = data_prefetcher(data_loader, device, prefetch=True) 45 | samples, targets = prefetcher.next() 46 | 47 | for i in metric_logger.log_every(range(len(data_loader)), print_freq, header): 48 | outputs = model(samples) 49 | loss_dict = criterion(outputs, targets) 50 | weight_dict = criterion.weight_dict 51 | losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) 52 | 53 | # reduce losses over all GPUs for logging purposes 54 | loss_dict_reduced = utils.reduce_dict(loss_dict) 55 | loss_dict_reduced_unscaled = {f'{k}_unscaled': v 56 | for k, v in loss_dict_reduced.items()} 57 | loss_dict_reduced_scaled = {k: v * weight_dict[k] 58 | for k, v in loss_dict_reduced.items() if k in weight_dict} 59 | losses_reduced_scaled = sum(loss_dict_reduced_scaled.values()) 60 | 61 | 
loss_value = losses_reduced_scaled.item() 62 | 63 | if not math.isfinite(loss_value): 64 | print("Loss is {}, stopping training".format(loss_value)) 65 | print(loss_dict_reduced) 66 | sys.exit(1) 67 | 68 | optimizer.zero_grad() 69 | losses.backward() 70 | 71 | if i == 0: 72 | check_unused_parameters(model, loss_dict, weight_dict) 73 | 74 | if max_norm > 0: 75 | grad_total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) 76 | else: 77 | grad_total_norm = utils.get_total_grad_norm(model.parameters(), max_norm) 78 | 79 | metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled) 80 | metric_logger.update(class_error=loss_dict_reduced['class_error']) 81 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 82 | metric_logger.update(grad_norm=grad_total_norm) 83 | 84 | optimizer.step() 85 | 86 | if total_iter % (print_freq*10) == 0 and utils.is_main_process(): 87 | writer.add_scalar('train/loss', loss_value, total_iter) 88 | writer.add_scalar('train/class_error', loss_dict_reduced['class_error'], total_iter) 89 | writer.add_scalar('lr', optimizer.param_groups[0]["lr"], total_iter) 90 | writer.add_scalar('train/grad_norm', grad_total_norm, total_iter) 91 | for key, value in loss_dict_reduced_scaled.items(): 92 | writer.add_scalar('train/'+key, value, total_iter) 93 | for key, value in loss_dict_reduced_unscaled.items(): 94 | if "corr" in key: 95 | writer.add_scalar('train/'+key, value, total_iter) 96 | 97 | total_iter += 1 98 | samples, targets = prefetcher.next() 99 | 100 | # gather the stats from all processes 101 | metric_logger.synchronize_between_processes() 102 | print("Averaged stats:", metric_logger) 103 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()}, total_iter 104 | 105 | 106 | @torch.no_grad() 107 | def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, args): 108 | model.eval() 109 | criterion.eval() 110 | 111 | metric_logger = utils.MetricLogger(delimiter=" ") 112 | metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) 113 | header = 'Test:' 114 | 115 | iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys()) 116 | coco_evaluator = CocoEvaluator(base_ds, iou_types) 117 | 118 | panoptic_evaluator = None 119 | if 'panoptic' in postprocessors.keys(): 120 | panoptic_evaluator = PanopticEvaluator( 121 | data_loader.dataset.ann_file, 122 | data_loader.dataset.ann_folder, 123 | output_dir=os.path.join(args.output_dir, "panoptic_eval"), 124 | ) 125 | 126 | for step, (samples, targets) in enumerate(metric_logger.log_every(data_loader, 10, header)): 127 | samples = samples.to(device) 128 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 129 | 130 | outputs = model(samples) 131 | loss_dict = criterion(outputs, targets) 132 | weight_dict = criterion.weight_dict 133 | 134 | # reduce losses over all GPUs for logging purposes 135 | loss_dict_reduced = utils.reduce_dict(loss_dict) 136 | loss_dict_reduced_scaled = {k: v * weight_dict[k] 137 | for k, v in loss_dict_reduced.items() if k in weight_dict} 138 | loss_dict_reduced_unscaled = {f'{k}_unscaled': v 139 | for k, v in loss_dict_reduced.items()} 140 | metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()), 141 | **loss_dict_reduced_scaled, 142 | **loss_dict_reduced_unscaled) 143 | metric_logger.update(class_error=loss_dict_reduced['class_error']) 144 | 145 | orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) 146 | results = 
postprocessors['bbox'](outputs, orig_target_sizes) 147 | if 'segm' in postprocessors.keys(): 148 | target_sizes = torch.stack([t["size"] for t in targets], dim=0) 149 | results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes) 150 | res = {target['image_id'].item(): output for target, output in zip(targets, results)} 151 | if coco_evaluator is not None: 152 | coco_evaluator.update(res) 153 | 154 | if panoptic_evaluator is not None: 155 | res_pano = postprocessors["panoptic"](outputs, target_sizes, orig_target_sizes) 156 | for i, target in enumerate(targets): 157 | image_id = target["image_id"].item() 158 | file_name = f"{image_id:012d}.png" 159 | res_pano[i]["image_id"] = image_id 160 | res_pano[i]["file_name"] = file_name 161 | 162 | panoptic_evaluator.update(res_pano) 163 | 164 | 165 | 166 | # gather the stats from all processes 167 | metric_logger.synchronize_between_processes() 168 | print("Averaged stats:", metric_logger) 169 | if coco_evaluator is not None: 170 | coco_evaluator.synchronize_between_processes() 171 | if panoptic_evaluator is not None: 172 | panoptic_evaluator.synchronize_between_processes() 173 | 174 | # accumulate predictions from all images 175 | if coco_evaluator is not None: 176 | coco_evaluator.accumulate() 177 | coco_evaluator.summarize() 178 | panoptic_res = None 179 | if panoptic_evaluator is not None: 180 | panoptic_res = panoptic_evaluator.summarize() 181 | stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} 182 | if coco_evaluator is not None: 183 | if 'bbox' in postprocessors.keys(): 184 | stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() 185 | if 'segm' in postprocessors.keys(): 186 | stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist() 187 | if panoptic_res is not None: 188 | stats['PQ_all'] = panoptic_res["All"] 189 | stats['PQ_th'] = panoptic_res["Things"] 190 | stats['PQ_st'] = panoptic_res["Stuff"] 191 | return stats, coco_evaluator 192 | -------------------------------------------------------------------------------- /tools/launch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------------------------------------------- 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # -------------------------------------------------------------------------------------------------------------------------- 6 | # Modified from https://github.com/pytorch/pytorch/blob/173f224570017b4b1a3a1a13d0bff280a54d9cd9/torch/distributed/launch.py 7 | # -------------------------------------------------------------------------------------------------------------------------- 8 | 9 | r""" 10 | `torch.distributed.launch` is a module that spawns up multiple distributed 11 | training processes on each of the training nodes. 12 | The utility can be used for single-node distributed training, in which one or 13 | more processes per node will be spawned. The utility can be used for either 14 | CPU training or GPU training. If the utility is used for GPU training, 15 | each distributed process will be operating on a single GPU. This can achieve 16 | well-improved single-node training performance. 
It can also be used in 17 | multi-node distributed training, by spawning up multiple processes on each node 18 | for well-improved multi-node distributed training performance as well. 19 | This will especially be benefitial for systems with multiple Infiniband 20 | interfaces that have direct-GPU support, since all of them can be utilized for 21 | aggregated communication bandwidth. 22 | In both cases of single-node distributed training or multi-node distributed 23 | training, this utility will launch the given number of processes per node 24 | (``--nproc_per_node``). If used for GPU training, this number needs to be less 25 | or euqal to the number of GPUs on the current system (``nproc_per_node``), 26 | and each process will be operating on a single GPU from *GPU 0 to 27 | GPU (nproc_per_node - 1)*. 28 | **How to use this module:** 29 | 1. Single-Node multi-process distributed training 30 | :: 31 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 32 | YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other 33 | arguments of your training script) 34 | 2. Multi-Node multi-process distributed training: (e.g. two nodes) 35 | Node 1: *(IP: 192.168.1.1, and has a free port: 1234)* 36 | :: 37 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 38 | --nnodes=2 --node_rank=0 --master_addr="192.168.1.1" 39 | --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 40 | and all other arguments of your training script) 41 | Node 2: 42 | :: 43 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 44 | --nnodes=2 --node_rank=1 --master_addr="192.168.1.1" 45 | --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 46 | and all other arguments of your training script) 47 | 3. To look up what optional arguments this module offers: 48 | :: 49 | >>> python -m torch.distributed.launch --help 50 | **Important Notices:** 51 | 1. This utilty and multi-process distributed (single-node or 52 | multi-node) GPU training currently only achieves the best performance using 53 | the NCCL distributed backend. Thus NCCL backend is the recommended backend to 54 | use for GPU training. 55 | 2. In your training program, you must parse the command-line argument: 56 | ``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by this module. 57 | If your training program uses GPUs, you should ensure that your code only 58 | runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by: 59 | Parsing the local_rank argument 60 | :: 61 | >>> import argparse 62 | >>> parser = argparse.ArgumentParser() 63 | >>> parser.add_argument("--local_rank", type=int) 64 | >>> args = parser.parse_args() 65 | Set your device to local rank using either 66 | :: 67 | >>> torch.cuda.set_device(arg.local_rank) # before your code runs 68 | or 69 | :: 70 | >>> with torch.cuda.device(arg.local_rank): 71 | >>> # your code to run 72 | 3. In your training program, you are supposed to call the following function 73 | at the beginning to start the distributed backend. You need to make sure that 74 | the init_method uses ``env://``, which is the only supported ``init_method`` 75 | by this module. 76 | :: 77 | torch.distributed.init_process_group(backend='YOUR BACKEND', 78 | init_method='env://') 79 | 4. In your training program, you can either use regular distributed functions 80 | or use :func:`torch.nn.parallel.DistributedDataParallel` module. 
If your 81 | training program uses GPUs for training and you would like to use 82 | :func:`torch.nn.parallel.DistributedDataParallel` module, 83 | here is how to configure it. 84 | :: 85 | model = torch.nn.parallel.DistributedDataParallel(model, 86 | device_ids=[arg.local_rank], 87 | output_device=arg.local_rank) 88 | Please ensure that ``device_ids`` argument is set to be the only GPU device id 89 | that your code will be operating on. This is generally the local rank of the 90 | process. In other words, the ``device_ids`` needs to be ``[args.local_rank]``, 91 | and ``output_device`` needs to be ``args.local_rank`` in order to use this 92 | utility 93 | 5. Another way to pass ``local_rank`` to the subprocesses via environment variable 94 | ``LOCAL_RANK``. This behavior is enabled when you launch the script with 95 | ``--use_env=True``. You must adjust the subprocess example above to replace 96 | ``args.local_rank`` with ``os.environ['LOCAL_RANK']``; the launcher 97 | will not pass ``--local_rank`` when you specify this flag. 98 | .. warning:: 99 | ``local_rank`` is NOT globally unique: it is only unique per process 100 | on a machine. Thus, don't use it to decide if you should, e.g., 101 | write to a networked filesystem. See 102 | https://github.com/pytorch/pytorch/issues/12042 for an example of 103 | how things can go wrong if you don't do this correctly. 104 | """ 105 | 106 | 107 | import sys 108 | import subprocess 109 | import os 110 | import socket 111 | from argparse import ArgumentParser, REMAINDER 112 | 113 | import torch 114 | 115 | 116 | def parse_args(): 117 | """ 118 | Helper function parsing the command line options 119 | @retval ArgumentParser 120 | """ 121 | parser = ArgumentParser(description="PyTorch distributed training launch " 122 | "helper utilty that will spawn up " 123 | "multiple distributed processes") 124 | 125 | # Optional arguments for the launch helper 126 | parser.add_argument("--nnodes", type=int, default=1, 127 | help="The number of nodes to use for distributed " 128 | "training") 129 | parser.add_argument("--node_rank", type=int, default=0, 130 | help="The rank of the node for multi-node distributed " 131 | "training") 132 | parser.add_argument("--nproc_per_node", type=int, default=1, 133 | help="The number of processes to launch on each node, " 134 | "for GPU training, this is recommended to be set " 135 | "to the number of GPUs in your system so that " 136 | "each process can be bound to a single GPU.") 137 | parser.add_argument("--master_addr", default="127.0.0.1", type=str, 138 | help="Master node (rank 0)'s address, should be either " 139 | "the IP address or the hostname of node 0, for " 140 | "single node multi-proc training, the " 141 | "--master_addr can simply be 127.0.0.1") 142 | parser.add_argument("--master_port", default=29500, type=int, 143 | help="Master node (rank 0)'s free port that needs to " 144 | "be used for communciation during distributed " 145 | "training") 146 | 147 | # positional 148 | parser.add_argument("training_script", type=str, 149 | help="The full path to the single GPU training " 150 | "program/script to be launched in parallel, " 151 | "followed by all the arguments for the " 152 | "training script") 153 | 154 | # rest from the training program 155 | parser.add_argument('training_script_args', nargs=REMAINDER) 156 | return parser.parse_args() 157 | 158 | 159 | def main(): 160 | args = parse_args() 161 | 162 | # world size in terms of number of processes 163 | dist_world_size = args.nproc_per_node * args.nnodes 164 | 165 
| # set PyTorch distributed related environmental variables 166 | current_env = os.environ.copy() 167 | current_env["MASTER_ADDR"] = args.master_addr 168 | current_env["MASTER_PORT"] = str(args.master_port) 169 | current_env["WORLD_SIZE"] = str(dist_world_size) 170 | 171 | processes = [] 172 | 173 | for local_rank in range(0, args.nproc_per_node): 174 | # each process's rank 175 | dist_rank = args.nproc_per_node * args.node_rank + local_rank 176 | current_env["RANK"] = str(dist_rank) 177 | current_env["LOCAL_RANK"] = str(local_rank) 178 | 179 | cmd = [args.training_script] + args.training_script_args 180 | 181 | process = subprocess.Popen(cmd, env=current_env) 182 | processes.append(process) 183 | 184 | for process in processes: 185 | process.wait() 186 | if process.returncode != 0: 187 | raise subprocess.CalledProcessError(returncode=process.returncode, 188 | cmd=process.args) 189 | 190 | 191 | if __name__ == "__main__": 192 | main() -------------------------------------------------------------------------------- /datasets/transforms.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Transforms and data augmentation for both image + bbox. 12 | """ 13 | import random 14 | 15 | import PIL 16 | import torch 17 | import torchvision.transforms as T 18 | import torchvision.transforms.functional as F 19 | 20 | from util.box_ops import box_xyxy_to_cxcywh 21 | from util.misc import interpolate 22 | 23 | 24 | def crop(image, target, region): 25 | cropped_image = F.crop(image, *region) 26 | 27 | target = target.copy() 28 | i, j, h, w = region 29 | 30 | # should we do something wrt the original size? 31 | target["size"] = torch.tensor([h, w]) 32 | 33 | fields = ["labels", "area", "iscrowd"] 34 | 35 | if "boxes" in target: 36 | boxes = target["boxes"] 37 | max_size = torch.as_tensor([w, h], dtype=torch.float32) 38 | cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) 39 | cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) 40 | cropped_boxes = cropped_boxes.clamp(min=0) 41 | area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) 42 | target["boxes"] = cropped_boxes.reshape(-1, 4) 43 | target["area"] = area 44 | fields.append("boxes") 45 | 46 | if "masks" in target: 47 | # FIXME should we update the area here if there are no boxes? 
48 | target['masks'] = target['masks'][:, i:i + h, j:j + w] 49 | fields.append("masks") 50 | 51 | # remove elements for which the boxes or masks that have zero area 52 | if "boxes" in target or "masks" in target: 53 | # favor boxes selection when defining which elements to keep 54 | # this is compatible with previous implementation 55 | if "boxes" in target: 56 | cropped_boxes = target['boxes'].reshape(-1, 2, 2) 57 | keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) 58 | else: 59 | keep = target['masks'].flatten(1).any(1) 60 | 61 | for field in fields: 62 | target[field] = target[field][keep] 63 | 64 | return cropped_image, target 65 | 66 | 67 | def hflip(image, target): 68 | flipped_image = F.hflip(image) 69 | 70 | w, h = image.size 71 | 72 | target = target.copy() 73 | if "boxes" in target: 74 | boxes = target["boxes"] 75 | boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) 76 | target["boxes"] = boxes 77 | 78 | if "masks" in target: 79 | target['masks'] = target['masks'].flip(-1) 80 | 81 | return flipped_image, target 82 | 83 | 84 | def resize(image, target, size, max_size=None): 85 | # size can be min_size (scalar) or (w, h) tuple 86 | 87 | def get_size_with_aspect_ratio(image_size, size, max_size=None): 88 | w, h = image_size 89 | if max_size is not None: 90 | min_original_size = float(min((w, h))) 91 | max_original_size = float(max((w, h))) 92 | if max_original_size / min_original_size * size > max_size: 93 | size = int(round(max_size * min_original_size / max_original_size)) 94 | 95 | if (w <= h and w == size) or (h <= w and h == size): 96 | return (h, w) 97 | 98 | if w < h: 99 | ow = size 100 | oh = int(size * h / w) 101 | else: 102 | oh = size 103 | ow = int(size * w / h) 104 | 105 | return (oh, ow) 106 | 107 | def get_size(image_size, size, max_size=None): 108 | if isinstance(size, (list, tuple)): 109 | return size[::-1] 110 | else: 111 | return get_size_with_aspect_ratio(image_size, size, max_size) 112 | 113 | size = get_size(image.size, size, max_size) 114 | rescaled_image = F.resize(image, size) 115 | 116 | if target is None: 117 | return rescaled_image, None 118 | 119 | ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) 120 | ratio_width, ratio_height = ratios 121 | 122 | target = target.copy() 123 | if "boxes" in target: 124 | boxes = target["boxes"] 125 | scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) 126 | target["boxes"] = scaled_boxes 127 | 128 | if "area" in target: 129 | area = target["area"] 130 | scaled_area = area * (ratio_width * ratio_height) 131 | target["area"] = scaled_area 132 | 133 | h, w = size 134 | target["size"] = torch.tensor([h, w]) 135 | 136 | if "masks" in target: 137 | target['masks'] = interpolate( 138 | target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5 139 | 140 | return rescaled_image, target 141 | 142 | 143 | def pad(image, target, padding): 144 | # assumes that we only pad on the bottom right corners 145 | padded_image = F.pad(image, (0, 0, padding[0], padding[1])) 146 | if target is None: 147 | return padded_image, None 148 | target = target.copy() 149 | # should we do something wrt the original size? 
150 | target["size"] = torch.tensor(padded_image[::-1]) 151 | if "masks" in target: 152 | target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1])) 153 | return padded_image, target 154 | 155 | 156 | class RandomCrop(object): 157 | def __init__(self, size): 158 | self.size = size 159 | 160 | def __call__(self, img, target): 161 | region = T.RandomCrop.get_params(img, self.size) 162 | return crop(img, target, region) 163 | 164 | 165 | class RandomSizeCrop(object): 166 | def __init__(self, min_size: int, max_size: int): 167 | self.min_size = min_size 168 | self.max_size = max_size 169 | 170 | def __call__(self, img: PIL.Image.Image, target: dict): 171 | w = random.randint(self.min_size, min(img.width, self.max_size)) 172 | h = random.randint(self.min_size, min(img.height, self.max_size)) 173 | region = T.RandomCrop.get_params(img, [h, w]) 174 | return crop(img, target, region) 175 | 176 | 177 | class CenterCrop(object): 178 | def __init__(self, size): 179 | self.size = size 180 | 181 | def __call__(self, img, target): 182 | image_width, image_height = img.size 183 | crop_height, crop_width = self.size 184 | crop_top = int(round((image_height - crop_height) / 2.)) 185 | crop_left = int(round((image_width - crop_width) / 2.)) 186 | return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) 187 | 188 | 189 | class RandomHorizontalFlip(object): 190 | def __init__(self, p=0.5): 191 | self.p = p 192 | 193 | def __call__(self, img, target): 194 | if random.random() < self.p: 195 | return hflip(img, target) 196 | return img, target 197 | 198 | 199 | class RandomResize(object): 200 | def __init__(self, sizes, max_size=None): 201 | assert isinstance(sizes, (list, tuple)) 202 | self.sizes = sizes 203 | self.max_size = max_size 204 | 205 | def __call__(self, img, target=None): 206 | size = random.choice(self.sizes) 207 | return resize(img, target, size, self.max_size) 208 | 209 | 210 | class RandomPad(object): 211 | def __init__(self, max_pad): 212 | self.max_pad = max_pad 213 | 214 | def __call__(self, img, target): 215 | pad_x = random.randint(0, self.max_pad) 216 | pad_y = random.randint(0, self.max_pad) 217 | return pad(img, target, (pad_x, pad_y)) 218 | 219 | 220 | class RandomSelect(object): 221 | """ 222 | Randomly selects between transforms1 and transforms2, 223 | with probability p for transforms1 and (1 - p) for transforms2 224 | """ 225 | def __init__(self, transforms1, transforms2, p=0.5): 226 | self.transforms1 = transforms1 227 | self.transforms2 = transforms2 228 | self.p = p 229 | 230 | def __call__(self, img, target): 231 | if random.random() < self.p: 232 | return self.transforms1(img, target) 233 | return self.transforms2(img, target) 234 | 235 | 236 | class ToTensor(object): 237 | def __call__(self, img, target): 238 | return F.to_tensor(img), target 239 | 240 | 241 | class RandomErasing(object): 242 | 243 | def __init__(self, *args, **kwargs): 244 | self.eraser = T.RandomErasing(*args, **kwargs) 245 | 246 | def __call__(self, img, target): 247 | return self.eraser(img), target 248 | 249 | 250 | class Normalize(object): 251 | def __init__(self, mean, std): 252 | self.mean = mean 253 | self.std = std 254 | 255 | def __call__(self, image, target=None): 256 | image = F.normalize(image, mean=self.mean, std=self.std) 257 | if target is None: 258 | return image, None 259 | target = target.copy() 260 | h, w = image.shape[-2:] 261 | if "boxes" in target: 262 | boxes = target["boxes"] 263 | boxes = box_xyxy_to_cxcywh(boxes) 264 | boxes = boxes / 
torch.tensor([w, h, w, h], dtype=torch.float32) 265 | target["boxes"] = boxes 266 | return image, target 267 | 268 | 269 | class Compose(object): 270 | def __init__(self, transforms): 271 | self.transforms = transforms 272 | 273 | def __call__(self, image, target): 274 | for t in self.transforms: 275 | image, target = t(image, target) 276 | return image, target 277 | 278 | def __repr__(self): 279 | format_string = self.__class__.__name__ + "(" 280 | for t in self.transforms: 281 | format_string += "\n" 282 | format_string += " {0}".format(t) 283 | format_string += "\n)" 284 | return format_string 285 | -------------------------------------------------------------------------------- /models/backbone.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------ 9 | # Modified from DETR (https://github.com/facebookresearch/detr) 10 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 11 | # ------------------------------------------------------------------------------------ 12 | 13 | 14 | """ 15 | Backbone modules. 16 | """ 17 | from collections import OrderedDict 18 | 19 | import torch 20 | import torch.nn.functional as F 21 | import torchvision 22 | from torch import nn 23 | from torchvision.models._utils import IntermediateLayerGetter 24 | from typing import Dict, List 25 | 26 | from models import swin_transformer 27 | from util.misc import NestedTensor, is_main_process 28 | 29 | from .position_encoding import build_position_encoding 30 | 31 | 32 | class FrozenBatchNorm2d(torch.nn.Module): 33 | """ 34 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 35 | 36 | Copy-paste from torchvision.misc.ops with added eps before rsqrt, 37 | without which any other models than torchvision.models.resnet[18,34,50,101] 38 | produce nans. 
39 | """ 40 | 41 | def __init__(self, n, eps=1e-5): 42 | super(FrozenBatchNorm2d, self).__init__() 43 | self.register_buffer("weight", torch.ones(n)) 44 | self.register_buffer("bias", torch.zeros(n)) 45 | self.register_buffer("running_mean", torch.zeros(n)) 46 | self.register_buffer("running_var", torch.ones(n)) 47 | self.eps = eps 48 | 49 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 50 | missing_keys, unexpected_keys, error_msgs): 51 | num_batches_tracked_key = prefix + 'num_batches_tracked' 52 | if num_batches_tracked_key in state_dict: 53 | del state_dict[num_batches_tracked_key] 54 | 55 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 56 | state_dict, prefix, local_metadata, strict, 57 | missing_keys, unexpected_keys, error_msgs) 58 | 59 | def forward(self, x): 60 | # move reshapes to the beginning 61 | # to make it fuser-friendly 62 | w = self.weight.reshape(1, -1, 1, 1) 63 | b = self.bias.reshape(1, -1, 1, 1) 64 | rv = self.running_var.reshape(1, -1, 1, 1) 65 | rm = self.running_mean.reshape(1, -1, 1, 1) 66 | eps = self.eps 67 | scale = w * (rv + eps).rsqrt() 68 | bias = b - rm * scale 69 | return x * scale + bias 70 | 71 | 72 | class BackboneBase(nn.Module): 73 | 74 | def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool, args): 75 | # TODO: args -> duplicated args 76 | super().__init__() 77 | if 'none' in args.backbone: 78 | self.strides = [1] # not used, actually (length only matters) 79 | self.num_channels = [3] 80 | return_layers = self.get_return_layers('identity', (0,)) 81 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 82 | 83 | elif 'resnet' in args.backbone: 84 | 85 | if not args.backbone_from_scratch and not args.finetune_early_layers: 86 | print("Freeze early layers.") 87 | for name, parameter in backbone.named_parameters(): 88 | if not train_backbone or all([k not in name for k in ['layer2', 'layer3', 'layer4']]): 89 | parameter.requires_grad_(False) 90 | else: 91 | print('Finetune early layers as well.') 92 | 93 | layer_name = "layer" 94 | if return_interm_layers: 95 | return_layers = self.get_return_layers(layer_name, (2, 3, 4)) 96 | self.strides = [8, 16, 32] 97 | self.num_channels = [512, 1024, 2048] 98 | else: 99 | return_layers = self.get_return_layers(layer_name, (4,)) 100 | self.strides = [32] 101 | self.num_channels = [2048] 102 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 103 | 104 | elif 'swin' in args.backbone: 105 | if return_interm_layers: 106 | num_channels = [int(backbone.embed_dim * 2 ** i) for i in range(backbone.num_layers)] 107 | return_layers = [2, 3, 4] 108 | self.strides = [8, 16, 32] 109 | self.num_channels = num_channels[1:] 110 | else: 111 | return_layers = [4] 112 | self.strides = [32] 113 | self.num_channels = num_channels[-1] 114 | self.body = backbone 115 | 116 | else: 117 | raise ValueError(f"Unknown backbone name: {args.backbone}") 118 | 119 | @staticmethod 120 | def get_return_layers(name: str, layer_ids): 121 | return {name + str(n): str(i) for i, n in enumerate(layer_ids)} 122 | 123 | def forward(self, tensor_list: NestedTensor): 124 | xs = self.body(tensor_list.tensors) 125 | out: Dict[str, NestedTensor] = {} 126 | for name, x in xs.items(): 127 | m = tensor_list.mask 128 | assert m is not None 129 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 130 | out[name] = NestedTensor(x, mask) 131 | return out 132 | 133 | 134 | class DummyBackbone(torch.nn.Module): 135 | def 
__init__(self): 136 | super().__init__() 137 | self.identity0 = torch.nn.Identity() 138 | 139 | 140 | class Backbone(BackboneBase): 141 | """ResNet backbone with frozen BatchNorm.""" 142 | def __init__(self, name: str, 143 | train_backbone: bool, 144 | return_interm_layers: bool, 145 | dilation: bool, 146 | args): 147 | print(f"Backbone: {name}") 148 | pretrained = is_main_process() and not args.backbone_from_scratch and not args.scrl_pretrained_path 149 | if not pretrained: 150 | print("Train backbone from scratch.") 151 | else: 152 | print("Load pretrained weights") 153 | 154 | if "none" in name: 155 | backbone = DummyBackbone() 156 | elif "resnet" in name: 157 | assert name not in ("resnet18", "resnet34"), "number of channels are hard coded" 158 | backbone = getattr(torchvision.models, name)( 159 | replace_stride_with_dilation=[False, False, dilation], 160 | pretrained=pretrained, norm_layer=FrozenBatchNorm2d) 161 | elif "swin" in name: 162 | assert not dilation, "not supported" 163 | if not args.backbone_from_scratch and not args.finetune_early_layers: 164 | print("Freeze early layers.") 165 | frozen_stages = 2 166 | else: 167 | print('Finetune early layers as well.') 168 | frozen_stages = -1 169 | if return_interm_layers: 170 | out_indices = [1, 2, 3] 171 | else: 172 | out_indices = [3] 173 | 174 | backbone = swin_transformer.build_model( 175 | name, out_indices=out_indices, frozen_stages=frozen_stages, pretrained=pretrained) 176 | else: 177 | raise ValueError(f"Unknown backbone name: {args.backbone}") 178 | 179 | if args.scrl_pretrained_path: 180 | assert "resnet" in name, "Currently only resnet50 is available." 181 | ckpt = torch.load(args.scrl_pretrained_path, map_location="cpu") 182 | translate_map = { 183 | "encoder.0" : "conv1", 184 | "encoder.1" : "bn1", 185 | "encoder.4" : "layer1", 186 | "encoder.5" : "layer2", 187 | "encoder.6" : "layer3", 188 | "encoder.7" : "layer4", 189 | } 190 | state_dict = { 191 | translate_map[k[:9]] + k[9:] : v 192 | for k, v in ckpt["online_network_state_dict"].items() 193 | if "encoder" in k 194 | } 195 | backbone.load_state_dict(state_dict, strict=False) 196 | 197 | super().__init__(backbone, train_backbone, return_interm_layers, args) 198 | if dilation and "resnet" in name: 199 | self.strides[-1] = self.strides[-1] // 2 200 | 201 | 202 | class Joiner(nn.Sequential): 203 | def __init__(self, backbone, position_embedding): 204 | super().__init__(backbone, position_embedding) 205 | self.strides = backbone.strides 206 | self.num_channels = backbone.num_channels 207 | 208 | def forward(self, tensor_list: NestedTensor): 209 | xs = self[0](tensor_list) 210 | out: List[NestedTensor] = [] 211 | pos = [] 212 | for name, x in sorted(xs.items()): 213 | out.append(x) 214 | 215 | # position encoding 216 | for x in out: 217 | pos.append(self[1](x).to(x.tensors.dtype)) 218 | 219 | return out, pos 220 | 221 | 222 | def test_backbone(backbone): 223 | imgs = [ 224 | torch.randn(2, 3, 633, 122), 225 | torch.randn(2, 3, 322, 532), 226 | torch.randn(2, 3, 236, 42), 227 | ] 228 | return [backbone(img).shape for img in imgs] 229 | 230 | 231 | def build_backbone(args): 232 | # test_backbone(torchvision.models.resnet50()) 233 | position_embedding = build_position_encoding(args) 234 | train_backbone = args.lr_backbone > 0 235 | return_interm_layers = args.masks or (args.num_feature_levels > 1) 236 | backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation, args) 237 | model = Joiner(backbone, position_embedding) 238 | return model 239 | 
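As a quick sanity check of the `FrozenBatchNorm2d` that the ResNet backbone above is built with, the frozen module should match an `nn.BatchNorm2d` in eval mode once the same affine parameters and running statistics are copied into its buffers. A minimal sketch (assuming it is run from the repository root so that `models.backbone` is importable):

```python
import torch
from torch import nn

from models.backbone import FrozenBatchNorm2d

bn = nn.BatchNorm2d(8).eval()
frozen = FrozenBatchNorm2d(8)

# copy the affine parameters and running statistics into the frozen buffers
with torch.no_grad():
    frozen.weight.copy_(bn.weight)
    frozen.bias.copy_(bn.bias)
    frozen.running_mean.copy_(bn.running_mean)
    frozen.running_var.copy_(bn.running_var)

x = torch.randn(2, 8, 16, 16)
with torch.no_grad():
    print(torch.allclose(bn(x), frozen(x), atol=1e-6))  # expected: True
```

Unlike a regular batch norm, the frozen variant exposes no trainable parameters at all, so the batch statistics stay fixed regardless of `model.train()` / `model.eval()`.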
-------------------------------------------------------------------------------- /datasets/coco_eval.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | COCO evaluator that works in distributed mode. 12 | 13 | Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py 14 | The difference is that there is less copy-pasting from pycocotools 15 | in the end of the file, as python3 can suppress prints with contextlib 16 | """ 17 | import os 18 | import contextlib 19 | import copy 20 | import numpy as np 21 | import torch 22 | 23 | from pycocotools.cocoeval import COCOeval 24 | from pycocotools.coco import COCO 25 | import pycocotools.mask as mask_util 26 | 27 | from util.misc import all_gather 28 | 29 | 30 | class CocoEvaluator(object): 31 | def __init__(self, coco_gt, iou_types): 32 | assert isinstance(iou_types, (list, tuple)) 33 | coco_gt = copy.deepcopy(coco_gt) 34 | self.coco_gt = coco_gt 35 | 36 | self.iou_types = iou_types 37 | self.coco_eval = {} 38 | for iou_type in iou_types: 39 | self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) 40 | 41 | self.img_ids = [] 42 | self.eval_imgs = {k: [] for k in iou_types} 43 | 44 | def update(self, predictions): 45 | img_ids = list(np.unique(list(predictions.keys()))) 46 | self.img_ids.extend(img_ids) 47 | 48 | for iou_type in self.iou_types: 49 | results = self.prepare(predictions, iou_type) 50 | 51 | # suppress pycocotools prints 52 | with open(os.devnull, 'w') as devnull: 53 | with contextlib.redirect_stdout(devnull): 54 | coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() 55 | coco_eval = self.coco_eval[iou_type] 56 | 57 | coco_eval.cocoDt = coco_dt 58 | coco_eval.params.imgIds = list(img_ids) 59 | img_ids, eval_imgs = evaluate(coco_eval) 60 | 61 | self.eval_imgs[iou_type].append(eval_imgs) 62 | 63 | def synchronize_between_processes(self): 64 | for iou_type in self.iou_types: 65 | self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) 66 | create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) 67 | 68 | def accumulate(self): 69 | for coco_eval in self.coco_eval.values(): 70 | coco_eval.accumulate() 71 | 72 | def summarize(self): 73 | for iou_type, coco_eval in self.coco_eval.items(): 74 | print("IoU metric: {}".format(iou_type)) 75 | coco_eval.summarize() 76 | 77 | def prepare(self, predictions, iou_type): 78 | if iou_type == "bbox": 79 | return self.prepare_for_coco_detection(predictions) 80 | elif iou_type == "segm": 81 | return self.prepare_for_coco_segmentation(predictions) 82 | elif iou_type == "keypoints": 83 | return self.prepare_for_coco_keypoint(predictions) 84 | else: 85 | raise ValueError("Unknown iou type {}".format(iou_type)) 86 | 87 | def prepare_for_coco_detection(self, predictions): 88 | coco_results = [] 89 | for original_id, prediction in predictions.items(): 90 | if len(prediction) == 0: 91 | continue 92 | 93 | boxes = 
prediction["boxes"] 94 | boxes = convert_to_xywh(boxes).tolist() 95 | scores = prediction["scores"].tolist() 96 | labels = prediction["labels"].tolist() 97 | 98 | coco_results.extend( 99 | [ 100 | { 101 | "image_id": original_id, 102 | "category_id": labels[k], 103 | "bbox": box, 104 | "score": scores[k], 105 | } 106 | for k, box in enumerate(boxes) 107 | ] 108 | ) 109 | return coco_results 110 | 111 | def prepare_for_coco_segmentation(self, predictions): 112 | coco_results = [] 113 | for original_id, prediction in predictions.items(): 114 | if len(prediction) == 0: 115 | continue 116 | 117 | scores = prediction["scores"] 118 | labels = prediction["labels"] 119 | masks = prediction["masks"] 120 | 121 | masks = masks > 0.5 122 | 123 | scores = prediction["scores"].tolist() 124 | labels = prediction["labels"].tolist() 125 | 126 | rles = [ 127 | mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] 128 | for mask in masks 129 | ] 130 | for rle in rles: 131 | rle["counts"] = rle["counts"].decode("utf-8") 132 | 133 | coco_results.extend( 134 | [ 135 | { 136 | "image_id": original_id, 137 | "category_id": labels[k], 138 | "segmentation": rle, 139 | "score": scores[k], 140 | } 141 | for k, rle in enumerate(rles) 142 | ] 143 | ) 144 | return coco_results 145 | 146 | def prepare_for_coco_keypoint(self, predictions): 147 | coco_results = [] 148 | for original_id, prediction in predictions.items(): 149 | if len(prediction) == 0: 150 | continue 151 | 152 | boxes = prediction["boxes"] 153 | boxes = convert_to_xywh(boxes).tolist() 154 | scores = prediction["scores"].tolist() 155 | labels = prediction["labels"].tolist() 156 | keypoints = prediction["keypoints"] 157 | keypoints = keypoints.flatten(start_dim=1).tolist() 158 | 159 | coco_results.extend( 160 | [ 161 | { 162 | "image_id": original_id, 163 | "category_id": labels[k], 164 | 'keypoints': keypoint, 165 | "score": scores[k], 166 | } 167 | for k, keypoint in enumerate(keypoints) 168 | ] 169 | ) 170 | return coco_results 171 | 172 | 173 | def convert_to_xywh(boxes): 174 | xmin, ymin, xmax, ymax = boxes.unbind(1) 175 | return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) 176 | 177 | 178 | def merge(img_ids, eval_imgs): 179 | all_img_ids = all_gather(img_ids) 180 | all_eval_imgs = all_gather(eval_imgs) 181 | 182 | merged_img_ids = [] 183 | for p in all_img_ids: 184 | merged_img_ids.extend(p) 185 | 186 | merged_eval_imgs = [] 187 | for p in all_eval_imgs: 188 | merged_eval_imgs.append(p) 189 | 190 | merged_img_ids = np.array(merged_img_ids) 191 | merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) 192 | 193 | # keep only unique (and in sorted order) images 194 | merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) 195 | merged_eval_imgs = merged_eval_imgs[..., idx] 196 | 197 | return merged_img_ids, merged_eval_imgs 198 | 199 | 200 | def create_common_coco_eval(coco_eval, img_ids, eval_imgs): 201 | img_ids, eval_imgs = merge(img_ids, eval_imgs) 202 | img_ids = list(img_ids) 203 | eval_imgs = list(eval_imgs.flatten()) 204 | 205 | coco_eval.evalImgs = eval_imgs 206 | coco_eval.params.imgIds = img_ids 207 | coco_eval._paramsEval = copy.deepcopy(coco_eval.params) 208 | 209 | 210 | ################################################################# 211 | # From pycocotools, just removed the prints and fixed 212 | # a Python3 bug about unicode not defined 213 | ################################################################# 214 | 215 | 216 | def evaluate(self): 217 | ''' 218 | Run per image 
evaluation on given images and store results (a list of dict) in self.evalImgs 219 | :return: None 220 | ''' 221 | # tic = time.time() 222 | # print('Running per image evaluation...') 223 | p = self.params 224 | # add backward compatibility if useSegm is specified in params 225 | if p.useSegm is not None: 226 | p.iouType = 'segm' if p.useSegm == 1 else 'bbox' 227 | print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)) 228 | # print('Evaluate annotation type *{}*'.format(p.iouType)) 229 | p.imgIds = list(np.unique(p.imgIds)) 230 | if p.useCats: 231 | p.catIds = list(np.unique(p.catIds)) 232 | p.maxDets = sorted(p.maxDets) 233 | self.params = p 234 | 235 | self._prepare() 236 | # loop through images, area range, max detection number 237 | catIds = p.catIds if p.useCats else [-1] 238 | 239 | if p.iouType == 'segm' or p.iouType == 'bbox': 240 | computeIoU = self.computeIoU 241 | elif p.iouType == 'keypoints': 242 | computeIoU = self.computeOks 243 | self.ious = { 244 | (imgId, catId): computeIoU(imgId, catId) 245 | for imgId in p.imgIds 246 | for catId in catIds} 247 | 248 | evaluateImg = self.evaluateImg 249 | maxDet = p.maxDets[-1] 250 | evalImgs = [ 251 | evaluateImg(imgId, catId, areaRng, maxDet) 252 | for catId in catIds 253 | for areaRng in p.areaRng 254 | for imgId in p.imgIds 255 | ] 256 | # this is NOT in the pycocotools code, but could be done outside 257 | evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) 258 | self._paramsEval = copy.deepcopy(self.params) 259 | # toc = time.time() 260 | # print('DONE (t={:0.2f}s).'.format(toc-tic)) 261 | return p.imgIds, evalImgs 262 | 263 | ################################################################# 264 | # end of straight copy from pycocotools, just removing the prints 265 | ################################################################# 266 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![KakaoBrain](https://img.shields.io/badge/kakao-brain-ffcd00.svg)](http://kakaobrain.com/) 2 | [![pytorch](https://img.shields.io/badge/pytorch-1.6.0-%2523ee4c2c.svg)](https://pytorch.org/) 3 | [![pytorch](https://img.shields.io/badge/pytorch-1.7.1-%2523ee4c2c.svg)](https://pytorch.org/) 4 | 5 | Sparse DETR (ICLR'22) 6 | ======== 7 | 8 | By [Byungseok Roh](https://scholar.google.com/citations?user=H4VWYHwAAAAJ)\*, [Jaewoong Shin](https://scholar.google.com/citations?user=i_o_95kAAAAJ)\*, [Wuhyun Shin](https://scholar.google.com/citations?user=bGwfkakAAAAJ)\*, and [Saehoon Kim](https://scholar.google.com/citations?user=_ZfueMIAAAAJ) at [Kakao Brain](https://www.kakaobrain.com). 9 | (*: Equal contribution) 10 | 11 | * This repository is an official implementation of the paper [Sparse DETR: Efficient End-to-End Object Detection with Learnable Sparsity](https://arxiv.org/abs/2111.14330). 12 | * The code and some instructions are built upon the official [Deformable DETR repository](https://github.com/fundamentalvision/Deformable-DETR). 13 | 14 | 15 | 16 | # Introduction 17 | 18 | **TL; DR.** Sparse DETR is an efficient end-to-end object detector that **sparsifies encoder tokens** by using the learnable DAM(Decoder Attention Map) predictor. It achieves better performance than Deformable DETR even with only 10% encoder queries on the COCO dataset. 19 | 20 |
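To make the idea concrete, the sketch below illustrates top-ρ token selection with a learned scoring network. It is only an illustration, not the code in this repository: `select_encoder_tokens` and `scoring_net` are made-up names, and the real model scores multi-scale flattened features with a predictor that is trained against the decoder attention map (DAM).

```python
import torch

def select_encoder_tokens(tokens, scoring_net, rho=0.1):
    """Keep only the top `rho` fraction of encoder tokens, ranked by a saliency score."""
    # tokens: (batch, num_tokens, dim)
    scores = scoring_net(tokens).squeeze(-1)            # (batch, num_tokens)
    k = max(1, int(rho * tokens.shape[1]))
    topk_idx = scores.topk(k, dim=1).indices            # (batch, k)
    selected = torch.gather(
        tokens, 1, topk_idx.unsqueeze(-1).expand(-1, -1, tokens.shape[-1]))
    return selected, topk_idx                           # (batch, k, dim), (batch, k)

# toy usage: keep 10% of 1000 tokens
tokens = torch.randn(2, 1000, 256)
scoring_net = torch.nn.Linear(256, 1)
selected, idx = select_encoder_tokens(tokens, scoring_net, rho=0.1)
print(selected.shape)  # torch.Size([2, 100, 256])
```

Only the selected tokens are refined by the encoder layers, while the remaining tokens are passed through unchanged; the keeping ratio corresponds to the `--rho` argument used in the training commands below.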

21 | 22 |

23 | 24 | **Abstract.** DETR is the first end-to-end object detector using a transformer encoder-decoder architecture and demonstrates competitive performance but low computational efficiency on high-resolution feature maps. 25 | The subsequent work, Deformable DETR, enhances the efficiency of DETR by replacing dense attention with deformable attention, which achieves 10x faster convergence and improved performance. 26 | Deformable DETR uses multi-scale features to improve performance; however, the number of encoder tokens increases by 20x compared to DETR, and the computation cost of the encoder attention remains a bottleneck. 27 | In our preliminary experiment, we observe that the detection performance hardly deteriorates even if only a part of the encoder tokens is updated. 28 | Inspired by this observation, we propose *Sparse DETR*, which selectively updates only the tokens expected to be referenced by the decoder, thus helping the model effectively detect objects. 29 | In addition, we show that applying an auxiliary detection loss on the selected tokens in the encoder improves the performance while minimizing computational overhead. 30 | We validate that *Sparse DETR* achieves better performance than Deformable DETR even with only 10% encoder tokens on the COCO dataset. 31 | Although only the encoder tokens are sparsified, the total computation cost decreases by 38% and the frames per second (FPS) increases by 42% compared to Deformable DETR. 32 | 33 | 34 | # Installation 35 | 36 | ## Requirements 37 | 38 | We have tested the code on the following environments: 39 | * Python 3.7.7 / PyTorch 1.6.0 / torchvision 0.7.0 / CUDA 10.1 / Ubuntu 18.04 40 | * Python 3.8.3 / PyTorch 1.7.1 / torchvision 0.8.2 / CUDA 11.1 / Ubuntu 18.04 41 | 42 | Run the following command to install dependencies: 43 | ```bash 44 | pip install -r requirements.txt 45 | ``` 46 | 47 | ## Compiling CUDA operators 48 | ```bash 49 | cd ./models/ops 50 | sh ./make.sh 51 | # unit test (you should see that all checks are True) 52 | python test.py 53 | ``` 54 | 55 | # Usage 56 | 57 | ## Dataset preparation 58 | 59 | Please download the [COCO 2017 dataset](https://cocodataset.org/) and organize it as follows: 60 | 61 | ``` 62 | code_root/ 63 | └── data/ 64 | └── coco/ 65 | ├── train2017/ 66 | ├── val2017/ 67 | └── annotations/ 68 | ├── instances_train2017.json 69 | └── instances_val2017.json 70 | ``` 71 | 72 | ## Training 73 | 74 | ### Training on a single node 75 | 76 | For example, the command for training Sparse DETR with a keeping ratio of 10% on 8 GPUs is as follows: 77 | 78 | ```bash 79 | $ GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 8 ./configs/swint_sparse_detr_rho_0.1.sh 80 | ``` 81 | 82 | ### Training on multiple nodes 83 | 84 | For example, the command for training Sparse DETR with a keeping ratio of 10% on 2 nodes, each with 8 GPUs, is as follows: 85 | 86 | On node 1: 87 | 88 | ```bash 89 | $ MASTER_ADDR= NODE_RANK=0 GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 16 ./configs/swint_sparse_detr_rho_0.1.sh 90 | ``` 91 | 92 | On node 2: 93 | 94 | ```bash 95 | $ MASTER_ADDR= NODE_RANK=1 GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 16 ./configs/swint_sparse_detr_rho_0.1.sh 96 | ``` 97 | 98 | ### Direct argument control 99 | 100 | ```bash 101 | # Deformable DETR (with the bounding-box-refinement and two-stage arguments, if desired) 102 | $ GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 8 python main.py --with_box_refine --two_stage 103 | # Efficient DETR (with the class-specific head as described in their paper) 104 | $ GPUS_PER_NODE=8
./tools/run_dist_launch.sh 8 python main.py --with_box_refine --two_stage --eff_query_init --eff_specific_head 105 | # Sparse DETR (with a keeping ratio of 10% and encoder auxiliary loss) 106 | $ GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 8 python main.py --with_box_refine --two_stage --eff_query_init --eff_specific_head --rho 0.1 --use_enc_aux_loss 107 | ``` 108 | 109 | ### Some tips to speed up training 110 | * If your file system is slow to read images, you may consider enabling the '--cache_mode' option to load the whole dataset into memory at the beginning of training. 111 | * You may increase the batch size to maximize GPU utilization, depending on your GPU memory, e.g., set '--batch_size 3' or '--batch_size 4'. 112 | 113 | ## Evaluation 114 | 115 | You can download a pre-trained Sparse DETR model (the links are in the "Main Results" section below), and then run the following command to evaluate it on the COCO 2017 validation set: 116 | 117 | ```bash 118 | # Note that you should run the command with the corresponding configuration. 119 | $ ./configs/swint_sparse_detr_rho_0.1.sh --resume --eval 120 | ``` 121 | 122 | You can also run distributed evaluation by using ```./tools/run_dist_launch.sh```. 123 | 124 | # Main Results 125 | The tables below demonstrate the detection performance of Sparse DETR on the COCO 2017 validation set when using different backbones. 126 | * **Top-k** : sampling the top-k object queries instead of using the learned object queries (as in Efficient DETR). 127 | * **BBR** : performing bounding box refinement in the decoder block (as in Deformable DETR). 128 | * The **encoder auxiliary loss** proposed in our paper is only applied to Sparse DETR. 129 | * **FLOPs** and **FPS** are measured in the same way as used in Deformable DETR. 130 | * Refer to **Table 1** in the paper for more details.
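The download links in the tables below point to standalone checkpoint files. If you want to sanity-check one before passing it to `--resume`, a quick inspection along these lines works (the key layout mentioned in the comments is an assumption; adjust to whatever `torch.load` actually reports):

```python
import torch

# e.g. the ResNet-50 checkpoint with a 10% keeping ratio from the table below
ckpt = torch.load("sparse_detr_r50_10.pth", map_location="cpu")
print(sorted(ckpt.keys()))             # DETR-style checkpoints usually contain 'model', 'optimizer', 'epoch', ...
state_dict = ckpt.get("model", ckpt)   # fall back to a bare state dict if there is no 'model' key
print(len(state_dict), "tensors in the model state dict")
```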
131 | 132 | 133 | 134 | ## ResNet-50 backbone 135 | | Method | Epochs | ρ | Top-k & BBR | AP | #Params(M) | GFLOPs | B4FPS | Download | 136 | |:------------------:|:------:|:---:|:-----------:|:----:|:----------:|:------:|:-----:|:--------:| 137 | | Faster R-CNN + FPN | 109 | N/A | | 42.0 | 42M | 180G | 26 | | 138 | | DETR | 50 | N/A | | 35.0 | 41M | 86G | 28 | | 139 | | DETR | 500 | N/A | | 42.0 | 41M | 86G | 28 | | 140 | | DETR-DC5 | 500 | N/A | | 43.3 | 41M | 187G | 12 | | 141 | | PnP-DETR | 500 | 33% | | 41.1 | | | | | 142 | | | 500 | 50% | | 41.8 | | | | | 143 | | PnP-DETR-DC5 | 500 | 33% | | 42.7 | | | | | 144 | | | 500 | 50% | | 43.1 | | | | | 145 | | Deformable-DETR | 50 | N/A | | 43.9 | 39.8M | 172.9G | 19.1 | | 146 | | | 50 | N/A | o | 46.0 | 40.8M | 177.3G | 18.2 | | 147 | | Sparse-DETR | 50 | 10% | o | 45.3 | 40.9M | 105.4G | 26.5 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_10.pth) | 148 | | | 50 | 20% | o | 45.6 | 40.9M | 112.9G | 24.8 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_20.pth) | 149 | | | 50 | 30% | o | 46.0 | 40.9M | 120.5G | 23.2 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_30.pth) | 150 | | | 50 | 40% | o | 46.2 | 40.9M | 128.0G | 21.8 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_40.pth) | 151 | | | 50 | 50% | o | 46.3 | 40.9M | 135.6G | 20.5 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_50.pth) | 152 | 153 | 154 | 155 | ## Swin-T backbone 156 | | Method | Epochs | ρ | Top-k & BBR | AP | #Params(M) | GFLOPs | B4FPS | Download | 157 | |:---------------:|:------:|:---:|:-----------:|:----:|:----------:|:------:|:-----:|:--------:| 158 | | DETR | 50 | N/A | | 35.9 | 45.0M | 91.6G | 26.8 | | 159 | | DETR | 500 | N/A | | 45.4 | 45.0M | 91.6G | 26.8 | | 160 | | Deformable-DETR | 50 | N/A | | 45.7 | 40.3M | 180.4G | 15.9 | | 161 | | | 50 | N/A | o | 48.0 | 41.3M | 184.8G | 15.4 | | 162 | | Sparse-DETR | 50 | 10% | o | 48.2 | 41.4M | 113.4G | 21.2 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_swint_10.pth) | 163 | | | 50 | 20% | o | 48.8 | 41.4M | 121.0G | 20 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_swint_20.pth) | 164 | | | 50 | 30% | o | 49.1 | 41.4M | 128.5G | 18.9 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_swint_30.pth) | 165 | | | 50 | 40% | o | 49.2 | 41.4M | 136.1G | 18 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_swint_40.pth) | 166 | | | 50 | 50% | o | 49.3 | 41.4M | 143.7G | 17.2 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_swint_50.pth) | 167 | 168 | 169 | ## Initializing ResNet-50 backbone with SCRL 170 | The performance of Sparse DETR can be further improved when the backbone network is initialized with the `SCRL`([Spatially Consistent Representation Learning](https://arxiv.org/abs/2103.06122)) that aims to learn dense representations in a self-supervised way, compared to the default initialization with the ImageNet pre-trained one, denoted as `IN-sup` in the table below. 171 | * We obtained pre-trained weights from [Torchvision](https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html#sphx-glr-beginner-finetuning-torchvision-models-tutorial-py) for `IN-sup`, and the [SCRL GitHub repository](https://github.com/kakaobrain/scrl) for `SCRL`. 172 | * To reproduce the `SCRL` results, add `--scrl_pretrained_path ` to the training command. 
173 | 174 | | Method | ρ | AP(IN-sup) | AP(SCRL) | AP(gain) | Download | 175 | |:-----------:|:---:|:-----------:|:--------:|:--------:|:--------:| 176 | | Sparse DETR | 10% | 45.3 | 46.9 | +1.6 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_scrl_10.pth) | 177 | | | 20% | 45.6 | 47.2 | +1.7 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_scrl_20.pth) | 178 | | | 30% | 46.0 | 47.4 | +1.4 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_scrl_30.pth) | 179 | | | 40% | 46.2 | 47.7 | +1.5 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_scrl_40.pth) | 180 | | | 50% | 46.3 | 47.9 | +1.6 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_scrl_50.pth) | 181 | 182 | 183 | # Citation 184 | If you find Sparse DETR useful in your research, please consider citing: 185 | ```bibtex 186 | @inproceedings{roh2022sparse, 187 | title={Sparse DETR: Efficient End-to-End Object Detection with Learnable Sparsity}, 188 | author={Roh, Byungseok and Shin, JaeWoong and Shin, Wuhyun and Kim, Saehoon}, 189 | booktitle={ICLR}, 190 | year={2022} 191 | } 192 | ``` 193 | 194 | # License 195 | 196 | This project is released under the [Apache 2.0 license](./LICENSE). 197 | Copyright 2021 [Kakao Brain Corp](https://www.kakaobrain.com). All Rights Reserved. 198 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 KAKAO BRAIN Corp. All Rights Reserved. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | 204 | 205 | Deformable DETR 206 | 207 | Copyright 2020 SenseTime 208 | 209 | Licensed under the Apache License, Version 2.0 (the "License"); 210 | you may not use this file except in compliance with the License. 211 | You may obtain a copy of the License at 212 | 213 | http://www.apache.org/licenses/LICENSE-2.0 214 | 215 | Unless required by applicable law or agreed to in writing, software 216 | distributed under the License is distributed on an "AS IS" BASIS, 217 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
218 | See the License for the specific language governing permissions and 219 | limitations under the License. 220 | 221 | 222 | 223 | DETR 224 | 225 | Copyright 2020 - present, Facebook, Inc 226 | 227 | Licensed under the Apache License, Version 2.0 (the "License"); 228 | you may not use this file except in compliance with the License. 229 | You may obtain a copy of the License at 230 | 231 | http://www.apache.org/licenses/LICENSE-2.0 232 | 233 | Unless required by applicable law or agreed to in writing, software 234 | distributed under the License is distributed on an "AS IS" BASIS, 235 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 236 | See the License for the specific language governing permissions and 237 | limitations under the License. 238 | -------------------------------------------------------------------------------- /models/segmentation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------ 9 | # Modified from DETR (https://github.com/facebookresearch/detr) 10 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 11 | # ------------------------------------------------------------------------------------ 12 | 13 | 14 | """ 15 | This file provides the definition of the convolutional heads used to predict masks, as well as the losses 16 | """ 17 | import io 18 | from collections import defaultdict 19 | 20 | import torch 21 | import torch.nn as nn 22 | import torch.nn.functional as F 23 | from PIL import Image 24 | 25 | import util.box_ops as box_ops 26 | from util.misc import NestedTensor, interpolate, nested_tensor_from_tensor_list 27 | 28 | try: 29 | from panopticapi.utils import id2rgb, rgb2id 30 | except ImportError: 31 | pass 32 | 33 | 34 | class DETRsegm(nn.Module): 35 | def __init__(self, detr, freeze_detr=False): 36 | super().__init__() 37 | self.detr = detr 38 | 39 | if freeze_detr: 40 | for p in self.parameters(): 41 | p.requires_grad_(False) 42 | 43 | hidden_dim, nheads = detr.transformer.d_model, detr.transformer.nhead 44 | self.bbox_attention = MHAttentionMap(hidden_dim, hidden_dim, nheads, dropout=0) 45 | self.mask_head = MaskHeadSmallConv(hidden_dim + nheads, [1024, 512, 256], hidden_dim) 46 | 47 | def forward(self, samples: NestedTensor): 48 | if not isinstance(samples, NestedTensor): 49 | samples = nested_tensor_from_tensor_list(samples) 50 | features, pos = self.detr.backbone(samples) 51 | 52 | bs = features[-1].tensors.shape[0] 53 | 54 | src, mask = features[-1].decompose() 55 | src_proj = self.detr.input_proj(src) 56 | hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1]) 57 | 58 | outputs_class = self.detr.class_embed(hs) 59 | outputs_coord = self.detr.bbox_embed(hs).sigmoid() 60 | out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]} 61 | if self.detr.aux_loss: 62 | out["aux_outputs"] = [ 63 | {"pred_logits": a, "pred_boxes": b} for a, b in 
zip(outputs_class[:-1], outputs_coord[:-1]) 64 | ] 65 | 66 | # FIXME h_boxes takes the last one computed, keep this in mind 67 | bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask) 68 | 69 | seg_masks = self.mask_head(src_proj, bbox_mask, [features[2].tensors, features[1].tensors, features[0].tensors]) 70 | outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) 71 | 72 | out["pred_masks"] = outputs_seg_masks 73 | return out 74 | 75 | 76 | class MaskHeadSmallConv(nn.Module): 77 | """ 78 | Simple convolutional head, using group norm. 79 | Upsampling is done using a FPN approach 80 | """ 81 | 82 | def __init__(self, dim, fpn_dims, context_dim): 83 | super().__init__() 84 | 85 | inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] 86 | self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1) 87 | self.gn1 = torch.nn.GroupNorm(8, dim) 88 | self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1) 89 | self.gn2 = torch.nn.GroupNorm(8, inter_dims[1]) 90 | self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) 91 | self.gn3 = torch.nn.GroupNorm(8, inter_dims[2]) 92 | self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) 93 | self.gn4 = torch.nn.GroupNorm(8, inter_dims[3]) 94 | self.lay5 = torch.nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) 95 | self.gn5 = torch.nn.GroupNorm(8, inter_dims[4]) 96 | self.out_lay = torch.nn.Conv2d(inter_dims[4], 1, 3, padding=1) 97 | 98 | self.dim = dim 99 | 100 | self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1) 101 | self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1) 102 | self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1) 103 | 104 | for m in self.modules(): 105 | if isinstance(m, nn.Conv2d): 106 | nn.init.kaiming_uniform_(m.weight, a=1) 107 | nn.init.constant_(m.bias, 0) 108 | 109 | def forward(self, x, bbox_mask, fpns): 110 | def expand(tensor, length): 111 | return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) 112 | 113 | x = torch.cat([expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) 114 | 115 | x = self.lay1(x) 116 | x = self.gn1(x) 117 | x = F.relu(x) 118 | x = self.lay2(x) 119 | x = self.gn2(x) 120 | x = F.relu(x) 121 | 122 | cur_fpn = self.adapter1(fpns[0]) 123 | if cur_fpn.size(0) != x.size(0): 124 | cur_fpn = expand(cur_fpn, x.size(0) / cur_fpn.size(0)) 125 | x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") 126 | x = self.lay3(x) 127 | x = self.gn3(x) 128 | x = F.relu(x) 129 | 130 | cur_fpn = self.adapter2(fpns[1]) 131 | if cur_fpn.size(0) != x.size(0): 132 | cur_fpn = expand(cur_fpn, x.size(0) / cur_fpn.size(0)) 133 | x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") 134 | x = self.lay4(x) 135 | x = self.gn4(x) 136 | x = F.relu(x) 137 | 138 | cur_fpn = self.adapter3(fpns[2]) 139 | if cur_fpn.size(0) != x.size(0): 140 | cur_fpn = expand(cur_fpn, x.size(0) / cur_fpn.size(0)) 141 | x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") 142 | x = self.lay5(x) 143 | x = self.gn5(x) 144 | x = F.relu(x) 145 | 146 | x = self.out_lay(x) 147 | return x 148 | 149 | 150 | class MHAttentionMap(nn.Module): 151 | """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" 152 | 153 | def __init__(self, query_dim, hidden_dim, num_heads, dropout=0, bias=True): 154 | super().__init__() 155 | self.num_heads = num_heads 156 | self.hidden_dim = 
hidden_dim 157 | self.dropout = nn.Dropout(dropout) 158 | 159 | self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) 160 | self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) 161 | 162 | nn.init.zeros_(self.k_linear.bias) 163 | nn.init.zeros_(self.q_linear.bias) 164 | nn.init.xavier_uniform_(self.k_linear.weight) 165 | nn.init.xavier_uniform_(self.q_linear.weight) 166 | self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 167 | 168 | def forward(self, q, k, mask=None): 169 | q = self.q_linear(q) 170 | k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) 171 | qh = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) 172 | kh = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) 173 | weights = torch.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) 174 | 175 | if mask is not None: 176 | weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) 177 | weights = F.softmax(weights.flatten(2), dim=-1).view_as(weights) 178 | weights = self.dropout(weights) 179 | return weights 180 | 181 | 182 | def dice_loss(inputs, targets, num_boxes): 183 | """ 184 | Compute the DICE loss, similar to generalized IOU for masks 185 | Args: 186 | inputs: A float tensor of arbitrary shape. 187 | The predictions for each example. 188 | targets: A float tensor with the same shape as inputs. Stores the binary 189 | classification label for each element in inputs 190 | (0 for the negative class and 1 for the positive class). 191 | """ 192 | inputs = inputs.sigmoid() 193 | inputs = inputs.flatten(1) 194 | numerator = 2 * (inputs * targets).sum(1) 195 | denominator = inputs.sum(-1) + targets.sum(-1) 196 | loss = 1 - (numerator + 1) / (denominator + 1) 197 | return loss.sum() / num_boxes 198 | 199 | 200 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, idx=None): 201 | """ 202 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 203 | Args: 204 | inputs: A float tensor of arbitrary shape. 205 | The predictions for each example. 206 | targets: A float tensor with the same shape as inputs. Stores the binary 207 | classification label for each element in inputs 208 | (0 for the negative class and 1 for the positive class). 209 | alpha: (optional) Weighting factor in range (0,1) to balance 210 | positive vs negative examples. Default = -1 (no weighting). 211 | gamma: Exponent of the modulating factor (1 - p_t) to 212 | balance easy vs hard examples. 
213 | Returns: 214 | Loss tensor 215 | """ 216 | prob = inputs.sigmoid() 217 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 218 | p_t = prob * targets + (1 - prob) * (1 - targets) 219 | loss = ce_loss * ((1 - p_t) ** gamma) 220 | 221 | if alpha >= 0: 222 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 223 | loss = alpha_t * loss 224 | if idx is not None: 225 | return loss[idx].mean(1).sum() / num_boxes 226 | return loss.mean(1).sum() / num_boxes 227 | 228 | 229 | class PostProcessSegm(nn.Module): 230 | def __init__(self, threshold=0.5): 231 | super().__init__() 232 | self.threshold = threshold 233 | 234 | @torch.no_grad() 235 | def forward(self, results, outputs, orig_target_sizes, max_target_sizes): 236 | assert len(orig_target_sizes) == len(max_target_sizes) 237 | max_h, max_w = max_target_sizes.max(0)[0].tolist() 238 | outputs_masks = outputs["pred_masks"].squeeze(2) 239 | outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False) 240 | outputs_masks = (outputs_masks.sigmoid() > self.threshold).cpu() 241 | 242 | for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): 243 | img_h, img_w = t[0], t[1] 244 | results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) 245 | results[i]["masks"] = F.interpolate( 246 | results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" 247 | ).byte() 248 | 249 | return results 250 | 251 | 252 | class PostProcessPanoptic(nn.Module): 253 | """This class converts the output of the model to the final panoptic result, in the format expected by the 254 | coco panoptic API """ 255 | 256 | def __init__(self, is_thing_map, threshold=0.85): 257 | """ 258 | Parameters: 259 | is_thing_map: This is a whose keys are the class ids, and the values a boolean indicating whether 260 | the class is a thing (True) or a stuff (False) class 261 | threshold: confidence threshold: segments with confidence lower than this will be deleted 262 | """ 263 | super().__init__() 264 | self.threshold = threshold 265 | self.is_thing_map = is_thing_map 266 | 267 | def forward(self, outputs, processed_sizes, target_sizes=None): 268 | """ This function computes the panoptic prediction from the model's predictions. 269 | Parameters: 270 | outputs: This is a dict coming directly from the model. See the model doc for the content. 271 | processed_sizes: This is a list of tuples (or torch tensors) of sizes of the images that were passed to the 272 | model, ie the size after data augmentation but before batching. 273 | target_sizes: This is a list of tuples (or torch tensors) corresponding to the requested final size 274 | of each prediction. 
If left to None, it will default to the processed_sizes 275 | """ 276 | if target_sizes is None: 277 | target_sizes = processed_sizes 278 | assert len(processed_sizes) == len(target_sizes) 279 | out_logits, raw_masks, raw_boxes = outputs["pred_logits"], outputs["pred_masks"], outputs["pred_boxes"] 280 | assert len(out_logits) == len(raw_masks) == len(target_sizes) 281 | preds = [] 282 | 283 | def to_tuple(tup): 284 | if isinstance(tup, tuple): 285 | return tup 286 | return tuple(tup.cpu().tolist()) 287 | 288 | for cur_logits, cur_masks, cur_boxes, size, target_size in zip( 289 | out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes 290 | ): 291 | # we filter empty queries and detection below threshold 292 | scores, labels = cur_logits.softmax(-1).max(-1) 293 | keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (scores > self.threshold) 294 | cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) 295 | cur_scores = cur_scores[keep] 296 | cur_classes = cur_classes[keep] 297 | cur_masks = cur_masks[keep] 298 | cur_masks = interpolate(cur_masks[None], to_tuple(size), mode="bilinear").squeeze(0) 299 | cur_boxes = box_ops.box_cxcywh_to_xyxy(cur_boxes[keep]) 300 | 301 | h, w = cur_masks.shape[-2:] 302 | assert len(cur_boxes) == len(cur_classes) 303 | 304 | # It may be that we have several predicted masks for the same stuff class. 305 | # In the following, we track the list of masks ids for each stuff class (they are merged later on) 306 | cur_masks = cur_masks.flatten(1) 307 | stuff_equiv_classes = defaultdict(lambda: []) 308 | for k, label in enumerate(cur_classes): 309 | if not self.is_thing_map[label.item()]: 310 | stuff_equiv_classes[label.item()].append(k) 311 | 312 | def get_ids_area(masks, scores, dedup=False): 313 | # This helper function creates the final panoptic segmentation image 314 | # It also returns the area of the masks that appears on the image 315 | 316 | m_id = masks.transpose(0, 1).softmax(-1) 317 | 318 | if m_id.shape[-1] == 0: 319 | # We didn't detect any mask :( 320 | m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) 321 | else: 322 | m_id = m_id.argmax(-1).view(h, w) 323 | 324 | if dedup: 325 | # Merge the masks corresponding to the same stuff class 326 | for equiv in stuff_equiv_classes.values(): 327 | if len(equiv) > 1: 328 | for eq_id in equiv: 329 | m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) 330 | 331 | final_h, final_w = to_tuple(target_size) 332 | 333 | seg_img = Image.fromarray(id2rgb(m_id.view(h, w).cpu().numpy())) 334 | seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) 335 | 336 | np_seg_img = ( 337 | torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())).view(final_h, final_w, 3).numpy() 338 | ) 339 | m_id = torch.from_numpy(rgb2id(np_seg_img)) 340 | 341 | area = [] 342 | for i in range(len(scores)): 343 | area.append(m_id.eq(i).sum().item()) 344 | return area, seg_img 345 | 346 | area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) 347 | if cur_classes.numel() > 0: 348 | # We know filter empty masks as long as we find some 349 | while True: 350 | filtered_small = torch.as_tensor( 351 | [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device 352 | ) 353 | if filtered_small.any().item(): 354 | cur_scores = cur_scores[~filtered_small] 355 | cur_classes = cur_classes[~filtered_small] 356 | cur_masks = cur_masks[~filtered_small] 357 | area, seg_img = get_ids_area(cur_masks, cur_scores) 358 | else: 359 | break 360 | 361 | else: 362 | cur_classes = 
torch.ones(1, dtype=torch.long, device=cur_classes.device) 363 | 364 | segments_info = [] 365 | for i, a in enumerate(area): 366 | cat = cur_classes[i].item() 367 | segments_info.append({"id": i, "isthing": self.is_thing_map[cat], "category_id": cat, "area": a}) 368 | del cur_classes 369 | 370 | with io.BytesIO() as out: 371 | seg_img.save(out, format="PNG") 372 | predictions = {"png_string": out.getvalue(), "segments_info": segments_info} 373 | preds.append(predictions) 374 | return preds 375 | --------------------------------------------------------------------------------
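
As a quick sanity check of the two mask losses defined in `models/segmentation.py` above, the sketch below (not part of the repository) feeds them dummy, already-flattened mask logits and binary targets of shape `[num_boxes, H*W]`; the import path assumes the repo root is on `PYTHONPATH` and its dependencies are installed.

```python
# Minimal sketch under the assumptions stated above; tensor sizes are arbitrary.
import torch

from models.segmentation import dice_loss, sigmoid_focal_loss

num_boxes, hw = 4, 64 * 64                               # 4 matched queries, 64x64 masks
logits = torch.randn(num_boxes, hw)                      # raw (pre-sigmoid) mask predictions
targets = torch.randint(0, 2, (num_boxes, hw)).float()   # binary ground-truth masks

print(dice_loss(logits, targets, num_boxes))             # scalar tensor
print(sigmoid_focal_loss(logits, targets, num_boxes))    # scalar tensor
```

Both calls return scalar tensors: each loss is computed per query, then summed and normalized by `num_boxes`, matching the reductions visible in the code above.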