├── requirements.txt ├── figs ├── architecture.png └── dam_creation.png ├── configs ├── r50_deformable_detr.sh ├── swint_deformable_detr.sh ├── r50_efficient_detr.sh ├── swint_efficient_detr.sh ├── r50_sparse_detr_rho_0.1.sh ├── r50_sparse_detr_rho_0.2.sh ├── r50_sparse_detr_rho_0.3.sh ├── swint_sparse_detr_rho_0.1.sh ├── swint_sparse_detr_rho_0.2.sh └── swint_sparse_detr_rho_0.3.sh ├── models ├── swin_transformer │ ├── configs │ │ ├── swin_large_patch4_window7_224.yaml │ │ ├── swin_tiny_patch4_window7_224.yaml │ │ ├── swin_base_patch4_window7_224.yaml │ │ ├── swin_small_patch4_window7_224.yaml │ │ └── default.yaml │ ├── __init__.py │ ├── build.py │ └── config.py ├── ops │ ├── make.sh │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ ├── modules │ │ ├── __init__.py │ │ └── ms_deform_attn.py │ ├── src │ │ ├── vision.cpp │ │ ├── cuda │ │ │ ├── ms_deform_attn_cuda.h │ │ │ └── ms_deform_attn_cuda.cu │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.h │ │ │ └── ms_deform_attn_cpu.cpp │ │ └── ms_deform_attn.h │ ├── setup.py │ └── test.py ├── __init__.py ├── position_encoding.py ├── matcher.py ├── backbone.py └── segmentation.py ├── datasets ├── torchvision_datasets │ ├── __init__.py │ └── coco.py ├── __init__.py ├── panoptic_eval.py ├── data_prefetcher.py ├── coco_panoptic.py ├── samplers.py ├── coco.py ├── transforms.py └── coco_eval.py ├── util ├── __init__.py ├── box_ops.py ├── dam.py ├── plot_utils.py └── benchmark.py ├── tools ├── run_dist_launch.sh └── launch.py ├── NOTICE ├── engine.py ├── README.md └── LICENSE /requirements.txt: -------------------------------------------------------------------------------- 1 | pycocotools 2 | tqdm 3 | scipy 4 | timm 5 | fvcore 6 | tensorboard 7 | -------------------------------------------------------------------------------- /figs/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakaobrain/sparse-detr/HEAD/figs/architecture.png -------------------------------------------------------------------------------- /figs/dam_creation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakaobrain/sparse-detr/HEAD/figs/dam_creation.png -------------------------------------------------------------------------------- /configs/r50_deformable_detr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/r50_deformable_detr 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | ${PY_ARGS} 11 | -------------------------------------------------------------------------------- /configs/swint_deformable_detr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/swint_deformable_detr 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --backbone swin-t \ 11 | ${PY_ARGS} 12 | -------------------------------------------------------------------------------- /models/swin_transformer/configs/swin_large_patch4_window7_224.yaml: -------------------------------------------------------------------------------- 1 | BASE: ['default.yaml'] 2 | MODEL: 3 | TYPE: swin 4 | NAME: swin_large_patch4_window7_224 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [ 2, 2, 18, 2 ] 8 | NUM_HEADS: [ 6, 12, 24, 48 ] 9 | WINDOW_SIZE: 7 10 | 
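Note: each Swin config above and below inherits shared settings from default.yaml through its BASE key. The following is a minimal sketch of how such a config can be resolved, assuming PyYAML is installed; it mirrors the load_config_yaml/_update_dict helpers that appear later in models/swin_transformer/build.py, and the resolve_config/_merge names here are illustrative only.

# Minimal sketch (assumes PyYAML); mirrors load_config_yaml/_update_dict in models/swin_transformer/build.py.
from collections import OrderedDict, abc
import os
import yaml

def _merge(dst, src):
    # Recursively overlay src onto dst so that child configs override BASE defaults.
    for k, v in src.items():
        dst[k] = _merge(dst.get(k, {}), v) if isinstance(v, abc.Mapping) else v
    return dst

def resolve_config(cfg_file, cfg=None):
    cfg = OrderedDict() if cfg is None else cfg
    with open(cfg_file) as f:
        src = yaml.load(f, Loader=yaml.FullLoader)
    # Resolve parent configs listed under BASE first, relative to this file's directory.
    for base in src.setdefault('BASE', ['']):
        if base:
            resolve_config(os.path.join(os.path.dirname(cfg_file), base), cfg)
    return _merge(cfg, src)

# Illustrative usage:
# cfg = resolve_config('models/swin_transformer/configs/swin_large_patch4_window7_224.yaml')
# cfg['MODEL']['SWIN']['EMBED_DIM']  # -> 192; keys not set here fall back to default.yaml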
-------------------------------------------------------------------------------- /models/swin_transformer/configs/swin_tiny_patch4_window7_224.yaml: -------------------------------------------------------------------------------- 1 | BASE: ['default.yaml'] 2 | MODEL: 3 | TYPE: swin 4 | NAME: swin_tiny_patch4_window7_224 5 | DROP_PATH_RATE: 0.2 6 | SWIN: 7 | EMBED_DIM: 96 8 | DEPTHS: [ 2, 2, 6, 2 ] 9 | NUM_HEADS: [ 3, 6, 12, 24 ] 10 | WINDOW_SIZE: 7 11 | -------------------------------------------------------------------------------- /models/swin_transformer/configs/swin_base_patch4_window7_224.yaml: -------------------------------------------------------------------------------- 1 | BASE: ['default.yaml'] 2 | MODEL: 3 | TYPE: swin 4 | NAME: swin_base_patch4_window7_224 5 | DROP_PATH_RATE: 0.5 6 | SWIN: 7 | EMBED_DIM: 128 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 4, 8, 16, 32 ] 10 | WINDOW_SIZE: 7 11 | -------------------------------------------------------------------------------- /models/swin_transformer/configs/swin_small_patch4_window7_224.yaml: -------------------------------------------------------------------------------- 1 | BASE: ['default.yaml'] 2 | MODEL: 3 | TYPE: swin 4 | NAME: swin_small_patch4_window7_224 5 | DROP_PATH_RATE: 0.3 6 | SWIN: 7 | EMBED_DIM: 96 8 | DEPTHS: [ 2, 2, 18, 2 ] 9 | NUM_HEADS: [ 3, 6, 12, 24 ] 10 | WINDOW_SIZE: 7 11 | -------------------------------------------------------------------------------- /configs/r50_efficient_detr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/r50_efficient_detr 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --eff_query_init \ 13 | --eff_specific_head \ 14 | ${PY_ARGS} 15 | -------------------------------------------------------------------------------- /configs/swint_efficient_detr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/swint_efficient_detr 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --backbone swin-t \ 11 | --with_box_refine \ 12 | --two_stage \ 13 | --eff_query_init \ 14 | --eff_specific_head \ 15 | ${PY_ARGS} 16 | -------------------------------------------------------------------------------- /configs/r50_sparse_detr_rho_0.1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/r50_sparse_detr_0.1 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --eff_query_init \ 13 | --eff_specific_head \ 14 | --rho 0.1 \ 15 | --use_enc_aux_loss \ 16 | ${PY_ARGS} 17 | -------------------------------------------------------------------------------- /configs/r50_sparse_detr_rho_0.2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/r50_sparse_detr_0.2 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --eff_query_init \ 13 | --eff_specific_head \ 14 | --rho 0.2 \ 15 | --use_enc_aux_loss \ 16 | ${PY_ARGS} 17 | -------------------------------------------------------------------------------- /configs/r50_sparse_detr_rho_0.3.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/r50_sparse_detr_0.3 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --with_box_refine \ 11 | --two_stage \ 12 | --eff_query_init \ 13 | --eff_specific_head \ 14 | --rho 0.3 \ 15 | --use_enc_aux_loss \ 16 | ${PY_ARGS} 17 | -------------------------------------------------------------------------------- /datasets/torchvision_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | from .coco import CocoDetection 8 | -------------------------------------------------------------------------------- /models/swin_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | 7 | 8 | from .build import build_model 9 | -------------------------------------------------------------------------------- /configs/swint_sparse_detr_rho_0.1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/swint_sparse_detr_0.1 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --backbone swin-t \ 11 | --with_box_refine \ 12 | --two_stage \ 13 | --eff_query_init \ 14 | --eff_specific_head \ 15 | --rho 0.1 \ 16 | --use_enc_aux_loss \ 17 | ${PY_ARGS} 18 | -------------------------------------------------------------------------------- /configs/swint_sparse_detr_rho_0.2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/swint_sparse_detr_0.2 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --backbone swin-t \ 11 | --with_box_refine \ 12 | --two_stage \ 13 | --eff_query_init \ 14 | --eff_specific_head \ 15 | --rho 0.2 \ 16 | --use_enc_aux_loss \ 17 | ${PY_ARGS} 18 | -------------------------------------------------------------------------------- /configs/swint_sparse_detr_rho_0.3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | EXP_DIR=exps/swint_sparse_detr_0.3 6 | PY_ARGS=${@:1} 7 | 8 | python -u main.py \ 9 | --output_dir ${EXP_DIR} \ 10 | --backbone swin-t \ 11 | --with_box_refine \ 12 | --two_stage \ 13 | --eff_query_init \ 14 | --eff_specific_head \ 15 | --rho 0.3 \ 16 | --use_enc_aux_loss \ 17 | ${PY_ARGS} 18 | -------------------------------------------------------------------------------- /models/swin_transformer/configs/default.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_SIZE: 224 3 | TRAIN: 4 | USE_CHECKPOINT: false 5 | MODEL: 6 | SWIN: 7 | APE: false 8 | DEPTHS: [2, 2, 6, 2] 9 | EMBED_DIM: 96 10 | IN_CHANS: 3 11 | 
MLP_RATIO: 4.0 12 | NUM_HEADS: [3, 6, 12, 24] 13 | PATCH_NORM: true 14 | PATCH_SIZE: 4 15 | QKV_BIAS: true 16 | QK_SCALE: null 17 | WINDOW_SIZE: 7 18 | DROP_RATE: 0.0 19 | DROP_PATH_RATE: 0.1 20 | NUM_CLASSES: 1000 21 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | -------------------------------------------------------------------------------- /models/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | python setup.py build install 11 | -------------------------------------------------------------------------------- /models/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction 10 | 11 | -------------------------------------------------------------------------------- /models/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Deformable DETR 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
8 | # ------------------------------------------------------------------------------------------------ 9 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 10 | # ------------------------------------------------------------------------------------------------ 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /models/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------ 9 | # Modified from DETR (https://github.com/facebookresearch/detr) 10 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 11 | # ------------------------------------------------------------------------------------ 12 | 13 | 14 | from .deformable_detr import build 15 | 16 | 17 | def build_model(args): 18 | return build(args) 19 | 20 | -------------------------------------------------------------------------------- /tools/run_dist_launch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | 8 | set -x 9 | 10 | GPUS=$1 11 | RUN_COMMAND=${@:2} 12 | if [ $GPUS -lt 8 ]; then 13 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 14 | else 15 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 16 | fi 17 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 18 | MASTER_PORT=${MASTER_PORT:-"29500"} 19 | NODE_RANK=${NODE_RANK:-0} 20 | 21 | let "NNODES=GPUS/GPUS_PER_NODE" 22 | 23 | python ./tools/launch.py \ 24 | --nnodes ${NNODES} \ 25 | --node_rank ${NODE_RANK} \ 26 | --master_addr ${MASTER_ADDR} \ 27 | --master_port ${MASTER_PORT} \ 28 | --nproc_per_node ${GPUS_PER_NODE} \ 29 | ${RUN_COMMAND} -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include <torch/extension.h> 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector<at::Tensor> 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /models/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | 13 | #include <ATen/ATen.h> 14 | #include <ATen/cuda/CUDAContext.h> 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implemented on the CPU"); 27 | } 28 | 29 | std::vector<at::Tensor> 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates.
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import torch.utils.data 11 | from .torchvision_datasets import CocoDetection 12 | 13 | from .coco import build as build_coco 14 | 15 | 16 | def get_coco_api_from_dataset(dataset): 17 | for _ in range(10): 18 | # if isinstance(dataset, torchvision.datasets.CocoDetection): 19 | # break 20 | if isinstance(dataset, torch.utils.data.Subset): 21 | dataset = dataset.dataset 22 | if isinstance(dataset, CocoDetection): 23 | return dataset.coco 24 | 25 | 26 | def build_dataset(image_set, args): 27 | if args.dataset_file == 'coco': 28 | return build_coco(image_set, args) 29 | if args.dataset_file == 'coco_panoptic': 30 | # to avoid making panopticapi required for coco 31 | from .coco_panoptic import build as build_coco_panoptic 32 | return build_coco_panoptic(image_set, args) 33 | raise ValueError(f'dataset {args.dataset_file} not supported') 34 | -------------------------------------------------------------------------------- /models/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector<at::Tensor> 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /datasets/panoptic_eval.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved.
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import json 11 | import os 12 | 13 | import util.misc as utils 14 | 15 | try: 16 | from panopticapi.evaluation import pq_compute 17 | except ImportError: 18 | pass 19 | 20 | 21 | class PanopticEvaluator(object): 22 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): 23 | self.gt_json = ann_file 24 | self.gt_folder = ann_folder 25 | if utils.is_main_process(): 26 | if not os.path.exists(output_dir): 27 | os.mkdir(output_dir) 28 | self.output_dir = output_dir 29 | self.predictions = [] 30 | 31 | def update(self, predictions): 32 | for p in predictions: 33 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: 34 | f.write(p.pop("png_string")) 35 | 36 | self.predictions += predictions 37 | 38 | def synchronize_between_processes(self): 39 | all_predictions = utils.all_gather(self.predictions) 40 | merged_predictions = [] 41 | for p in all_predictions: 42 | merged_predictions += p 43 | self.predictions = merged_predictions 44 | 45 | def summarize(self): 46 | if utils.is_main_process(): 47 | json_data = {"annotations": self.predictions} 48 | predictions_json = os.path.join(self.output_dir, "predictions.json") 49 | with open(predictions_json, "w") as f: 50 | f.write(json.dumps(json_data)) 51 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) 52 | return None 53 | -------------------------------------------------------------------------------- /models/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('Cuda is not availabel') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /datasets/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | import torch 8 | 9 | def to_cuda(samples, targets, device): 10 | samples = samples.to(device, non_blocking=True) 11 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 12 | return samples, targets 13 | 14 | class data_prefetcher(): 15 | def __init__(self, loader, device, prefetch=True): 16 | self.loader = iter(loader) 17 | self.prefetch = prefetch 18 | self.device = device 19 | if prefetch: 20 | self.stream = torch.cuda.Stream() 21 | self.preload() 22 | 23 | def preload(self): 24 | try: 25 | self.next_samples, self.next_targets = next(self.loader) 26 | except StopIteration: 27 | self.next_samples = None 28 | self.next_targets = None 29 | return 30 | # if record_stream() doesn't work, another option is to make sure device inputs are created 31 | # on the main stream. 32 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 33 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 34 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 35 | # at the time we start copying to next_*: 36 | # self.stream.wait_stream(torch.cuda.current_stream()) 37 | with torch.cuda.stream(self.stream): 38 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 39 | # more code for the alternative if record_stream() doesn't work: 40 | # copy_ will record the use of the pinned source tensor in this side stream. 41 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 42 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 43 | # self.next_input = self.next_input_gpu 44 | # self.next_target = self.next_target_gpu 45 | 46 | # With Amp, it isn't necessary to manually convert data to half. 47 | # if args.fp16: 48 | # self.next_input = self.next_input.half() 49 | # else: 50 | 51 | def next(self): 52 | if self.prefetch: 53 | torch.cuda.current_stream().wait_stream(self.stream) 54 | samples = self.next_samples 55 | targets = self.next_targets 56 | if samples is not None: 57 | samples.record_stream(torch.cuda.current_stream()) 58 | if targets is not None: 59 | for t in targets: 60 | for k, v in t.items(): 61 | v.record_stream(torch.cuda.current_stream()) 62 | self.preload() 63 | else: 64 | try: 65 | samples, targets = next(self.loader) 66 | samples, targets = to_cuda(samples, targets, self.device) 67 | except StopIteration: 68 | samples = None 69 | targets = None 70 | return samples, targets 71 | -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Utilities for bounding box manipulation and GIoU. 
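Boxes are handled as (center_x, center_y, width, height) or (x0, y0, x1, y1) tensors; the helpers below convert between the two formats and compute pairwise IoU and generalized IoU.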
12 | """ 13 | import torch 14 | from torchvision.ops.boxes import box_area 15 | 16 | 17 | def box_cxcywh_to_xyxy(x): 18 | x_c, y_c, w, h = x.unbind(-1) 19 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 20 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 21 | return torch.stack(b, dim=-1) 22 | 23 | 24 | def box_xyxy_to_cxcywh(x): 25 | x0, y0, x1, y1 = x.unbind(-1) 26 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 27 | (x1 - x0), (y1 - y0)] 28 | return torch.stack(b, dim=-1) 29 | 30 | 31 | # modified from torchvision to also return the union 32 | def box_iou(boxes1, boxes2): 33 | area1 = box_area(boxes1) 34 | area2 = box_area(boxes2) 35 | 36 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 37 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 38 | 39 | wh = (rb - lt).clamp(min=0) # [N,M,2] 40 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 41 | 42 | union = area1[:, None] + area2 - inter 43 | 44 | iou = inter / union 45 | return iou, union 46 | 47 | 48 | def generalized_box_iou(boxes1, boxes2): 49 | """ 50 | Generalized IoU from https://giou.stanford.edu/ 51 | 52 | The boxes should be in [x0, y0, x1, y1] format 53 | 54 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 55 | and M = len(boxes2) 56 | """ 57 | # degenerate boxes gives inf / nan results 58 | # so do an early check 59 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 60 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 61 | iou, union = box_iou(boxes1, boxes2) 62 | 63 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 64 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 65 | 66 | wh = (rb - lt).clamp(min=0) # [N,M,2] 67 | area = wh[:, :, 0] * wh[:, :, 1] 68 | 69 | return iou - (area - union) / area 70 | 71 | 72 | def masks_to_boxes(masks): 73 | """Compute the bounding boxes around the provided masks 74 | 75 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 76 | 77 | Returns a [N, 4] tensors, with the boxes in xyxy format 78 | """ 79 | if masks.numel() == 0: 80 | return torch.zeros((0, 4), device=masks.device) 81 | 82 | h, w = masks.shape[-2:] 83 | 84 | y = torch.arange(0, h, dtype=torch.float) 85 | x = torch.arange(0, w, dtype=torch.float) 86 | y, x = torch.meshgrid(y, x) 87 | 88 | x_mask = (masks * x.unsqueeze(0)) 89 | x_max = x_mask.flatten(1).max(-1)[0] 90 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 91 | 92 | y_mask = (masks * y.unsqueeze(0)) 93 | y_max = y_mask.flatten(1).max(-1)[0] 94 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 95 | 96 | return torch.stack([x_min, y_min, x_max, y_max], 1) 97 | -------------------------------------------------------------------------------- /models/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | import MultiScaleDeformableAttention as MSDA 19 | 20 | 21 | class MSDeformAttnFunction(Function): 22 | @staticmethod 23 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 24 | ctx.im2col_step = im2col_step 25 | output = MSDA.ms_deform_attn_forward( 26 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 27 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 28 | return output 29 | 30 | @staticmethod 31 | @once_differentiable 32 | def backward(ctx, grad_output): 33 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 34 | grad_value, grad_sampling_loc, grad_attn_weight = \ 35 | MSDA.ms_deform_attn_backward( 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 37 | 38 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 39 | 40 | 41 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 42 | # for debug and test only, 43 | # need to use cuda version instead 44 | N_, S_, M_, D_ = value.shape 45 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 46 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 47 | sampling_grids = 2 * sampling_locations - 1 48 | sampling_value_list = [] 49 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 50 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 51 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 52 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 53 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 54 | # N_*M_, D_, Lq_, P_ 55 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 56 | mode='bilinear', padding_mode='zeros', align_corners=False) 57 | sampling_value_list.append(sampling_value_l_) 58 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 59 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 60 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 61 | return output.transpose(1, 2).contiguous() 62 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | =============================================================================== 2 | Deformable DETR's Apache License 2.0 3 | =============================================================================== 4 | The overall structure of 
the code is based on the implementation in 5 | Deformable-DETR(https://github.com/fundamentalvision/Deformable-DETR). 6 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 7 | Copyright (c) 2020 SenseTime 8 | 9 | Licensed under the Apache License, Version 2.0 (the "License"); 10 | you may not use this file except in compliance with the License. 11 | You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | See the License for the specific language governing permissions and 19 | limitations under the License. 20 | 21 | =============================================================================== 22 | DETR's Apache License 2.0 23 | =============================================================================== 24 | Deformable DETR code is orginally built on the implementation in DETR 25 | (https://github.com/facebookresearch/detr). 26 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 27 | Copyright (c) 2020 Facebook, Inc 28 | 29 | Licensed under the Apache License, Version 2.0 (the "License"); 30 | you may not use this file except in compliance with the License. 31 | You may obtain a copy of the License at 32 | 33 | http://www.apache.org/licenses/LICENSE-2.0 34 | 35 | Unless required by applicable law or agreed to in writing, software 36 | distributed under the License is distributed on an "AS IS" BASIS, 37 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 38 | See the License for the specific language governing permissions and 39 | limitations under the License. 40 | 41 | 42 | =============================================================================== 43 | Swin Transformer' MIT License 44 | =============================================================================== 45 | The transformer backbone is based on the implementation in Swin Transformer 46 | (https://github.com/microsoft/Swin-Transformer). 47 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 48 | Copyright (c) 2021 Microsoft 49 | 50 | Permission is hereby granted, free of charge, to any person obtaining a copy 51 | of this software and associated documentation files (the "Software"), to deal 52 | in the Software without restriction, including without limitation the rights 53 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 54 | copies of the Software, and to permit persons to whom the Software is 55 | furnished to do so, subject to the following conditions: 56 | 57 | The above copyright notice and this permission notice shall be included in all 58 | copies or substantial portions of the Software. 59 | 60 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 61 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 62 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 63 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 64 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 65 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 66 | SOFTWARE. 
67 | -------------------------------------------------------------------------------- /datasets/torchvision_datasets/coco.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from torchvision 7 | # ------------------------------------------------------------------------ 8 | 9 | """ 10 | Copy-Paste from torchvision, but add utility of caching images on memory 11 | """ 12 | from torchvision.datasets.vision import VisionDataset 13 | from PIL import Image 14 | import os 15 | import os.path 16 | import tqdm 17 | from io import BytesIO 18 | 19 | 20 | class CocoDetection(VisionDataset): 21 | """`MS Coco Detection `_ Dataset. 22 | Args: 23 | root (string): Root directory where images are downloaded to. 24 | annFile (string): Path to json annotation file. 25 | transform (callable, optional): A function/transform that takes in an PIL image 26 | and returns a transformed version. E.g, ``transforms.ToTensor`` 27 | target_transform (callable, optional): A function/transform that takes in the 28 | target and transforms it. 29 | transforms (callable, optional): A function/transform that takes input sample and its target as entry 30 | and returns a transformed version. 31 | """ 32 | 33 | def __init__(self, root, annFile, transform=None, target_transform=None, transforms=None, 34 | cache_mode=False, local_rank=0, local_size=1): 35 | super(CocoDetection, self).__init__(root, transforms, transform, target_transform) 36 | from pycocotools.coco import COCO 37 | self.coco = COCO(annFile) 38 | self.ids = list(sorted(self.coco.imgs.keys())) 39 | self.cache_mode = cache_mode 40 | self.local_rank = local_rank 41 | self.local_size = local_size 42 | if cache_mode: 43 | self.cache = {} 44 | self.cache_images() 45 | 46 | def cache_images(self): 47 | self.cache = {} 48 | for index, img_id in zip(tqdm.trange(len(self.ids)), self.ids): 49 | if index % self.local_size != self.local_rank: 50 | continue 51 | path = self.coco.loadImgs(img_id)[0]['file_name'] 52 | with open(os.path.join(self.root, path), 'rb') as f: 53 | self.cache[path] = f.read() 54 | 55 | def get_image(self, path): 56 | if self.cache_mode: 57 | if path not in self.cache.keys(): 58 | with open(os.path.join(self.root, path), 'rb') as f: 59 | self.cache[path] = f.read() 60 | return Image.open(BytesIO(self.cache[path])).convert('RGB') 61 | return Image.open(os.path.join(self.root, path)).convert('RGB') 62 | 63 | def __getitem__(self, index): 64 | """ 65 | Args: 66 | index (int): Index 67 | Returns: 68 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. 
69 | """ 70 | coco = self.coco 71 | img_id = self.ids[index] 72 | ann_ids = coco.getAnnIds(imgIds=img_id) 73 | target = coco.loadAnns(ann_ids) 74 | 75 | path = coco.loadImgs(img_id)[0]['file_name'] 76 | 77 | img = self.get_image(path) 78 | if self.transforms is not None: 79 | img, target = self.transforms(img, target) 80 | 81 | return img, target 82 | 83 | def __len__(self): 84 | return len(self.ids) 85 | -------------------------------------------------------------------------------- /models/swin_transformer/build.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------ 6 | 7 | 8 | from collections import abc, OrderedDict 9 | import os 10 | import yaml 11 | 12 | from .swin_transformer import SwinTransformer 13 | from .config import Config 14 | 15 | import torch 16 | 17 | 18 | CONFIG_MAP = { 19 | "swin-t": "models/swin_transformer/configs/swin_tiny_patch4_window7_224.yaml", 20 | "swin-s": "models/swin_transformer/configs/swin_small_patch4_window7_224.yaml", 21 | "swin-b": "models/swin_transformer/configs/swin_base_patch4_window7_224.yaml", 22 | "swin-l": "models/swin_transformer/configs/swin_large_patch4_window7_224.yaml", 23 | } 24 | 25 | 26 | CHECKPOINT_MAP = { 27 | "swin-t": "/data/public/rw/team-autolearn/pretrainedmodels/swin/swin_tiny_patch4_window7_224.pth", 28 | } 29 | 30 | 31 | def build_model(name, out_indices, frozen_stages, pretrained): 32 | config_file = CONFIG_MAP[name] 33 | config = load_config_yaml(config_file) 34 | config = Config(config) 35 | config.freeze() 36 | 37 | model_type = config.MODEL.TYPE 38 | if model_type == 'swin': 39 | model = SwinTransformer(pretrain_img_size=config.DATA.IMG_SIZE, 40 | patch_size=config.MODEL.SWIN.PATCH_SIZE, 41 | in_chans=config.MODEL.SWIN.IN_CHANS, 42 | embed_dim=config.MODEL.SWIN.EMBED_DIM, 43 | depths=config.MODEL.SWIN.DEPTHS, 44 | num_heads=config.MODEL.SWIN.NUM_HEADS, 45 | window_size=config.MODEL.SWIN.WINDOW_SIZE, 46 | mlp_ratio=config.MODEL.SWIN.MLP_RATIO, 47 | qkv_bias=config.MODEL.SWIN.QKV_BIAS, 48 | qk_scale=config.MODEL.SWIN.QK_SCALE, 49 | drop_rate=config.MODEL.DROP_RATE, 50 | drop_path_rate=config.MODEL.DROP_PATH_RATE, 51 | ape=config.MODEL.SWIN.APE, 52 | patch_norm=config.MODEL.SWIN.PATCH_NORM, 53 | use_checkpoint=config.TRAIN.USE_CHECKPOINT, 54 | out_indices=out_indices, 55 | frozen_stages=frozen_stages) 56 | else: 57 | raise NotImplementedError(f"Unkown model: {model_type}") 58 | 59 | if pretrained: 60 | ckpt_path = CHECKPOINT_MAP[name] 61 | state_dict = torch.load(ckpt_path) 62 | model.load_state_dict(state_dict['model'], strict=False) 63 | 64 | return model 65 | 66 | 67 | def _update_dict(tar, src): 68 | """recursive dict update.""" 69 | for k, v in src.items(): 70 | if isinstance(v, abc.Mapping): 71 | tar[k] = _update_dict(tar.get(k, {}), v) 72 | else: 73 | tar[k] = v 74 | return tar 75 | 76 | 77 | def load_config_yaml(cfg_file, config=None): 78 | if config is None: 79 | config = OrderedDict() 80 | 81 | with open(cfg_file, 'r') as f: 82 | config_src = yaml.load(f, Loader=yaml.FullLoader) 83 | 84 | for cfg in config_src.setdefault('BASE', ['']): 85 | if cfg: 86 | load_config_yaml( 87 | os.path.join(os.path.dirname(cfg_file), cfg), config 88 | ) 89 | print('=> merge config 
from {}'.format(cfg_file)) 90 | _update_dict(config, config_src) 91 | return config 92 | -------------------------------------------------------------------------------- /util/dam.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | 7 | 8 | from pathlib import Path 9 | 10 | import numpy as np 11 | import torch 12 | import torch.nn.functional as F 13 | 14 | import matplotlib.pyplot as plt 15 | import matplotlib.patches as patches 16 | 17 | from util.box_ops import box_cxcywh_to_xyxy 18 | from util.misc import unwrap 19 | 20 | 21 | def idx_to_flat_grid(spatial_shapes, idx): 22 | flat_grid_shape = (idx.shape[0], int(torch.sum(spatial_shapes[..., 0] * spatial_shapes[..., 1]))) 23 | flat_grid = torch.zeros(flat_grid_shape, device=idx.device, dtype=torch.float32) 24 | flat_grid.scatter_(1, idx.to(torch.int64), 1) 25 | 26 | return flat_grid 27 | 28 | 29 | def attn_map_to_flat_grid(spatial_shapes, level_start_index, sampling_locations, attention_weights): 30 | # sampling_locations: [N, n_layers, Len_q, n_heads, n_levels, n_points, 2] 31 | # attention_weights: [N, n_layers, Len_q, n_heads, n_levels, n_points] 32 | N, n_layers, _, n_heads, *_ = sampling_locations.shape 33 | sampling_locations = sampling_locations.permute(0, 1, 3, 2, 5, 4, 6).flatten(0, 2).flatten(1, 2) 34 | # [N * n_layers * n_heads, Len_q * n_points, n_levels, 2] 35 | attention_weights = attention_weights.permute(0, 1, 3, 2, 5, 4).flatten(0, 2).flatten(1, 2) 36 | # [N * n_layers * n_heads, Len_q * n_points, n_levels] 37 | 38 | rev_spatial_shapes = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], dim=-1) # hw -> wh (xy) 39 | col_row_float = sampling_locations * rev_spatial_shapes 40 | 41 | col_row_ll = col_row_float.floor().to(torch.int64) 42 | zero = torch.zeros(*col_row_ll.shape[:-1], dtype=torch.int64, device=col_row_ll.device) 43 | one = torch.ones(*col_row_ll.shape[:-1], dtype=torch.int64, device=col_row_ll.device) 44 | col_row_lh = col_row_ll + torch.stack([zero, one], dim=-1) 45 | col_row_hl = col_row_ll + torch.stack([one, zero], dim=-1) 46 | col_row_hh = col_row_ll + 1 47 | 48 | margin_ll = (col_row_float - col_row_ll).prod(dim=-1) 49 | margin_lh = -(col_row_float - col_row_lh).prod(dim=-1) 50 | margin_hl = -(col_row_float - col_row_hl).prod(dim=-1) 51 | margin_hh = (col_row_float - col_row_hh).prod(dim=-1) 52 | 53 | flat_grid_shape = (attention_weights.shape[0], int(torch.sum(spatial_shapes[..., 0] * spatial_shapes[..., 1]))) 54 | flat_grid = torch.zeros(flat_grid_shape, dtype=torch.float32, device=attention_weights.device) 55 | 56 | zipped = [(col_row_ll, margin_hh), (col_row_lh, margin_hl), (col_row_hl, margin_lh), (col_row_hh, margin_ll)] 57 | for col_row, margin in zipped: 58 | valid_mask = torch.logical_and( 59 | torch.logical_and(col_row[..., 0] >= 0, col_row[..., 0] < rev_spatial_shapes[..., 0]), 60 | torch.logical_and(col_row[..., 1] >= 0, col_row[..., 1] < rev_spatial_shapes[..., 1]), 61 | ) 62 | idx = col_row[..., 1] * spatial_shapes[..., 1] + col_row[..., 0] + level_start_index 63 | idx = (idx * valid_mask).flatten(1, 2) 64 | weights = (attention_weights * valid_mask * margin).flatten(1) 65 | flat_grid.scatter_add_(1, idx, weights) 
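# At this point each sampling location's attention weight has been accumulated bilinearly
# into its four neighbouring cells of the flattened multi-scale grid; the margin_* terms
# computed above act as the bilinear interpolation coefficients. The totals are reshaped
# per layer and head below.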
66 | 67 | return flat_grid.reshape(N, n_layers, n_heads, -1) 68 | 69 | 70 | def compute_corr(flat_grid_topk, flat_grid_attn_map, spatial_shapes): 71 | if len(flat_grid_topk.shape) == 1: 72 | flat_grid_topk = flat_grid_topk.unsqueeze(0) 73 | flat_grid_attn_map = flat_grid_attn_map.unsqueeze(0) 74 | 75 | tot = flat_grid_attn_map.sum(-1) 76 | hit = (flat_grid_topk * flat_grid_attn_map).sum(-1) 77 | 78 | corr = [hit / tot] 79 | flat_grid_idx = 0 80 | 81 | for shape in spatial_shapes: 82 | level_range = np.arange(int(flat_grid_idx), int(flat_grid_idx + shape[0] * shape[1])) 83 | tot = (flat_grid_attn_map[:, level_range]).sum(-1) 84 | hit = (flat_grid_topk[:, level_range] * flat_grid_attn_map[:, level_range]).sum(-1) 85 | flat_grid_idx += shape[0] * shape[1] 86 | corr.append(hit / tot) 87 | return corr 88 | 89 | -------------------------------------------------------------------------------- /models/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 52 | attention_weights /= 
attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------ 9 | # Modified from DETR (https://github.com/facebookresearch/detr) 10 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 11 | # ------------------------------------------------------------------------------------ 12 | 13 | 14 | """ 15 | Various positional encodings for the transformer. 16 | """ 17 | import math 18 | import torch 19 | from torch import nn 20 | 21 | from util.misc import NestedTensor 22 | 23 | 24 | class PositionEmbeddingSine(nn.Module): 25 | """ 26 | This is a more standard version of the position embedding, very similar to the one 27 | used by the Attention is all you need paper, generalized to work on images. 
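Each (optionally normalized) y/x coordinate is expanded into num_pos_feats channels of interleaved sine and cosine terms with geometrically increasing wavelengths, i.e. pos[2i] = sin(coord / temperature**(2i / num_pos_feats)) and pos[2i+1] = cos(coord / temperature**(2i / num_pos_feats)); the y and x encodings are then concatenated along the channel dimension.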
28 | """ 29 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 30 | super().__init__() 31 | self.num_pos_feats = num_pos_feats 32 | self.temperature = temperature 33 | self.normalize = normalize 34 | if scale is not None and normalize is False: 35 | raise ValueError("normalize should be True if scale is passed") 36 | if scale is None: 37 | scale = 2 * math.pi 38 | self.scale = scale 39 | 40 | def forward(self, tensor_list: NestedTensor): 41 | x = tensor_list.tensors 42 | mask = tensor_list.mask 43 | assert mask is not None 44 | not_mask = ~mask 45 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 46 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 47 | if self.normalize: 48 | eps = 1e-6 49 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 50 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 51 | 52 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 53 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 54 | 55 | pos_x = x_embed[:, :, :, None] / dim_t 56 | pos_y = y_embed[:, :, :, None] / dim_t 57 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 58 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 59 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 60 | return pos 61 | 62 | 63 | class PositionEmbeddingLearned(nn.Module): 64 | """ 65 | Absolute pos embedding, learned. 66 | """ 67 | def __init__(self, num_pos_feats=256): 68 | super().__init__() 69 | self.row_embed = nn.Embedding(50, num_pos_feats) 70 | self.col_embed = nn.Embedding(50, num_pos_feats) 71 | self.reset_parameters() 72 | 73 | def reset_parameters(self): 74 | nn.init.uniform_(self.row_embed.weight) 75 | nn.init.uniform_(self.col_embed.weight) 76 | 77 | def forward(self, tensor_list: NestedTensor): 78 | x = tensor_list.tensors 79 | h, w = x.shape[-2:] 80 | i = torch.arange(w, device=x.device) 81 | j = torch.arange(h, device=x.device) 82 | x_emb = self.col_embed(i) 83 | y_emb = self.row_embed(j) 84 | pos = torch.cat([ 85 | x_emb.unsqueeze(0).repeat(h, 1, 1), 86 | y_emb.unsqueeze(1).repeat(1, w, 1), 87 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 88 | return pos 89 | 90 | 91 | def build_position_encoding(args): 92 | N_steps = args.hidden_dim // 2 93 | if args.position_embedding in ('v2', 'sine'): 94 | # TODO find a better way of exposing other arguments 95 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 96 | elif args.position_embedding in ('v3', 'learned'): 97 | position_embedding = PositionEmbeddingLearned(N_steps) 98 | else: 99 | raise ValueError(f"not supported {args.position_embedding}") 100 | 101 | return position_embedding 102 | -------------------------------------------------------------------------------- /datasets/coco_panoptic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import json 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import torch 15 | from PIL import Image 16 | 17 | from panopticapi.utils import rgb2id 18 | from util.box_ops import masks_to_boxes 19 | 20 | from .coco import make_coco_transforms 21 | 22 | 23 | class CocoPanoptic: 24 | def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True): 25 | with open(ann_file, 'r') as f: 26 | self.coco = json.load(f) 27 | 28 | # sort 'images' field so that they are aligned with 'annotations' 29 | # i.e., in alphabetical order 30 | self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id']) 31 | # sanity check 32 | if "annotations" in self.coco: 33 | for img, ann in zip(self.coco['images'], self.coco['annotations']): 34 | assert img['file_name'][:-4] == ann['file_name'][:-4] 35 | 36 | self.img_folder = img_folder 37 | self.ann_folder = ann_folder 38 | self.ann_file = ann_file 39 | self.transforms = transforms 40 | self.return_masks = return_masks 41 | 42 | def __getitem__(self, idx): 43 | ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx] 44 | img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg') 45 | ann_path = Path(self.ann_folder) / ann_info['file_name'] 46 | 47 | img = Image.open(img_path).convert('RGB') 48 | w, h = img.size 49 | if "segments_info" in ann_info: 50 | masks = np.asarray(Image.open(ann_path), dtype=np.uint32) 51 | masks = rgb2id(masks) 52 | 53 | ids = np.array([ann['id'] for ann in ann_info['segments_info']]) 54 | masks = masks == ids[:, None, None] 55 | 56 | masks = torch.as_tensor(masks, dtype=torch.uint8) 57 | labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64) 58 | 59 | target = {} 60 | target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]]) 61 | if self.return_masks: 62 | target['masks'] = masks 63 | target['labels'] = labels 64 | 65 | target["boxes"] = masks_to_boxes(masks) 66 | 67 | target['size'] = torch.as_tensor([int(h), int(w)]) 68 | target['orig_size'] = torch.as_tensor([int(h), int(w)]) 69 | if "segments_info" in ann_info: 70 | for name in ['iscrowd', 'area']: 71 | target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']]) 72 | 73 | if self.transforms is not None: 74 | img, target = self.transforms(img, target) 75 | 76 | return img, target 77 | 78 | def __len__(self): 79 | return len(self.coco['images']) 80 | 81 | def get_height_and_width(self, idx): 82 | img_info = self.coco['images'][idx] 83 | height = img_info['height'] 84 | width = img_info['width'] 85 | return height, width 86 | 87 | 88 | def build(image_set, args): 89 | img_folder_root = Path(args.coco_path) 90 | ann_folder_root = Path(args.coco_panoptic_path) 91 | assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist' 92 | assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist' 93 | mode = 'panoptic' 94 | PATHS = { 95 | "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'), 96 | "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'), 97 | } 98 | 99 | img_folder, ann_file = PATHS[image_set] 100 | img_folder_path = img_folder_root / img_folder 101 | ann_folder = ann_folder_root / f'{mode}_{img_folder}' 102 | ann_file = ann_folder_root / ann_file 103 | 104 | dataset = 
CocoPanoptic(img_folder_path, ann_folder, ann_file, 105 | transforms=make_coco_transforms(image_set), return_masks=args.masks) 106 | 107 | return dataset 108 | -------------------------------------------------------------------------------- /util/plot_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Plotting utilities to visualize training logs. 12 | """ 13 | import torch 14 | import pandas as pd 15 | import seaborn as sns 16 | import matplotlib.pyplot as plt 17 | 18 | from pathlib import Path, PurePath 19 | 20 | 21 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): 22 | ''' 23 | Function to plot specific fields from training log(s). Plots both training and test results. 24 | 25 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file 26 | - fields = which results to plot from each log file - plots both training and test for each field. 27 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots 28 | - log_name = optional, name of log file if different than default 'log.txt'. 29 | 30 | :: Outputs - matplotlib plots of results in fields, color coded for each log file. 31 | - solid lines are training results, dashed lines are test results. 
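:: Example - plot_logs([Path('exps/r50_sparse_detr_0.1'), Path('exps/r50_deformable_detr')]), assuming each of those output directories contains a 'log.txt' written during training (the directory names here are only placeholders for whatever --output_dir values were used).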
32 | 33 | ''' 34 | func_name = "plot_utils.py::plot_logs" 35 | 36 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, 37 | # convert single Path to list to avoid 'not iterable' error 38 | 39 | if not isinstance(logs, list): 40 | if isinstance(logs, PurePath): 41 | logs = [logs] 42 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") 43 | else: 44 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ 45 | Expect list[Path] or single Path obj, received {type(logs)}") 46 | 47 | # verify valid dir(s) and that every item in list is Path object 48 | for i, dir in enumerate(logs): 49 | if not isinstance(dir, PurePath): 50 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") 51 | if dir.exists(): 52 | continue 53 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") 54 | 55 | # load log file(s) and plot 56 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] 57 | 58 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) 59 | 60 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): 61 | for j, field in enumerate(fields): 62 | if field == 'mAP': 63 | coco_eval = pd.DataFrame(pd.np.stack(df.test_coco_eval.dropna().values)[:, 1]).ewm(com=ewm_col).mean() 64 | axs[j].plot(coco_eval, c=color) 65 | else: 66 | df.interpolate().ewm(com=ewm_col).mean().plot( 67 | y=[f'train_{field}', f'test_{field}'], 68 | ax=axs[j], 69 | color=[color] * 2, 70 | style=['-', '--'] 71 | ) 72 | for ax, field in zip(axs, fields): 73 | ax.legend([Path(p).name for p in logs]) 74 | ax.set_title(field) 75 | 76 | 77 | def plot_precision_recall(files, naming_scheme='iter'): 78 | if naming_scheme == 'exp_id': 79 | # name becomes exp_id 80 | names = [f.parts[-3] for f in files] 81 | elif naming_scheme == 'iter': 82 | names = [f.stem for f in files] 83 | else: 84 | raise ValueError(f'not supported {naming_scheme}') 85 | fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) 86 | for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): 87 | data = torch.load(f) 88 | # precision is n_iou, n_points, n_cat, n_area, max_det 89 | precision = data['precision'] 90 | recall = data['params'].recThrs 91 | scores = data['scores'] 92 | # take precision for all classes, all areas and 100 detections 93 | precision = precision[0, :, :, 0, -1].mean(1) 94 | scores = scores[0, :, :, 0, -1].mean(1) 95 | prec = precision.mean() 96 | rec = data['recall'][0, :, 0, -1].mean() 97 | print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + 98 | f'score={scores.mean():0.3f}, ' + 99 | f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' 100 | ) 101 | axs[0].plot(recall, precision, c=color) 102 | axs[1].plot(recall, scores, c=color) 103 | 104 | axs[0].set_title('Precision / Recall') 105 | axs[0].legend(names) 106 | axs[1].set_title('Scores / Recall') 107 | axs[1].legend(names) 108 | return fig, axs 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /util/benchmark.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | from typing import Any, Counter, DefaultDict, Tuple, Dict, Optional 4 | import warnings 5 | 6 | import numpy as np 7 | import torch 8 | from torch import nn 9 | import tqdm 10 | 11 | from util.misc import nested_tensor_from_tensor_list 12 | from fvcore.nn import FlopCountAnalysis 13 | from 
fvcore.nn.jit_handles import Handle 14 | 15 | 16 | @torch.no_grad() 17 | def measure_average_inference_time(model, inputs, num_iters=100, warm_iters=5): 18 | ts = [] 19 | # note that warm-up iters. are excluded from the total iters. 20 | for iter_ in tqdm.tqdm(range(warm_iters + num_iters)): 21 | torch.cuda.synchronize() 22 | t_ = time.perf_counter() 23 | model(inputs) 24 | torch.cuda.synchronize() 25 | t = time.perf_counter() - t_ 26 | if iter_ >= warm_iters: 27 | ts.append(t) 28 | return sum(ts) / len(ts) 29 | 30 | 31 | def python_ops_mode_for_deform_attn(model, ops_mode): 32 | def change_ops_mode(module): 33 | if hasattr(module, "python_ops_for_test"): 34 | module.python_ops_for_test = ops_mode 35 | model.apply(change_ops_mode) 36 | 37 | 38 | @torch.no_grad() 39 | def compute_fps(model, dataset, num_iters=300, warm_iters=5, batch_size=4): 40 | print(f"computing fps.. (num_iters={num_iters}, batch_size={batch_size}) " 41 | f"warm_iters={warm_iters}, batch_size={batch_size}]") 42 | assert num_iters > 0 and warm_iters >= 0 and batch_size > 0 43 | model.cuda() 44 | model.eval() 45 | inputs = nested_tensor_from_tensor_list( 46 | [dataset.__getitem__(0)[0].cuda() for _ in range(batch_size)]) 47 | t = measure_average_inference_time(model, inputs, num_iters, warm_iters) 48 | model.train() 49 | print(f"FPS: {1.0 / t * batch_size}") 50 | return 1.0 / t * batch_size 51 | 52 | 53 | @torch.no_grad() 54 | def compute_gflops(model, dataset, approximated=True): 55 | print(f"computing flops.. (approximated={approximated})") 56 | model.eval() 57 | python_ops_mode_for_deform_attn(model, True) 58 | if approximated: 59 | # use just a single image to approximate the full compuation 60 | # the size of the image was found heuristically 61 | images = [torch.randn((3, 850, 1040))] 62 | else: 63 | # full computation: get the first 100 images of COCO val2017 64 | images = [] 65 | for idx in range(100): 66 | img, _ = dataset[idx] 67 | images.append(img) 68 | 69 | gflops_list = [] 70 | imsize_list = [] 71 | 72 | for img in tqdm.tqdm(images): 73 | inputs = [img.cuda()] 74 | with warnings.catch_warnings(): 75 | warnings.filterwarnings("ignore", category=RuntimeWarning) 76 | res = flop_count_without_warnings(model, (inputs,), )[0] 77 | gflops = sum(res.values()) 78 | gflops_list.append(gflops) 79 | imsize_list.append(list(img.shape)) 80 | 81 | if approximated: 82 | print(f"The image size used for approximation: [3, 850, 1040]") 83 | else: 84 | print("Average image size of first 100 image of COCO val2017 : " 85 | f"{np.array(imsize_list).mean(0)}") 86 | 87 | print(f"GFLOPs : {np.array(gflops_list).mean()}") 88 | model.train() 89 | python_ops_mode_for_deform_attn(model, False) 90 | return gflops 91 | 92 | 93 | def flop_count_without_warnings( 94 | 95 | model: nn.Module, 96 | inputs: Tuple[Any, ...], 97 | supported_ops: Optional[Dict[str, Handle]] = None, 98 | ) -> Tuple[DefaultDict[str, float], Counter[str]]: 99 | """copied and modified from fvcore.nn.flop_count.py 100 | 101 | Given a model and an input to the model, compute the per-operator Gflops 102 | of the given model. 103 | Args: 104 | model (nn.Module): The model to compute flop counts. 105 | inputs (tuple): Inputs that are passed to `model` to count flops. 106 | Inputs need to be in a tuple. 107 | supported_ops (dict(str,Callable) or None) : provide additional 108 | handlers for extra ops, or overwrite the existing handlers for 109 | convolution and matmul and einsum. 
The key is operator name and the value 110 | is a function that takes (inputs, outputs) of the op. We count 111 | one Multiply-Add as one FLOP. 112 | Returns: 113 | tuple[defaultdict, Counter]: A dictionary that records the number of 114 | gflops for each operation and a Counter that records the number of 115 | unsupported operations. 116 | """ 117 | if supported_ops is None: 118 | supported_ops = {} 119 | flop_counter = FlopCountAnalysis(model, inputs).set_op_handle(**supported_ops) 120 | flop_counter.unsupported_ops_warnings(False) 121 | flop_counter.uncalled_modules_warnings(False) 122 | flop_counter.tracer_warnings("no_tracer_warning") 123 | giga_flops = defaultdict(float) 124 | for op, flop in flop_counter.by_operator().items(): 125 | giga_flops[op] = flop / 1e9 126 | return giga_flops, flop_counter.unsupported_ops() 127 | -------------------------------------------------------------------------------- /models/matcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------ 9 | # Modified from DETR (https://github.com/facebookresearch/detr) 10 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 11 | # ------------------------------------------------------------------------------------ 12 | 13 | 14 | """ 15 | Modules to compute the matching cost and solve the corresponding LSAP. 16 | """ 17 | import torch 18 | from scipy.optimize import linear_sum_assignment 19 | from torch import nn 20 | 21 | from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou 22 | 23 | 24 | class HungarianMatcher(nn.Module): 25 | """This class computes an assignment between the targets and the predictions of the network 26 | 27 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 28 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 29 | while the others are un-matched (and thus treated as non-objects). 
30 | """ 31 | 32 | def __init__(self, 33 | cost_class: float = 1, 34 | cost_bbox: float = 1, 35 | cost_giou: float = 1): 36 | """Creates the matcher 37 | 38 | Params: 39 | cost_class: This is the relative weight of the classification error in the matching cost 40 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 41 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 42 | """ 43 | super().__init__() 44 | self.cost_class = cost_class 45 | self.cost_bbox = cost_bbox 46 | self.cost_giou = cost_giou 47 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" 48 | 49 | def forward(self, outputs, targets): 50 | """ Performs the matching 51 | 52 | Params: 53 | outputs: This is a dict that contains at least these entries: 54 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 55 | "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates 56 | 57 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 58 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth 59 | objects in the target) containing the class labels 60 | "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates 61 | 62 | Returns: 63 | A list of size batch_size, containing tuples of (index_i, index_j) where: 64 | - index_i is the indices of the selected predictions (in order) 65 | - index_j is the indices of the corresponding selected targets (in order) 66 | For each batch element, it holds: 67 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 68 | """ 69 | with torch.no_grad(): 70 | bs, num_queries = outputs["pred_logits"].shape[:2] 71 | 72 | # We flatten to compute the cost matrices in a batch 73 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() 74 | out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] 75 | 76 | # Also concat the target labels and boxes 77 | tgt_ids = torch.cat([v["labels"] for v in targets]) 78 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 79 | 80 | # Compute the classification cost. 
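# The cost mirrors the sigmoid focal loss used at training time: with focusing parameter gamma and balancing factor alpha, matching a prediction with class probability p to a target class costs alpha * (1 - p)**gamma * (-log(p)) - (1 - alpha) * p**gamma * (-log(1 - p)), so confident correct predictions receive a strongly negative (favourable) cost.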
81 | alpha = 0.25 82 | gamma = 2.0 83 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 84 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 85 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 86 | 87 | # Compute the L1 cost between boxes 88 | cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) 89 | 90 | # Compute the giou cost betwen boxes 91 | cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), 92 | box_cxcywh_to_xyxy(tgt_bbox)) 93 | 94 | # Final cost matrix 95 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 96 | C = C.view(bs, num_queries, -1).cpu() 97 | 98 | sizes = [len(v["boxes"]) for v in targets] 99 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 100 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor([_j % size for _j in j], dtype=torch.int64)) 101 | for (i, j), size in zip(indices, sizes)] 102 | 103 | 104 | def build_matcher(args): 105 | return HungarianMatcher(cost_class=args.set_cost_class, 106 | cost_bbox=args.set_cost_bbox, 107 | cost_giou=args.set_cost_giou) 108 | -------------------------------------------------------------------------------- /datasets/samplers.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from codes in torch.utils.data.distributed 7 | # ------------------------------------------------------------------------ 8 | 9 | import os 10 | import math 11 | import torch 12 | import torch.distributed as dist 13 | from torch.utils.data.sampler import Sampler 14 | 15 | 16 | class DistributedSampler(Sampler): 17 | """Sampler that restricts data loading to a subset of the dataset. 18 | It is especially useful in conjunction with 19 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 20 | process can pass a DistributedSampler instance as a DataLoader sampler, 21 | and load a subset of the original dataset that is exclusive to it. 22 | .. note:: 23 | Dataset is assumed to be of constant size. 24 | Arguments: 25 | dataset: Dataset used for sampling. 26 | num_replicas (optional): Number of processes participating in 27 | distributed training. 28 | rank (optional): Rank of the current process within num_replicas. 
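shuffle (optional): If True (default), indices are shuffled deterministically per epoch (controlled via set_epoch). After padding the index list to a multiple of num_replicas, each rank takes a contiguous block of num_samples indices starting at num_samples * rank.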
29 | """ 30 | 31 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 32 | if num_replicas is None: 33 | if not dist.is_available(): 34 | raise RuntimeError("Requires distributed package to be available") 35 | num_replicas = dist.get_world_size() 36 | if rank is None: 37 | if not dist.is_available(): 38 | raise RuntimeError("Requires distributed package to be available") 39 | rank = dist.get_rank() 40 | self.dataset = dataset 41 | self.num_replicas = num_replicas 42 | self.rank = rank 43 | self.epoch = 0 44 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 45 | self.total_size = self.num_samples * self.num_replicas 46 | self.shuffle = shuffle 47 | 48 | def __iter__(self): 49 | if self.shuffle: 50 | # deterministically shuffle based on epoch 51 | g = torch.Generator() 52 | g.manual_seed(self.epoch) 53 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 54 | else: 55 | indices = torch.arange(len(self.dataset)).tolist() 56 | 57 | # add extra samples to make it evenly divisible 58 | indices += indices[: (self.total_size - len(indices))] 59 | assert len(indices) == self.total_size 60 | 61 | # subsample 62 | offset = self.num_samples * self.rank 63 | indices = indices[offset : offset + self.num_samples] 64 | assert len(indices) == self.num_samples 65 | 66 | return iter(indices) 67 | 68 | def __len__(self): 69 | return self.num_samples 70 | 71 | def set_epoch(self, epoch): 72 | self.epoch = epoch 73 | 74 | 75 | class NodeDistributedSampler(Sampler): 76 | """Sampler that restricts data loading to a subset of the dataset. 77 | It is especially useful in conjunction with 78 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 79 | process can pass a DistributedSampler instance as a DataLoader sampler, 80 | and load a subset of the original dataset that is exclusive to it. 81 | .. note:: 82 | Dataset is assumed to be of constant size. 83 | Arguments: 84 | dataset: Dataset used for sampling. 85 | num_replicas (optional): Number of processes participating in 86 | distributed training. 87 | rank (optional): Rank of the current process within num_replicas. 
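local_rank (optional): Index of the current process within its node; defaults to the LOCAL_RANK environment variable.
local_size (optional): Number of processes per node; defaults to the LOCAL_SIZE environment variable.
shuffle (optional): If True (default), indices are shuffled deterministically per epoch (controlled via set_epoch).
Indices are first restricted to those with index % local_size == local_rank (the shard this process would hold when the dataset is cached per local rank), and that shard is then split evenly across nodes.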
88 | """ 89 | 90 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 91 | if num_replicas is None: 92 | if not dist.is_available(): 93 | raise RuntimeError("Requires distributed package to be available") 94 | num_replicas = dist.get_world_size() 95 | if rank is None: 96 | if not dist.is_available(): 97 | raise RuntimeError("Requires distributed package to be available") 98 | rank = dist.get_rank() 99 | if local_rank is None: 100 | local_rank = int(os.environ.get('LOCAL_RANK', 0)) 101 | if local_size is None: 102 | local_size = int(os.environ.get('LOCAL_SIZE', 1)) 103 | self.dataset = dataset 104 | self.shuffle = shuffle 105 | self.num_replicas = num_replicas 106 | self.num_parts = local_size 107 | self.rank = rank 108 | self.local_rank = local_rank 109 | self.epoch = 0 110 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 111 | self.total_size = self.num_samples * self.num_replicas 112 | 113 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts 114 | 115 | def __iter__(self): 116 | if self.shuffle: 117 | # deterministically shuffle based on epoch 118 | g = torch.Generator() 119 | g.manual_seed(self.epoch) 120 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 121 | else: 122 | indices = torch.arange(len(self.dataset)).tolist() 123 | indices = [i for i in indices if i % self.num_parts == self.local_rank] 124 | 125 | # add extra samples to make it evenly divisible 126 | indices += indices[:(self.total_size_parts - len(indices))] 127 | assert len(indices) == self.total_size_parts 128 | 129 | # subsample 130 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts] 131 | assert len(indices) == self.num_samples 132 | 133 | return iter(indices) 134 | 135 | def __len__(self): 136 | return self.num_samples 137 | 138 | def set_epoch(self, epoch): 139 | self.epoch = epoch 140 | -------------------------------------------------------------------------------- /models/swin_transformer/config.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------ 6 | 7 | 8 | import collections 9 | from collections import OrderedDict 10 | from copy import deepcopy 11 | import logging 12 | from os.path import basename, splitext 13 | from pprint import pformat 14 | from types import SimpleNamespace 15 | import yaml 16 | 17 | 18 | class Config(SimpleNamespace): 19 | """Dictionary-based but also dot-accessible configuration object, which will 20 | rescue you from the messy brackets and quotation marks while accessing 21 | nested dictionaries. 22 | 23 | As the usage example below, a value can be easily assigned to a new field 24 | with hierarchies by using Python's usual assignment syntax. Due to the side 25 | effects of this feature, it is safe that the user call '.freeze()' before 26 | using the Config instance as a fixed configuration. Otherwise, even when 27 | a wanted attribute is called with an incorrect name, AttributeError will be 28 | silently ignored and returns an empty config, which could be resulting in 29 | unwanted consequences. 
30 | 31 | Usage: 32 | >>> cfg = Config() 33 | >>> cfg.foo = 1 34 | >>> cfg.bar.baz = 2 35 | >>> cfg['bar']['baz'] == cfg.bar.baz 36 | True 37 | >>> cfg.pprint() 38 | --- 39 | foo: 1 40 | bar: 41 | baz: 2 42 | ... 43 | >>> cfg.freeze() 44 | >>> cfg.new = 3 45 | RuntimeError: Can't set new attribute after being freezed! 46 | 47 | """ 48 | def __init__(self, _dict=None, **kwargs): 49 | super().__init__(**kwargs) 50 | self._freezed = False 51 | self._order = list() 52 | if _dict is not None: 53 | self._set_with_nested_dict(_dict) 54 | 55 | def _set_with_nested_dict(self, _dict): 56 | for key, value in _dict.items(): 57 | if isinstance(value, dict): 58 | self.__setattr__(key, Config(value)) 59 | else: 60 | self.__setattr__(key, value) 61 | self._order.append(key) 62 | 63 | @property 64 | def freezed(self): 65 | return self._freezed 66 | 67 | @classmethod 68 | def from_yaml(cls, yaml_file): 69 | """Initialize configuration with a YAML file.""" 70 | return cls(OrderedDict(yaml.load(open(yaml_file, "r"), 71 | Loader=yaml.FullLoader))) 72 | 73 | def __repr__(self): 74 | return 'Config' + self.to_dict().__repr__() 75 | 76 | def __getitem__(self, item): 77 | return self.__getattr__(item) 78 | 79 | def __getattr__(self, item): 80 | try: 81 | return self.__getattribute__(item) 82 | except AttributeError as e: 83 | if self._freezed: 84 | raise AttributeError(f"Can't find the field: {item}") from e 85 | else: 86 | # if there's no attribute with the given name, 87 | # make new one and assign an empty config. 88 | self.__setattr__(item, Config()) 89 | return self.__getattribute__(item) 90 | 91 | def __setattr__(self, item, value): 92 | if item != '_freezed' and self.__dict__['_freezed']: 93 | raise RuntimeError("Can't set new attribute after being freezed!") 94 | super().__setattr__(item, value) 95 | 96 | def __bool__(self): 97 | return len([k for k in self.to_dict().keys() 98 | if not k.startswith('_')]) > 0 99 | 100 | def __len__(self): 101 | return len(self.to_dict()) 102 | 103 | def __getstate__(self): 104 | return self.to_dict() 105 | 106 | def __setstate__(self, state): 107 | self._set_with_nested_dict(state) 108 | 109 | def __contains__(self, item): 110 | return self.to_dict().__contains__(item) 111 | 112 | def __deepcopy__(self, memodict={}): 113 | return Config(_dict=deepcopy(self.to_dict())) 114 | 115 | def __iter__(self): 116 | # for iterable unpacking 117 | return self.to_dict().__iter__() 118 | 119 | def pformat(self): 120 | return yaml.dump(self.to_dict(), indent=4, sort_keys=False, 121 | explicit_start=True, explicit_end=True) 122 | 123 | def pprint(self): 124 | return print(self.pformat()) 125 | 126 | def freeze(self): 127 | self._freezed = True 128 | for value in self.__dict__.values(): 129 | if isinstance(value, Config): 130 | value.freeze() 131 | 132 | return self 133 | 134 | def defrost(self): 135 | self._freezed = False 136 | for value in self.__dict__.values(): 137 | if isinstance(value, Config): 138 | value.defrost() 139 | return self 140 | 141 | def get(self, *args, **kwargs): 142 | return self.to_dict().get(*args, **kwargs) 143 | 144 | def keys(self): 145 | return self.to_dict().keys() 146 | 147 | def values(self): 148 | return self.to_dict().values() 149 | 150 | def items(self): 151 | return self.to_dict().items() 152 | 153 | def clone(self): 154 | return self.__deepcopy__() 155 | 156 | def update(self, dict_, delimiter='/'): 157 | for k, v in dict_.items(): 158 | self._update(k, v, delimiter) 159 | 160 | def _update(self, key, value, delimiter='/'): 161 | obj = self 162 | keys = 
key.split(delimiter) 163 | for k in keys[:-1]: 164 | obj = obj.__getattr__(k) 165 | obj.__setattr__(keys[-1], value) 166 | 167 | def to_dict(self): 168 | out_dict = OrderedDict() 169 | for key, value in self.__dict__.items(): 170 | if isinstance(value, Config): 171 | out_dict[key] = value.to_dict() 172 | else: 173 | if not key.startswith('_'): 174 | out_dict[key] = value 175 | return dict(out_dict) 176 | -------------------------------------------------------------------------------- /datasets/coco.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | COCO dataset which returns image_id for evaluation. 12 | 13 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 14 | """ 15 | from pathlib import Path 16 | 17 | import torch 18 | import torch.utils.data 19 | from pycocotools import mask as coco_mask 20 | 21 | from .torchvision_datasets import CocoDetection as TvCocoDetection 22 | from util.misc import get_local_rank, get_local_size 23 | import datasets.transforms as T 24 | 25 | 26 | class CocoDetection(TvCocoDetection): 27 | def __init__(self, img_folder, ann_file, transforms, return_masks, cache_mode=False, local_rank=0, local_size=1): 28 | super(CocoDetection, self).__init__(img_folder, ann_file, 29 | cache_mode=cache_mode, local_rank=local_rank, local_size=local_size) 30 | self._transforms = transforms 31 | self.prepare = ConvertCocoPolysToMask(return_masks) 32 | 33 | def __getitem__(self, idx): 34 | img, target = super(CocoDetection, self).__getitem__(idx) 35 | image_id = self.ids[idx] 36 | target = {'image_id': image_id, 'annotations': target} 37 | img, target = self.prepare(img, target) 38 | if self._transforms is not None: 39 | img, target = self._transforms(img, target) 40 | return img, target 41 | 42 | 43 | def convert_coco_poly_to_mask(segmentations, height, width): 44 | masks = [] 45 | for polygons in segmentations: 46 | rles = coco_mask.frPyObjects(polygons, height, width) 47 | mask = coco_mask.decode(rles) 48 | if len(mask.shape) < 3: 49 | mask = mask[..., None] 50 | mask = torch.as_tensor(mask, dtype=torch.uint8) 51 | mask = mask.any(dim=2) 52 | masks.append(mask) 53 | if masks: 54 | masks = torch.stack(masks, dim=0) 55 | else: 56 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 57 | return masks 58 | 59 | 60 | class ConvertCocoPolysToMask(object): 61 | def __init__(self, return_masks=False): 62 | self.return_masks = return_masks 63 | 64 | def __call__(self, image, target): 65 | w, h = image.size 66 | 67 | image_id = target["image_id"] 68 | image_id = torch.tensor([image_id]) 69 | 70 | anno = target["annotations"] 71 | 72 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] 73 | 74 | boxes = [obj["bbox"] for obj in anno] 75 | # guard against no boxes via resizing 76 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 77 | boxes[:, 2:] += boxes[:, :2] 78 | boxes[:, 0::2].clamp_(min=0, max=w) 79 | 
boxes[:, 1::2].clamp_(min=0, max=h) 80 | 81 | classes = [obj["category_id"] for obj in anno] 82 | classes = torch.tensor(classes, dtype=torch.int64) 83 | 84 | if self.return_masks: 85 | segmentations = [obj["segmentation"] for obj in anno] 86 | masks = convert_coco_poly_to_mask(segmentations, h, w) 87 | 88 | keypoints = None 89 | if anno and "keypoints" in anno[0]: 90 | keypoints = [obj["keypoints"] for obj in anno] 91 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32) 92 | num_keypoints = keypoints.shape[0] 93 | if num_keypoints: 94 | keypoints = keypoints.view(num_keypoints, -1, 3) 95 | 96 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 97 | boxes = boxes[keep] 98 | classes = classes[keep] 99 | if self.return_masks: 100 | masks = masks[keep] 101 | if keypoints is not None: 102 | keypoints = keypoints[keep] 103 | 104 | target = {} 105 | target["boxes"] = boxes 106 | target["labels"] = classes 107 | if self.return_masks: 108 | target["masks"] = masks 109 | target["image_id"] = image_id 110 | if keypoints is not None: 111 | target["keypoints"] = keypoints 112 | 113 | # for conversion to coco api 114 | area = torch.tensor([obj["area"] for obj in anno]) 115 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 116 | target["area"] = area[keep] 117 | target["iscrowd"] = iscrowd[keep] 118 | 119 | target["orig_size"] = torch.as_tensor([int(h), int(w)]) 120 | target["size"] = torch.as_tensor([int(h), int(w)]) 121 | 122 | return image, target 123 | 124 | 125 | def make_coco_transforms(image_set): 126 | 127 | normalize = T.Compose([ 128 | T.ToTensor(), 129 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 130 | ]) 131 | 132 | scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] 133 | 134 | if image_set == 'train': 135 | return T.Compose([ 136 | T.RandomHorizontalFlip(), 137 | T.RandomSelect( 138 | T.RandomResize(scales, max_size=1333), 139 | T.Compose([ 140 | T.RandomResize([400, 500, 600]), 141 | T.RandomSizeCrop(384, 600), 142 | T.RandomResize(scales, max_size=1333), 143 | ]) 144 | ), 145 | normalize, 146 | ]) 147 | 148 | if image_set == 'val': 149 | return T.Compose([ 150 | T.RandomResize([800], max_size=1333), 151 | normalize, 152 | ]) 153 | 154 | raise ValueError(f'unknown {image_set}') 155 | 156 | 157 | def build(image_set, args): 158 | root = Path(args.coco_path) 159 | assert root.exists(), f'provided COCO path {root} does not exist' 160 | mode = 'instances' 161 | PATHS = { 162 | "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'), 163 | "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'), 164 | } 165 | 166 | img_folder, ann_file = PATHS[image_set] 167 | dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, 168 | cache_mode=args.cache_mode, local_rank=get_local_rank(), local_size=get_local_size()) 169 | return dataset 170 | -------------------------------------------------------------------------------- /models/ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include <vector> 12 | #include "cuda/ms_deform_im2col_cuda.cuh" 13 | 14 | #include <ATen/ATen.h> 15 | #include <ATen/cuda/CUDAContext.h> 16 | #include <cuda.h> 17 | #include <cuda_runtime.h> 18 | 19 | 20 | at::Tensor ms_deform_attn_cuda_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step) 27 | { 28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 33 | 34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 39 | 40 | const int batch = value.size(0); 41 | const int spatial_size = value.size(1); 42 | const int num_heads = value.size(2); 43 | const int channels = value.size(3); 44 | 45 | const int num_levels = spatial_shapes.size(0); 46 | 47 | const int num_query = sampling_loc.size(1); 48 | const int num_point = sampling_loc.size(4); 49 | 50 | const int im2col_step_ = std::min(batch, im2col_step); 51 | 52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 53 | 54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 55 | 56 | const int batch_n = im2col_step_; 57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 58 | auto per_value_size = spatial_size * num_heads * channels; 59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 60 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 61 | for (int n = 0; n < batch/im2col_step_; ++n) 62 | { 63 | auto columns = output_n.select(0, n); 64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 66 | value.data<scalar_t>() + n * im2col_step_ * per_value_size, 67 | spatial_shapes.data<int64_t>(), 68 | level_start_index.data<int64_t>(), 69 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 70 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size, 71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 72 | columns.data<scalar_t>()); 73 | 74 | })); 75 | } 76 | 77 | output = output.view({batch, num_query, num_heads*channels}); 78 | 79 | return output; 80 | } 81 | 82 | 83 | std::vector<at::Tensor> ms_deform_attn_cuda_backward( 84 | const at::Tensor &value, 85 | const at::Tensor &spatial_shapes, 86 | const at::Tensor &level_start_index, 87 |
const at::Tensor &sampling_loc, 88 | const at::Tensor &attn_weight, 89 | const at::Tensor &grad_output, 90 | const int im2col_step) 91 | { 92 | 93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 99 | 100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 106 | 107 | const int batch = value.size(0); 108 | const int spatial_size = value.size(1); 109 | const int num_heads = value.size(2); 110 | const int channels = value.size(3); 111 | 112 | const int num_levels = spatial_shapes.size(0); 113 | 114 | const int num_query = sampling_loc.size(1); 115 | const int num_point = sampling_loc.size(4); 116 | 117 | const int im2col_step_ = std::min(batch, im2col_step); 118 | 119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 120 | 121 | auto grad_value = at::zeros_like(value); 122 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 123 | auto grad_attn_weight = at::zeros_like(attn_weight); 124 | 125 | const int batch_n = im2col_step_; 126 | auto per_value_size = spatial_size * num_heads * channels; 127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 130 | 131 | for (int n = 0; n < batch/im2col_step_; ++n) 132 | { 133 | auto grad_output_g = grad_output_n.select(0, n); 134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 136 | grad_output_g.data<scalar_t>(), 137 | value.data<scalar_t>() + n * im2col_step_ * per_value_size, 138 | spatial_shapes.data<int64_t>(), 139 | level_start_index.data<int64_t>(), 140 | sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 141 | attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size, 142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 143 | grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size, 144 | grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size, 145 | grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size); 146 | 147 | })); 148 | } 149 | 150 | return { 151 | grad_value, grad_sampling_loc, grad_attn_weight 152 | }; 153 | } -------------------------------------------------------------------------------- /models/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | #
Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Deformable DETR 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------------------ 9 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 10 | # ------------------------------------------------------------------------------------------------ 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction 25 | 26 | 27 | def _is_power_of_2(n): 28 | if (not isinstance(n, int)) or (n < 0): 29 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 30 | return (n & (n-1) == 0) and n != 0 31 | 32 | 33 | class MSDeformAttn(nn.Module): 34 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 35 | """ 36 | Multi-Scale Deformable Attention Module 37 | :param d_model hidden dimension 38 | :param n_levels number of feature levels 39 | :param n_heads number of attention heads 40 | :param n_points number of sampling points per attention head per feature level 41 | """ 42 | super().__init__() 43 | if d_model % n_heads != 0: 44 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 45 | _d_per_head = d_model // n_heads 46 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 47 | if not _is_power_of_2(_d_per_head): 48 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 49 | "which is more efficient in our CUDA implementation.") 50 | 51 | self.im2col_step = 64 52 | 53 | self.d_model = d_model 54 | self.n_levels = n_levels 55 | self.n_heads = n_heads 56 | self.n_points = n_points 57 | 58 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 59 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 60 | self.value_proj = nn.Linear(d_model, d_model) 61 | self.output_proj = nn.Linear(d_model, d_model) 62 | self.python_ops_for_test = False 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
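# Usage sketch (illustrative only; assumes the custom CUDA extension has been built via models/ops/make.sh, and uses toy shapes that satisfy the checks in forward() below):
#
#     attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4).cuda()
#     spatial_shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long).cuda()
#     level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
#     src = torch.rand(2, int(spatial_shapes.prod(1).sum()), 256).cuda()      # flattened multi-scale features
#     query = torch.rand(2, 100, 256).cuda()
#     reference_points = torch.rand(2, 100, 1, 2).repeat(1, 1, 2, 1).cuda()   # normalized (x, y) per level
#     output, sampling_locations, attention_weights = attn(
#         query, reference_points, src, spatial_shapes, level_start_index)
#     # output: (2, 100, 256), sampling_locations: (2, 100, 8, 2, 4, 2), attention_weights: (2, 100, 8, 2, 4)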
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | # N, Len_q, n_heads, n_levels, n_points, 2 106 | if reference_points.shape[-1] == 2: 107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 108 | sampling_locations = reference_points[:, :, None, :, None, :] \ 109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 110 | elif reference_points.shape[-1] == 4: 111 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 113 | else: 114 | raise ValueError( 115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 116 | if not self.python_ops_for_test: 117 | output = MSDeformAttnFunction.apply( 118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 119 | else: 120 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 121 | output = self.output_proj(output) 122 | return output, sampling_locations, attention_weights 123 | 124 | 125 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 126 | # for debug and test only, 127 | # need to use cuda version instead 128 | N_, S_, M_, D_ = value.shape 129 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 130 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 131 | sampling_grids = 2 * sampling_locations - 1 132 | sampling_value_list = [] 133 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 134 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 135 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 136 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 137 | sampling_grid_l_ = 
sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 138 | # N_*M_, D_, Lq_, P_ 139 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 140 | mode='bilinear', padding_mode='zeros', align_corners=False) 141 | sampling_value_list.append(sampling_value_l_) 142 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 143 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 144 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 145 | return output.transpose(1, 2).contiguous() 146 | -------------------------------------------------------------------------------- /engine.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------ 9 | # Modified from DETR (https://github.com/facebookresearch/detr) 10 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 11 | # ------------------------------------------------------------------------------------ 12 | 13 | 14 | """ 15 | Train and eval functions used in main.py 16 | """ 17 | import math 18 | import os 19 | import sys 20 | from typing import Iterable 21 | 22 | import torch 23 | import util.misc as utils 24 | from datasets.coco_eval import CocoEvaluator 25 | from datasets.panoptic_eval import PanopticEvaluator 26 | from datasets.data_prefetcher import data_prefetcher 27 | 28 | from util.misc import check_unused_parameters 29 | 30 | 31 | def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, 32 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 33 | device: torch.device, epoch: int, max_norm: float = 0, 34 | writer=None, total_iter=0): 35 | model.train() 36 | criterion.train() 37 | metric_logger = utils.MetricLogger(delimiter=" ") 38 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 39 | metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) 40 | metric_logger.add_meter('grad_norm', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) 41 | header = 'Epoch: [{}]'.format(epoch) 42 | print_freq = 10 43 | 44 | prefetcher = data_prefetcher(data_loader, device, prefetch=True) 45 | samples, targets = prefetcher.next() 46 | 47 | for i in metric_logger.log_every(range(len(data_loader)), print_freq, header): 48 | outputs = model(samples) 49 | loss_dict = criterion(outputs, targets) 50 | weight_dict = criterion.weight_dict 51 | losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) 52 | 53 | # reduce losses over all GPUs for logging purposes 54 | loss_dict_reduced = utils.reduce_dict(loss_dict) 55 | loss_dict_reduced_unscaled = {f'{k}_unscaled': v 56 | for k, v in loss_dict_reduced.items()} 57 | loss_dict_reduced_scaled = {k: v * weight_dict[k] 58 | for k, v in loss_dict_reduced.items() if k in weight_dict} 59 | losses_reduced_scaled = sum(loss_dict_reduced_scaled.values()) 60 | 61 | 
loss_value = losses_reduced_scaled.item() 62 | 63 | if not math.isfinite(loss_value): 64 | print("Loss is {}, stopping training".format(loss_value)) 65 | print(loss_dict_reduced) 66 | sys.exit(1) 67 | 68 | optimizer.zero_grad() 69 | losses.backward() 70 | 71 | if i == 0: 72 | check_unused_parameters(model, loss_dict, weight_dict) 73 | 74 | if max_norm > 0: 75 | grad_total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) 76 | else: 77 | grad_total_norm = utils.get_total_grad_norm(model.parameters(), max_norm) 78 | 79 | metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled) 80 | metric_logger.update(class_error=loss_dict_reduced['class_error']) 81 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 82 | metric_logger.update(grad_norm=grad_total_norm) 83 | 84 | optimizer.step() 85 | 86 | if total_iter % (print_freq*10) == 0 and utils.is_main_process(): 87 | writer.add_scalar('train/loss', loss_value, total_iter) 88 | writer.add_scalar('train/class_error', loss_dict_reduced['class_error'], total_iter) 89 | writer.add_scalar('lr', optimizer.param_groups[0]["lr"], total_iter) 90 | writer.add_scalar('train/grad_norm', grad_total_norm, total_iter) 91 | for key, value in loss_dict_reduced_scaled.items(): 92 | writer.add_scalar('train/'+key, value, total_iter) 93 | for key, value in loss_dict_reduced_unscaled.items(): 94 | if "corr" in key: 95 | writer.add_scalar('train/'+key, value, total_iter) 96 | 97 | total_iter += 1 98 | samples, targets = prefetcher.next() 99 | 100 | # gather the stats from all processes 101 | metric_logger.synchronize_between_processes() 102 | print("Averaged stats:", metric_logger) 103 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()}, total_iter 104 | 105 | 106 | @torch.no_grad() 107 | def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, args): 108 | model.eval() 109 | criterion.eval() 110 | 111 | metric_logger = utils.MetricLogger(delimiter=" ") 112 | metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) 113 | header = 'Test:' 114 | 115 | iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys()) 116 | coco_evaluator = CocoEvaluator(base_ds, iou_types) 117 | 118 | panoptic_evaluator = None 119 | if 'panoptic' in postprocessors.keys(): 120 | panoptic_evaluator = PanopticEvaluator( 121 | data_loader.dataset.ann_file, 122 | data_loader.dataset.ann_folder, 123 | output_dir=os.path.join(args.output_dir, "panoptic_eval"), 124 | ) 125 | 126 | for step, (samples, targets) in enumerate(metric_logger.log_every(data_loader, 10, header)): 127 | samples = samples.to(device) 128 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 129 | 130 | outputs = model(samples) 131 | loss_dict = criterion(outputs, targets) 132 | weight_dict = criterion.weight_dict 133 | 134 | # reduce losses over all GPUs for logging purposes 135 | loss_dict_reduced = utils.reduce_dict(loss_dict) 136 | loss_dict_reduced_scaled = {k: v * weight_dict[k] 137 | for k, v in loss_dict_reduced.items() if k in weight_dict} 138 | loss_dict_reduced_unscaled = {f'{k}_unscaled': v 139 | for k, v in loss_dict_reduced.items()} 140 | metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()), 141 | **loss_dict_reduced_scaled, 142 | **loss_dict_reduced_unscaled) 143 | metric_logger.update(class_error=loss_dict_reduced['class_error']) 144 | 145 | orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) 146 | results = 
postprocessors['bbox'](outputs, orig_target_sizes) 147 | if 'segm' in postprocessors.keys(): 148 | target_sizes = torch.stack([t["size"] for t in targets], dim=0) 149 | results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes) 150 | res = {target['image_id'].item(): output for target, output in zip(targets, results)} 151 | if coco_evaluator is not None: 152 | coco_evaluator.update(res) 153 | 154 | if panoptic_evaluator is not None: 155 | res_pano = postprocessors["panoptic"](outputs, target_sizes, orig_target_sizes) 156 | for i, target in enumerate(targets): 157 | image_id = target["image_id"].item() 158 | file_name = f"{image_id:012d}.png" 159 | res_pano[i]["image_id"] = image_id 160 | res_pano[i]["file_name"] = file_name 161 | 162 | panoptic_evaluator.update(res_pano) 163 | 164 | 165 | 166 | # gather the stats from all processes 167 | metric_logger.synchronize_between_processes() 168 | print("Averaged stats:", metric_logger) 169 | if coco_evaluator is not None: 170 | coco_evaluator.synchronize_between_processes() 171 | if panoptic_evaluator is not None: 172 | panoptic_evaluator.synchronize_between_processes() 173 | 174 | # accumulate predictions from all images 175 | if coco_evaluator is not None: 176 | coco_evaluator.accumulate() 177 | coco_evaluator.summarize() 178 | panoptic_res = None 179 | if panoptic_evaluator is not None: 180 | panoptic_res = panoptic_evaluator.summarize() 181 | stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} 182 | if coco_evaluator is not None: 183 | if 'bbox' in postprocessors.keys(): 184 | stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() 185 | if 'segm' in postprocessors.keys(): 186 | stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist() 187 | if panoptic_res is not None: 188 | stats['PQ_all'] = panoptic_res["All"] 189 | stats['PQ_th'] = panoptic_res["Things"] 190 | stats['PQ_st'] = panoptic_res["Stuff"] 191 | return stats, coco_evaluator 192 | -------------------------------------------------------------------------------- /tools/launch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------------------------------------------- 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # -------------------------------------------------------------------------------------------------------------------------- 6 | # Modified from https://github.com/pytorch/pytorch/blob/173f224570017b4b1a3a1a13d0bff280a54d9cd9/torch/distributed/launch.py 7 | # -------------------------------------------------------------------------------------------------------------------------- 8 | 9 | r""" 10 | `torch.distributed.launch` is a module that spawns up multiple distributed 11 | training processes on each of the training nodes. 12 | The utility can be used for single-node distributed training, in which one or 13 | more processes per node will be spawned. The utility can be used for either 14 | CPU training or GPU training. If the utility is used for GPU training, 15 | each distributed process will be operating on a single GPU. This can achieve 16 | well-improved single-node training performance. 
It can also be used in 17 | multi-node distributed training, by spawning up multiple processes on each node 18 | for well-improved multi-node distributed training performance as well. 19 | This will especially be benefitial for systems with multiple Infiniband 20 | interfaces that have direct-GPU support, since all of them can be utilized for 21 | aggregated communication bandwidth. 22 | In both cases of single-node distributed training or multi-node distributed 23 | training, this utility will launch the given number of processes per node 24 | (``--nproc_per_node``). If used for GPU training, this number needs to be less 25 | or euqal to the number of GPUs on the current system (``nproc_per_node``), 26 | and each process will be operating on a single GPU from *GPU 0 to 27 | GPU (nproc_per_node - 1)*. 28 | **How to use this module:** 29 | 1. Single-Node multi-process distributed training 30 | :: 31 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 32 | YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other 33 | arguments of your training script) 34 | 2. Multi-Node multi-process distributed training: (e.g. two nodes) 35 | Node 1: *(IP: 192.168.1.1, and has a free port: 1234)* 36 | :: 37 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 38 | --nnodes=2 --node_rank=0 --master_addr="192.168.1.1" 39 | --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 40 | and all other arguments of your training script) 41 | Node 2: 42 | :: 43 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 44 | --nnodes=2 --node_rank=1 --master_addr="192.168.1.1" 45 | --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 46 | and all other arguments of your training script) 47 | 3. To look up what optional arguments this module offers: 48 | :: 49 | >>> python -m torch.distributed.launch --help 50 | **Important Notices:** 51 | 1. This utilty and multi-process distributed (single-node or 52 | multi-node) GPU training currently only achieves the best performance using 53 | the NCCL distributed backend. Thus NCCL backend is the recommended backend to 54 | use for GPU training. 55 | 2. In your training program, you must parse the command-line argument: 56 | ``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by this module. 57 | If your training program uses GPUs, you should ensure that your code only 58 | runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by: 59 | Parsing the local_rank argument 60 | :: 61 | >>> import argparse 62 | >>> parser = argparse.ArgumentParser() 63 | >>> parser.add_argument("--local_rank", type=int) 64 | >>> args = parser.parse_args() 65 | Set your device to local rank using either 66 | :: 67 | >>> torch.cuda.set_device(arg.local_rank) # before your code runs 68 | or 69 | :: 70 | >>> with torch.cuda.device(arg.local_rank): 71 | >>> # your code to run 72 | 3. In your training program, you are supposed to call the following function 73 | at the beginning to start the distributed backend. You need to make sure that 74 | the init_method uses ``env://``, which is the only supported ``init_method`` 75 | by this module. 76 | :: 77 | torch.distributed.init_process_group(backend='YOUR BACKEND', 78 | init_method='env://') 79 | 4. In your training program, you can either use regular distributed functions 80 | or use :func:`torch.nn.parallel.DistributedDataParallel` module. 
If your 81 | training program uses GPUs for training and you would like to use 82 | :func:`torch.nn.parallel.DistributedDataParallel` module, 83 | here is how to configure it. 84 | :: 85 | model = torch.nn.parallel.DistributedDataParallel(model, 86 | device_ids=[arg.local_rank], 87 | output_device=arg.local_rank) 88 | Please ensure that ``device_ids`` argument is set to be the only GPU device id 89 | that your code will be operating on. This is generally the local rank of the 90 | process. In other words, the ``device_ids`` needs to be ``[args.local_rank]``, 91 | and ``output_device`` needs to be ``args.local_rank`` in order to use this 92 | utility 93 | 5. Another way to pass ``local_rank`` to the subprocesses via environment variable 94 | ``LOCAL_RANK``. This behavior is enabled when you launch the script with 95 | ``--use_env=True``. You must adjust the subprocess example above to replace 96 | ``args.local_rank`` with ``os.environ['LOCAL_RANK']``; the launcher 97 | will not pass ``--local_rank`` when you specify this flag. 98 | .. warning:: 99 | ``local_rank`` is NOT globally unique: it is only unique per process 100 | on a machine. Thus, don't use it to decide if you should, e.g., 101 | write to a networked filesystem. See 102 | https://github.com/pytorch/pytorch/issues/12042 for an example of 103 | how things can go wrong if you don't do this correctly. 104 | """ 105 | 106 | 107 | import sys 108 | import subprocess 109 | import os 110 | import socket 111 | from argparse import ArgumentParser, REMAINDER 112 | 113 | import torch 114 | 115 | 116 | def parse_args(): 117 | """ 118 | Helper function parsing the command line options 119 | @retval ArgumentParser 120 | """ 121 | parser = ArgumentParser(description="PyTorch distributed training launch " 122 | "helper utilty that will spawn up " 123 | "multiple distributed processes") 124 | 125 | # Optional arguments for the launch helper 126 | parser.add_argument("--nnodes", type=int, default=1, 127 | help="The number of nodes to use for distributed " 128 | "training") 129 | parser.add_argument("--node_rank", type=int, default=0, 130 | help="The rank of the node for multi-node distributed " 131 | "training") 132 | parser.add_argument("--nproc_per_node", type=int, default=1, 133 | help="The number of processes to launch on each node, " 134 | "for GPU training, this is recommended to be set " 135 | "to the number of GPUs in your system so that " 136 | "each process can be bound to a single GPU.") 137 | parser.add_argument("--master_addr", default="127.0.0.1", type=str, 138 | help="Master node (rank 0)'s address, should be either " 139 | "the IP address or the hostname of node 0, for " 140 | "single node multi-proc training, the " 141 | "--master_addr can simply be 127.0.0.1") 142 | parser.add_argument("--master_port", default=29500, type=int, 143 | help="Master node (rank 0)'s free port that needs to " 144 | "be used for communciation during distributed " 145 | "training") 146 | 147 | # positional 148 | parser.add_argument("training_script", type=str, 149 | help="The full path to the single GPU training " 150 | "program/script to be launched in parallel, " 151 | "followed by all the arguments for the " 152 | "training script") 153 | 154 | # rest from the training program 155 | parser.add_argument('training_script_args', nargs=REMAINDER) 156 | return parser.parse_args() 157 | 158 | 159 | def main(): 160 | args = parse_args() 161 | 162 | # world size in terms of number of processes 163 | dist_world_size = args.nproc_per_node * args.nnodes 164 | 165 
| # set PyTorch distributed related environmental variables 166 | current_env = os.environ.copy() 167 | current_env["MASTER_ADDR"] = args.master_addr 168 | current_env["MASTER_PORT"] = str(args.master_port) 169 | current_env["WORLD_SIZE"] = str(dist_world_size) 170 | 171 | processes = [] 172 | 173 | for local_rank in range(0, args.nproc_per_node): 174 | # each process's rank 175 | dist_rank = args.nproc_per_node * args.node_rank + local_rank 176 | current_env["RANK"] = str(dist_rank) 177 | current_env["LOCAL_RANK"] = str(local_rank) 178 | 179 | cmd = [args.training_script] + args.training_script_args 180 | 181 | process = subprocess.Popen(cmd, env=current_env) 182 | processes.append(process) 183 | 184 | for process in processes: 185 | process.wait() 186 | if process.returncode != 0: 187 | raise subprocess.CalledProcessError(returncode=process.returncode, 188 | cmd=process.args) 189 | 190 | 191 | if __name__ == "__main__": 192 | main() -------------------------------------------------------------------------------- /datasets/transforms.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Transforms and data augmentation for both image + bbox. 12 | """ 13 | import random 14 | 15 | import PIL 16 | import torch 17 | import torchvision.transforms as T 18 | import torchvision.transforms.functional as F 19 | 20 | from util.box_ops import box_xyxy_to_cxcywh 21 | from util.misc import interpolate 22 | 23 | 24 | def crop(image, target, region): 25 | cropped_image = F.crop(image, *region) 26 | 27 | target = target.copy() 28 | i, j, h, w = region 29 | 30 | # should we do something wrt the original size? 31 | target["size"] = torch.tensor([h, w]) 32 | 33 | fields = ["labels", "area", "iscrowd"] 34 | 35 | if "boxes" in target: 36 | boxes = target["boxes"] 37 | max_size = torch.as_tensor([w, h], dtype=torch.float32) 38 | cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) 39 | cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) 40 | cropped_boxes = cropped_boxes.clamp(min=0) 41 | area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) 42 | target["boxes"] = cropped_boxes.reshape(-1, 4) 43 | target["area"] = area 44 | fields.append("boxes") 45 | 46 | if "masks" in target: 47 | # FIXME should we update the area here if there are no boxes? 
48 | target['masks'] = target['masks'][:, i:i + h, j:j + w] 49 | fields.append("masks") 50 | 51 | # remove elements for which the boxes or masks that have zero area 52 | if "boxes" in target or "masks" in target: 53 | # favor boxes selection when defining which elements to keep 54 | # this is compatible with previous implementation 55 | if "boxes" in target: 56 | cropped_boxes = target['boxes'].reshape(-1, 2, 2) 57 | keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) 58 | else: 59 | keep = target['masks'].flatten(1).any(1) 60 | 61 | for field in fields: 62 | target[field] = target[field][keep] 63 | 64 | return cropped_image, target 65 | 66 | 67 | def hflip(image, target): 68 | flipped_image = F.hflip(image) 69 | 70 | w, h = image.size 71 | 72 | target = target.copy() 73 | if "boxes" in target: 74 | boxes = target["boxes"] 75 | boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) 76 | target["boxes"] = boxes 77 | 78 | if "masks" in target: 79 | target['masks'] = target['masks'].flip(-1) 80 | 81 | return flipped_image, target 82 | 83 | 84 | def resize(image, target, size, max_size=None): 85 | # size can be min_size (scalar) or (w, h) tuple 86 | 87 | def get_size_with_aspect_ratio(image_size, size, max_size=None): 88 | w, h = image_size 89 | if max_size is not None: 90 | min_original_size = float(min((w, h))) 91 | max_original_size = float(max((w, h))) 92 | if max_original_size / min_original_size * size > max_size: 93 | size = int(round(max_size * min_original_size / max_original_size)) 94 | 95 | if (w <= h and w == size) or (h <= w and h == size): 96 | return (h, w) 97 | 98 | if w < h: 99 | ow = size 100 | oh = int(size * h / w) 101 | else: 102 | oh = size 103 | ow = int(size * w / h) 104 | 105 | return (oh, ow) 106 | 107 | def get_size(image_size, size, max_size=None): 108 | if isinstance(size, (list, tuple)): 109 | return size[::-1] 110 | else: 111 | return get_size_with_aspect_ratio(image_size, size, max_size) 112 | 113 | size = get_size(image.size, size, max_size) 114 | rescaled_image = F.resize(image, size) 115 | 116 | if target is None: 117 | return rescaled_image, None 118 | 119 | ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) 120 | ratio_width, ratio_height = ratios 121 | 122 | target = target.copy() 123 | if "boxes" in target: 124 | boxes = target["boxes"] 125 | scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) 126 | target["boxes"] = scaled_boxes 127 | 128 | if "area" in target: 129 | area = target["area"] 130 | scaled_area = area * (ratio_width * ratio_height) 131 | target["area"] = scaled_area 132 | 133 | h, w = size 134 | target["size"] = torch.tensor([h, w]) 135 | 136 | if "masks" in target: 137 | target['masks'] = interpolate( 138 | target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5 139 | 140 | return rescaled_image, target 141 | 142 | 143 | def pad(image, target, padding): 144 | # assumes that we only pad on the bottom right corners 145 | padded_image = F.pad(image, (0, 0, padding[0], padding[1])) 146 | if target is None: 147 | return padded_image, None 148 | target = target.copy() 149 | # should we do something wrt the original size? 
150 | target["size"] = torch.tensor(padded_image[::-1]) 151 | if "masks" in target: 152 | target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1])) 153 | return padded_image, target 154 | 155 | 156 | class RandomCrop(object): 157 | def __init__(self, size): 158 | self.size = size 159 | 160 | def __call__(self, img, target): 161 | region = T.RandomCrop.get_params(img, self.size) 162 | return crop(img, target, region) 163 | 164 | 165 | class RandomSizeCrop(object): 166 | def __init__(self, min_size: int, max_size: int): 167 | self.min_size = min_size 168 | self.max_size = max_size 169 | 170 | def __call__(self, img: PIL.Image.Image, target: dict): 171 | w = random.randint(self.min_size, min(img.width, self.max_size)) 172 | h = random.randint(self.min_size, min(img.height, self.max_size)) 173 | region = T.RandomCrop.get_params(img, [h, w]) 174 | return crop(img, target, region) 175 | 176 | 177 | class CenterCrop(object): 178 | def __init__(self, size): 179 | self.size = size 180 | 181 | def __call__(self, img, target): 182 | image_width, image_height = img.size 183 | crop_height, crop_width = self.size 184 | crop_top = int(round((image_height - crop_height) / 2.)) 185 | crop_left = int(round((image_width - crop_width) / 2.)) 186 | return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) 187 | 188 | 189 | class RandomHorizontalFlip(object): 190 | def __init__(self, p=0.5): 191 | self.p = p 192 | 193 | def __call__(self, img, target): 194 | if random.random() < self.p: 195 | return hflip(img, target) 196 | return img, target 197 | 198 | 199 | class RandomResize(object): 200 | def __init__(self, sizes, max_size=None): 201 | assert isinstance(sizes, (list, tuple)) 202 | self.sizes = sizes 203 | self.max_size = max_size 204 | 205 | def __call__(self, img, target=None): 206 | size = random.choice(self.sizes) 207 | return resize(img, target, size, self.max_size) 208 | 209 | 210 | class RandomPad(object): 211 | def __init__(self, max_pad): 212 | self.max_pad = max_pad 213 | 214 | def __call__(self, img, target): 215 | pad_x = random.randint(0, self.max_pad) 216 | pad_y = random.randint(0, self.max_pad) 217 | return pad(img, target, (pad_x, pad_y)) 218 | 219 | 220 | class RandomSelect(object): 221 | """ 222 | Randomly selects between transforms1 and transforms2, 223 | with probability p for transforms1 and (1 - p) for transforms2 224 | """ 225 | def __init__(self, transforms1, transforms2, p=0.5): 226 | self.transforms1 = transforms1 227 | self.transforms2 = transforms2 228 | self.p = p 229 | 230 | def __call__(self, img, target): 231 | if random.random() < self.p: 232 | return self.transforms1(img, target) 233 | return self.transforms2(img, target) 234 | 235 | 236 | class ToTensor(object): 237 | def __call__(self, img, target): 238 | return F.to_tensor(img), target 239 | 240 | 241 | class RandomErasing(object): 242 | 243 | def __init__(self, *args, **kwargs): 244 | self.eraser = T.RandomErasing(*args, **kwargs) 245 | 246 | def __call__(self, img, target): 247 | return self.eraser(img), target 248 | 249 | 250 | class Normalize(object): 251 | def __init__(self, mean, std): 252 | self.mean = mean 253 | self.std = std 254 | 255 | def __call__(self, image, target=None): 256 | image = F.normalize(image, mean=self.mean, std=self.std) 257 | if target is None: 258 | return image, None 259 | target = target.copy() 260 | h, w = image.shape[-2:] 261 | if "boxes" in target: 262 | boxes = target["boxes"] 263 | boxes = box_xyxy_to_cxcywh(boxes) 264 | boxes = boxes / 
torch.tensor([w, h, w, h], dtype=torch.float32) 265 | target["boxes"] = boxes 266 | return image, target 267 | 268 | 269 | class Compose(object): 270 | def __init__(self, transforms): 271 | self.transforms = transforms 272 | 273 | def __call__(self, image, target): 274 | for t in self.transforms: 275 | image, target = t(image, target) 276 | return image, target 277 | 278 | def __repr__(self): 279 | format_string = self.__class__.__name__ + "(" 280 | for t in self.transforms: 281 | format_string += "\n" 282 | format_string += " {0}".format(t) 283 | format_string += "\n)" 284 | return format_string 285 | -------------------------------------------------------------------------------- /models/backbone.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------ 9 | # Modified from DETR (https://github.com/facebookresearch/detr) 10 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 11 | # ------------------------------------------------------------------------------------ 12 | 13 | 14 | """ 15 | Backbone modules. 16 | """ 17 | from collections import OrderedDict 18 | 19 | import torch 20 | import torch.nn.functional as F 21 | import torchvision 22 | from torch import nn 23 | from torchvision.models._utils import IntermediateLayerGetter 24 | from typing import Dict, List 25 | 26 | from models import swin_transformer 27 | from util.misc import NestedTensor, is_main_process 28 | 29 | from .position_encoding import build_position_encoding 30 | 31 | 32 | class FrozenBatchNorm2d(torch.nn.Module): 33 | """ 34 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 35 | 36 | Copy-paste from torchvision.misc.ops with added eps before rsqrt, 37 | without which any other models than torchvision.models.resnet[18,34,50,101] 38 | produce nans. 
39 | """ 40 | 41 | def __init__(self, n, eps=1e-5): 42 | super(FrozenBatchNorm2d, self).__init__() 43 | self.register_buffer("weight", torch.ones(n)) 44 | self.register_buffer("bias", torch.zeros(n)) 45 | self.register_buffer("running_mean", torch.zeros(n)) 46 | self.register_buffer("running_var", torch.ones(n)) 47 | self.eps = eps 48 | 49 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 50 | missing_keys, unexpected_keys, error_msgs): 51 | num_batches_tracked_key = prefix + 'num_batches_tracked' 52 | if num_batches_tracked_key in state_dict: 53 | del state_dict[num_batches_tracked_key] 54 | 55 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 56 | state_dict, prefix, local_metadata, strict, 57 | missing_keys, unexpected_keys, error_msgs) 58 | 59 | def forward(self, x): 60 | # move reshapes to the beginning 61 | # to make it fuser-friendly 62 | w = self.weight.reshape(1, -1, 1, 1) 63 | b = self.bias.reshape(1, -1, 1, 1) 64 | rv = self.running_var.reshape(1, -1, 1, 1) 65 | rm = self.running_mean.reshape(1, -1, 1, 1) 66 | eps = self.eps 67 | scale = w * (rv + eps).rsqrt() 68 | bias = b - rm * scale 69 | return x * scale + bias 70 | 71 | 72 | class BackboneBase(nn.Module): 73 | 74 | def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool, args): 75 | # TODO: args -> duplicated args 76 | super().__init__() 77 | if 'none' in args.backbone: 78 | self.strides = [1] # not used, actually (length only matters) 79 | self.num_channels = [3] 80 | return_layers = self.get_return_layers('identity', (0,)) 81 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 82 | 83 | elif 'resnet' in args.backbone: 84 | 85 | if not args.backbone_from_scratch and not args.finetune_early_layers: 86 | print("Freeze early layers.") 87 | for name, parameter in backbone.named_parameters(): 88 | if not train_backbone or all([k not in name for k in ['layer2', 'layer3', 'layer4']]): 89 | parameter.requires_grad_(False) 90 | else: 91 | print('Finetune early layers as well.') 92 | 93 | layer_name = "layer" 94 | if return_interm_layers: 95 | return_layers = self.get_return_layers(layer_name, (2, 3, 4)) 96 | self.strides = [8, 16, 32] 97 | self.num_channels = [512, 1024, 2048] 98 | else: 99 | return_layers = self.get_return_layers(layer_name, (4,)) 100 | self.strides = [32] 101 | self.num_channels = [2048] 102 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 103 | 104 | elif 'swin' in args.backbone: 105 | if return_interm_layers: 106 | num_channels = [int(backbone.embed_dim * 2 ** i) for i in range(backbone.num_layers)] 107 | return_layers = [2, 3, 4] 108 | self.strides = [8, 16, 32] 109 | self.num_channels = num_channels[1:] 110 | else: 111 | return_layers = [4] 112 | self.strides = [32] 113 | self.num_channels = num_channels[-1] 114 | self.body = backbone 115 | 116 | else: 117 | raise ValueError(f"Unknown backbone name: {args.backbone}") 118 | 119 | @staticmethod 120 | def get_return_layers(name: str, layer_ids): 121 | return {name + str(n): str(i) for i, n in enumerate(layer_ids)} 122 | 123 | def forward(self, tensor_list: NestedTensor): 124 | xs = self.body(tensor_list.tensors) 125 | out: Dict[str, NestedTensor] = {} 126 | for name, x in xs.items(): 127 | m = tensor_list.mask 128 | assert m is not None 129 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 130 | out[name] = NestedTensor(x, mask) 131 | return out 132 | 133 | 134 | class DummyBackbone(torch.nn.Module): 135 | def 
__init__(self): 136 | super().__init__() 137 | self.identity0 = torch.nn.Identity() 138 | 139 | 140 | class Backbone(BackboneBase): 141 | """ResNet backbone with frozen BatchNorm.""" 142 | def __init__(self, name: str, 143 | train_backbone: bool, 144 | return_interm_layers: bool, 145 | dilation: bool, 146 | args): 147 | print(f"Backbone: {name}") 148 | pretrained = is_main_process() and not args.backbone_from_scratch and not args.scrl_pretrained_path 149 | if not pretrained: 150 | print("Train backbone from scratch.") 151 | else: 152 | print("Load pretrained weights") 153 | 154 | if "none" in name: 155 | backbone = DummyBackbone() 156 | elif "resnet" in name: 157 | assert name not in ("resnet18", "resnet34"), "number of channels are hard coded" 158 | backbone = getattr(torchvision.models, name)( 159 | replace_stride_with_dilation=[False, False, dilation], 160 | pretrained=pretrained, norm_layer=FrozenBatchNorm2d) 161 | elif "swin" in name: 162 | assert not dilation, "not supported" 163 | if not args.backbone_from_scratch and not args.finetune_early_layers: 164 | print("Freeze early layers.") 165 | frozen_stages = 2 166 | else: 167 | print('Finetune early layers as well.') 168 | frozen_stages = -1 169 | if return_interm_layers: 170 | out_indices = [1, 2, 3] 171 | else: 172 | out_indices = [3] 173 | 174 | backbone = swin_transformer.build_model( 175 | name, out_indices=out_indices, frozen_stages=frozen_stages, pretrained=pretrained) 176 | else: 177 | raise ValueError(f"Unknown backbone name: {args.backbone}") 178 | 179 | if args.scrl_pretrained_path: 180 | assert "resnet" in name, "Currently only resnet50 is available." 181 | ckpt = torch.load(args.scrl_pretrained_path, map_location="cpu") 182 | translate_map = { 183 | "encoder.0" : "conv1", 184 | "encoder.1" : "bn1", 185 | "encoder.4" : "layer1", 186 | "encoder.5" : "layer2", 187 | "encoder.6" : "layer3", 188 | "encoder.7" : "layer4", 189 | } 190 | state_dict = { 191 | translate_map[k[:9]] + k[9:] : v 192 | for k, v in ckpt["online_network_state_dict"].items() 193 | if "encoder" in k 194 | } 195 | backbone.load_state_dict(state_dict, strict=False) 196 | 197 | super().__init__(backbone, train_backbone, return_interm_layers, args) 198 | if dilation and "resnet" in name: 199 | self.strides[-1] = self.strides[-1] // 2 200 | 201 | 202 | class Joiner(nn.Sequential): 203 | def __init__(self, backbone, position_embedding): 204 | super().__init__(backbone, position_embedding) 205 | self.strides = backbone.strides 206 | self.num_channels = backbone.num_channels 207 | 208 | def forward(self, tensor_list: NestedTensor): 209 | xs = self[0](tensor_list) 210 | out: List[NestedTensor] = [] 211 | pos = [] 212 | for name, x in sorted(xs.items()): 213 | out.append(x) 214 | 215 | # position encoding 216 | for x in out: 217 | pos.append(self[1](x).to(x.tensors.dtype)) 218 | 219 | return out, pos 220 | 221 | 222 | def test_backbone(backbone): 223 | imgs = [ 224 | torch.randn(2, 3, 633, 122), 225 | torch.randn(2, 3, 322, 532), 226 | torch.randn(2, 3, 236, 42), 227 | ] 228 | return [backbone(img).shape for img in imgs] 229 | 230 | 231 | def build_backbone(args): 232 | # test_backbone(torchvision.models.resnet50()) 233 | position_embedding = build_position_encoding(args) 234 | train_backbone = args.lr_backbone > 0 235 | return_interm_layers = args.masks or (args.num_feature_levels > 1) 236 | backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation, args) 237 | model = Joiner(backbone, position_embedding) 238 | return model 239 | 
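As a quick sanity check of the `FrozenBatchNorm2d` that the ResNet backbone above is built with, the frozen module should match an `nn.BatchNorm2d` in eval mode once the same affine parameters and running statistics are copied into its buffers. A minimal sketch (assuming it is run from the repository root so that `models.backbone` is importable):

```python
import torch
from torch import nn

from models.backbone import FrozenBatchNorm2d

bn = nn.BatchNorm2d(8).eval()
frozen = FrozenBatchNorm2d(8)

# copy the affine parameters and running statistics into the frozen buffers
with torch.no_grad():
    frozen.weight.copy_(bn.weight)
    frozen.bias.copy_(bn.bias)
    frozen.running_mean.copy_(bn.running_mean)
    frozen.running_var.copy_(bn.running_var)

x = torch.randn(2, 8, 16, 16)
with torch.no_grad():
    print(torch.allclose(bn(x), frozen(x), atol=1e-6))  # expected: True
```

Unlike a regular batch norm, the frozen variant exposes no trainable parameters at all, so the batch statistics stay fixed regardless of `model.train()` / `model.eval()`.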
-------------------------------------------------------------------------------- /datasets/coco_eval.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | COCO evaluator that works in distributed mode. 12 | 13 | Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py 14 | The difference is that there is less copy-pasting from pycocotools 15 | in the end of the file, as python3 can suppress prints with contextlib 16 | """ 17 | import os 18 | import contextlib 19 | import copy 20 | import numpy as np 21 | import torch 22 | 23 | from pycocotools.cocoeval import COCOeval 24 | from pycocotools.coco import COCO 25 | import pycocotools.mask as mask_util 26 | 27 | from util.misc import all_gather 28 | 29 | 30 | class CocoEvaluator(object): 31 | def __init__(self, coco_gt, iou_types): 32 | assert isinstance(iou_types, (list, tuple)) 33 | coco_gt = copy.deepcopy(coco_gt) 34 | self.coco_gt = coco_gt 35 | 36 | self.iou_types = iou_types 37 | self.coco_eval = {} 38 | for iou_type in iou_types: 39 | self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) 40 | 41 | self.img_ids = [] 42 | self.eval_imgs = {k: [] for k in iou_types} 43 | 44 | def update(self, predictions): 45 | img_ids = list(np.unique(list(predictions.keys()))) 46 | self.img_ids.extend(img_ids) 47 | 48 | for iou_type in self.iou_types: 49 | results = self.prepare(predictions, iou_type) 50 | 51 | # suppress pycocotools prints 52 | with open(os.devnull, 'w') as devnull: 53 | with contextlib.redirect_stdout(devnull): 54 | coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() 55 | coco_eval = self.coco_eval[iou_type] 56 | 57 | coco_eval.cocoDt = coco_dt 58 | coco_eval.params.imgIds = list(img_ids) 59 | img_ids, eval_imgs = evaluate(coco_eval) 60 | 61 | self.eval_imgs[iou_type].append(eval_imgs) 62 | 63 | def synchronize_between_processes(self): 64 | for iou_type in self.iou_types: 65 | self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) 66 | create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) 67 | 68 | def accumulate(self): 69 | for coco_eval in self.coco_eval.values(): 70 | coco_eval.accumulate() 71 | 72 | def summarize(self): 73 | for iou_type, coco_eval in self.coco_eval.items(): 74 | print("IoU metric: {}".format(iou_type)) 75 | coco_eval.summarize() 76 | 77 | def prepare(self, predictions, iou_type): 78 | if iou_type == "bbox": 79 | return self.prepare_for_coco_detection(predictions) 80 | elif iou_type == "segm": 81 | return self.prepare_for_coco_segmentation(predictions) 82 | elif iou_type == "keypoints": 83 | return self.prepare_for_coco_keypoint(predictions) 84 | else: 85 | raise ValueError("Unknown iou type {}".format(iou_type)) 86 | 87 | def prepare_for_coco_detection(self, predictions): 88 | coco_results = [] 89 | for original_id, prediction in predictions.items(): 90 | if len(prediction) == 0: 91 | continue 92 | 93 | boxes = 
prediction["boxes"] 94 | boxes = convert_to_xywh(boxes).tolist() 95 | scores = prediction["scores"].tolist() 96 | labels = prediction["labels"].tolist() 97 | 98 | coco_results.extend( 99 | [ 100 | { 101 | "image_id": original_id, 102 | "category_id": labels[k], 103 | "bbox": box, 104 | "score": scores[k], 105 | } 106 | for k, box in enumerate(boxes) 107 | ] 108 | ) 109 | return coco_results 110 | 111 | def prepare_for_coco_segmentation(self, predictions): 112 | coco_results = [] 113 | for original_id, prediction in predictions.items(): 114 | if len(prediction) == 0: 115 | continue 116 | 117 | scores = prediction["scores"] 118 | labels = prediction["labels"] 119 | masks = prediction["masks"] 120 | 121 | masks = masks > 0.5 122 | 123 | scores = prediction["scores"].tolist() 124 | labels = prediction["labels"].tolist() 125 | 126 | rles = [ 127 | mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] 128 | for mask in masks 129 | ] 130 | for rle in rles: 131 | rle["counts"] = rle["counts"].decode("utf-8") 132 | 133 | coco_results.extend( 134 | [ 135 | { 136 | "image_id": original_id, 137 | "category_id": labels[k], 138 | "segmentation": rle, 139 | "score": scores[k], 140 | } 141 | for k, rle in enumerate(rles) 142 | ] 143 | ) 144 | return coco_results 145 | 146 | def prepare_for_coco_keypoint(self, predictions): 147 | coco_results = [] 148 | for original_id, prediction in predictions.items(): 149 | if len(prediction) == 0: 150 | continue 151 | 152 | boxes = prediction["boxes"] 153 | boxes = convert_to_xywh(boxes).tolist() 154 | scores = prediction["scores"].tolist() 155 | labels = prediction["labels"].tolist() 156 | keypoints = prediction["keypoints"] 157 | keypoints = keypoints.flatten(start_dim=1).tolist() 158 | 159 | coco_results.extend( 160 | [ 161 | { 162 | "image_id": original_id, 163 | "category_id": labels[k], 164 | 'keypoints': keypoint, 165 | "score": scores[k], 166 | } 167 | for k, keypoint in enumerate(keypoints) 168 | ] 169 | ) 170 | return coco_results 171 | 172 | 173 | def convert_to_xywh(boxes): 174 | xmin, ymin, xmax, ymax = boxes.unbind(1) 175 | return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) 176 | 177 | 178 | def merge(img_ids, eval_imgs): 179 | all_img_ids = all_gather(img_ids) 180 | all_eval_imgs = all_gather(eval_imgs) 181 | 182 | merged_img_ids = [] 183 | for p in all_img_ids: 184 | merged_img_ids.extend(p) 185 | 186 | merged_eval_imgs = [] 187 | for p in all_eval_imgs: 188 | merged_eval_imgs.append(p) 189 | 190 | merged_img_ids = np.array(merged_img_ids) 191 | merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) 192 | 193 | # keep only unique (and in sorted order) images 194 | merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) 195 | merged_eval_imgs = merged_eval_imgs[..., idx] 196 | 197 | return merged_img_ids, merged_eval_imgs 198 | 199 | 200 | def create_common_coco_eval(coco_eval, img_ids, eval_imgs): 201 | img_ids, eval_imgs = merge(img_ids, eval_imgs) 202 | img_ids = list(img_ids) 203 | eval_imgs = list(eval_imgs.flatten()) 204 | 205 | coco_eval.evalImgs = eval_imgs 206 | coco_eval.params.imgIds = img_ids 207 | coco_eval._paramsEval = copy.deepcopy(coco_eval.params) 208 | 209 | 210 | ################################################################# 211 | # From pycocotools, just removed the prints and fixed 212 | # a Python3 bug about unicode not defined 213 | ################################################################# 214 | 215 | 216 | def evaluate(self): 217 | ''' 218 | Run per image 
evaluation on given images and store results (a list of dict) in self.evalImgs 219 | :return: None 220 | ''' 221 | # tic = time.time() 222 | # print('Running per image evaluation...') 223 | p = self.params 224 | # add backward compatibility if useSegm is specified in params 225 | if p.useSegm is not None: 226 | p.iouType = 'segm' if p.useSegm == 1 else 'bbox' 227 | print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)) 228 | # print('Evaluate annotation type *{}*'.format(p.iouType)) 229 | p.imgIds = list(np.unique(p.imgIds)) 230 | if p.useCats: 231 | p.catIds = list(np.unique(p.catIds)) 232 | p.maxDets = sorted(p.maxDets) 233 | self.params = p 234 | 235 | self._prepare() 236 | # loop through images, area range, max detection number 237 | catIds = p.catIds if p.useCats else [-1] 238 | 239 | if p.iouType == 'segm' or p.iouType == 'bbox': 240 | computeIoU = self.computeIoU 241 | elif p.iouType == 'keypoints': 242 | computeIoU = self.computeOks 243 | self.ious = { 244 | (imgId, catId): computeIoU(imgId, catId) 245 | for imgId in p.imgIds 246 | for catId in catIds} 247 | 248 | evaluateImg = self.evaluateImg 249 | maxDet = p.maxDets[-1] 250 | evalImgs = [ 251 | evaluateImg(imgId, catId, areaRng, maxDet) 252 | for catId in catIds 253 | for areaRng in p.areaRng 254 | for imgId in p.imgIds 255 | ] 256 | # this is NOT in the pycocotools code, but could be done outside 257 | evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) 258 | self._paramsEval = copy.deepcopy(self.params) 259 | # toc = time.time() 260 | # print('DONE (t={:0.2f}s).'.format(toc-tic)) 261 | return p.imgIds, evalImgs 262 | 263 | ################################################################# 264 | # end of straight copy from pycocotools, just removing the prints 265 | ################################################################# 266 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![KakaoBrain](https://img.shields.io/badge/kakao-brain-ffcd00.svg)](http://kakaobrain.com/) 2 | [![pytorch](https://img.shields.io/badge/pytorch-1.6.0-%2523ee4c2c.svg)](https://pytorch.org/) 3 | [![pytorch](https://img.shields.io/badge/pytorch-1.7.1-%2523ee4c2c.svg)](https://pytorch.org/) 4 | 5 | Sparse DETR (ICLR'22) 6 | ======== 7 | 8 | By [Byungseok Roh](https://scholar.google.com/citations?user=H4VWYHwAAAAJ)\*, [Jaewoong Shin](https://scholar.google.com/citations?user=i_o_95kAAAAJ)\*, [Wuhyun Shin](https://scholar.google.com/citations?user=bGwfkakAAAAJ)\*, and [Saehoon Kim](https://scholar.google.com/citations?user=_ZfueMIAAAAJ) at [Kakao Brain](https://www.kakaobrain.com). 9 | (*: Equal contribution) 10 | 11 | * This repository is an official implementation of the paper [Sparse DETR: Efficient End-to-End Object Detection with Learnable Sparsity](https://arxiv.org/abs/2111.14330). 12 | * The code and some instructions are built upon the official [Deformable DETR repository](https://github.com/fundamentalvision/Deformable-DETR). 13 | 14 | 15 | 16 | # Introduction 17 | 18 | **TL; DR.** Sparse DETR is an efficient end-to-end object detector that **sparsifies encoder tokens** by using the learnable DAM(Decoder Attention Map) predictor. It achieves better performance than Deformable DETR even with only 10% encoder queries on the COCO dataset. 19 | 20 |
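To make the idea concrete, the sketch below illustrates top-ρ token selection with a learned scoring network. It is only an illustration, not the code in this repository: `select_encoder_tokens` and `scoring_net` are made-up names, and the real model scores multi-scale flattened features with a predictor that is trained against the decoder attention map (DAM).

```python
import torch

def select_encoder_tokens(tokens, scoring_net, rho=0.1):
    """Keep only the top `rho` fraction of encoder tokens, ranked by a saliency score."""
    # tokens: (batch, num_tokens, dim)
    scores = scoring_net(tokens).squeeze(-1)            # (batch, num_tokens)
    k = max(1, int(rho * tokens.shape[1]))
    topk_idx = scores.topk(k, dim=1).indices            # (batch, k)
    selected = torch.gather(
        tokens, 1, topk_idx.unsqueeze(-1).expand(-1, -1, tokens.shape[-1]))
    return selected, topk_idx                           # (batch, k, dim), (batch, k)

# toy usage: keep 10% of 1000 tokens
tokens = torch.randn(2, 1000, 256)
scoring_net = torch.nn.Linear(256, 1)
selected, idx = select_encoder_tokens(tokens, scoring_net, rho=0.1)
print(selected.shape)  # torch.Size([2, 100, 256])
```

Only the selected tokens are refined by the encoder layers, while the remaining tokens are passed through unchanged; the keeping ratio corresponds to the `--rho` argument used in the training commands below.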

21 | 22 |

23 | 24 | **Abstract.** DETR is the first end-to-end object detector using a transformer encoder-decoder architecture and demonstrates competitive performance but low computational efficiency on high-resolution feature maps. 25 | The subsequent work, Deformable DETR, enhances the efficiency of DETR by replacing dense attention with deformable attention, which achieves 10x faster convergence and improved performance. 26 | Deformable DETR uses multi-scale features to improve performance; however, the number of encoder tokens increases by 20x compared to DETR, and the computation cost of the encoder attention remains a bottleneck. 27 | In our preliminary experiment, we observe that the detection performance hardly deteriorates even if only a part of the encoder tokens is updated. 28 | Inspired by this observation, we propose *Sparse DETR*, which selectively updates only the tokens expected to be referenced by the decoder, thus helping the model effectively detect objects. 29 | In addition, we show that applying an auxiliary detection loss on the selected tokens in the encoder improves the performance while minimizing computational overhead. 30 | We validate that *Sparse DETR* achieves better performance than Deformable DETR even with only 10% encoder tokens on the COCO dataset. 31 | Although only the encoder tokens are sparsified, the total computation cost decreases by 38% and the frames per second (FPS) increases by 42% compared to Deformable DETR. 32 | 33 | 34 | # Installation 35 | 36 | ## Requirements 37 | 38 | We have tested the code on the following environments: 39 | * Python 3.7.7 / PyTorch 1.6.0 / torchvision 0.7.0 / CUDA 10.1 / Ubuntu 18.04 40 | * Python 3.8.3 / PyTorch 1.7.1 / torchvision 0.8.2 / CUDA 11.1 / Ubuntu 18.04 41 | 42 | Run the following command to install dependencies: 43 | ```bash 44 | pip install -r requirements.txt 45 | ``` 46 | 47 | ## Compiling CUDA operators 48 | ```bash 49 | cd ./models/ops 50 | sh ./make.sh 51 | # unit test (you should see that all checks are True) 52 | python test.py 53 | ``` 54 | 55 | # Usage 56 | 57 | ## Dataset preparation 58 | 59 | Please download the [COCO 2017 dataset](https://cocodataset.org/) and organize it as follows: 60 | 61 | ``` 62 | code_root/ 63 | └── data/ 64 | └── coco/ 65 | ├── train2017/ 66 | ├── val2017/ 67 | └── annotations/ 68 | ├── instances_train2017.json 69 | └── instances_val2017.json 70 | ``` 71 | 72 | ## Training 73 | 74 | ### Training on a single node 75 | 76 | For example, the command for training Sparse DETR with a keeping ratio of 10% on 8 GPUs is as follows: 77 | 78 | ```bash 79 | $ GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 8 ./configs/swint_sparse_detr_rho_0.1.sh 80 | ``` 81 | 82 | ### Training on multiple nodes 83 | 84 | For example, the command for training Sparse DETR with a keeping ratio of 10% on 2 nodes, each with 8 GPUs, is as follows: 85 | 86 | On node 1: 87 | 88 | ```bash 89 | $ MASTER_ADDR= NODE_RANK=0 GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 16 ./configs/swint_sparse_detr_rho_0.1.sh 90 | ``` 91 | 92 | On node 2: 93 | 94 | ```bash 95 | $ MASTER_ADDR= NODE_RANK=1 GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 16 ./configs/swint_sparse_detr_rho_0.1.sh 96 | ``` 97 | 98 | ### Direct argument control 99 | 100 | ```bash 101 | # Deformable DETR (with the bounding-box-refinement and two-stage arguments, if desired) 102 | $ GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 8 python main.py --with_box_refine --two_stage 103 | # Efficient DETR (with the class-specific head as described in their paper) 104 | $ GPUS_PER_NODE=8
./tools/run_dist_launch.sh 8 python main.py --with_box_refine --two_stage --eff_query_init --eff_specific_head 105 | # Sparse DETR (with a keeping ratio of 10% and encoder auxiliary loss) 106 | $ GPUS_PER_NODE=8 ./tools/run_dist_launch.sh 8 python main.py --with_box_refine --two_stage --eff_query_init --eff_specific_head --rho 0.1 --use_enc_aux_loss 107 | ``` 108 | 109 | ### Some tips to speed up training 110 | * If your file system is slow to read images, you may consider enabling the '--cache_mode' option to load the whole dataset into memory at the beginning of training. 111 | * You may increase the batch size to maximize GPU utilization, depending on your GPU memory, e.g., set '--batch_size 3' or '--batch_size 4'. 112 | 113 | ## Evaluation 114 | 115 | You can download a pre-trained Sparse DETR model (the links are in the "Main Results" section below), and then run the following command to evaluate it on the COCO 2017 validation set: 116 | 117 | ```bash 118 | # Note that you should run the command with the corresponding configuration. 119 | $ ./configs/swint_sparse_detr_rho_0.1.sh --resume --eval 120 | ``` 121 | 122 | You can also run distributed evaluation by using ```./tools/run_dist_launch.sh```. 123 | 124 | # Main Results 125 | The tables below demonstrate the detection performance of Sparse DETR on the COCO 2017 validation set when using different backbones. 126 | * **Top-k** : sampling the top-k object queries instead of using the learned object queries (as in Efficient DETR). 127 | * **BBR** : performing bounding box refinement in the decoder block (as in Deformable DETR). 128 | * The **encoder auxiliary loss** proposed in our paper is only applied to Sparse DETR. 129 | * **FLOPs** and **FPS** are measured in the same way as used in Deformable DETR. 130 | * Refer to **Table 1** in the paper for more details.
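The download links in the tables below point to standalone checkpoint files. If you want to sanity-check one before passing it to `--resume`, a quick inspection along these lines works (the key layout mentioned in the comments is an assumption; adjust to whatever `torch.load` actually reports):

```python
import torch

# e.g. the ResNet-50 checkpoint with a 10% keeping ratio from the table below
ckpt = torch.load("sparse_detr_r50_10.pth", map_location="cpu")
print(sorted(ckpt.keys()))             # DETR-style checkpoints usually contain 'model', 'optimizer', 'epoch', ...
state_dict = ckpt.get("model", ckpt)   # fall back to a bare state dict if there is no 'model' key
print(len(state_dict), "tensors in the model state dict")
```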
131 | 132 | 133 | 134 | ## ResNet-50 backbone 135 | | Method | Epochs | ρ | Top-k & BBR | AP | #Params(M) | GFLOPs | B4FPS | Download | 136 | |:------------------:|:------:|:---:|:-----------:|:----:|:----------:|:------:|:-----:|:--------:| 137 | | Faster R-CNN + FPN | 109 | N/A | | 42.0 | 42M | 180G | 26 | | 138 | | DETR | 50 | N/A | | 35.0 | 41M | 86G | 28 | | 139 | | DETR | 500 | N/A | | 42.0 | 41M | 86G | 28 | | 140 | | DETR-DC5 | 500 | N/A | | 43.3 | 41M | 187G | 12 | | 141 | | PnP-DETR | 500 | 33% | | 41.1 | | | | | 142 | | | 500 | 50% | | 41.8 | | | | | 143 | | PnP-DETR-DC5 | 500 | 33% | | 42.7 | | | | | 144 | | | 500 | 50% | | 43.1 | | | | | 145 | | Deformable-DETR | 50 | N/A | | 43.9 | 39.8M | 172.9G | 19.1 | | 146 | | | 50 | N/A | o | 46.0 | 40.8M | 177.3G | 18.2 | | 147 | | Sparse-DETR | 50 | 10% | o | 45.3 | 40.9M | 105.4G | 26.5 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_10.pth) | 148 | | | 50 | 20% | o | 45.6 | 40.9M | 112.9G | 24.8 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_20.pth) | 149 | | | 50 | 30% | o | 46.0 | 40.9M | 120.5G | 23.2 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_30.pth) | 150 | | | 50 | 40% | o | 46.2 | 40.9M | 128.0G | 21.8 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_40.pth) | 151 | | | 50 | 50% | o | 46.3 | 40.9M | 135.6G | 20.5 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_50.pth) | 152 | 153 | 154 | 155 | ## Swin-T backbone 156 | | Method | Epochs | ρ | Top-k & BBR | AP | #Params(M) | GFLOPs | B4FPS | Download | 157 | |:---------------:|:------:|:---:|:-----------:|:----:|:----------:|:------:|:-----:|:--------:| 158 | | DETR | 50 | N/A | | 35.9 | 45.0M | 91.6G | 26.8 | | 159 | | DETR | 500 | N/A | | 45.4 | 45.0M | 91.6G | 26.8 | | 160 | | Deformable-DETR | 50 | N/A | | 45.7 | 40.3M | 180.4G | 15.9 | | 161 | | | 50 | N/A | o | 48.0 | 41.3M | 184.8G | 15.4 | | 162 | | Sparse-DETR | 50 | 10% | o | 48.2 | 41.4M | 113.4G | 21.2 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_swint_10.pth) | 163 | | | 50 | 20% | o | 48.8 | 41.4M | 121.0G | 20 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_swint_20.pth) | 164 | | | 50 | 30% | o | 49.1 | 41.4M | 128.5G | 18.9 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_swint_30.pth) | 165 | | | 50 | 40% | o | 49.2 | 41.4M | 136.1G | 18 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_swint_40.pth) | 166 | | | 50 | 50% | o | 49.3 | 41.4M | 143.7G | 17.2 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_swint_50.pth) | 167 | 168 | 169 | ## Initializing ResNet-50 backbone with SCRL 170 | The performance of Sparse DETR can be further improved when the backbone network is initialized with the `SCRL`([Spatially Consistent Representation Learning](https://arxiv.org/abs/2103.06122)) that aims to learn dense representations in a self-supervised way, compared to the default initialization with the ImageNet pre-trained one, denoted as `IN-sup` in the table below. 171 | * We obtained pre-trained weights from [Torchvision](https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html#sphx-glr-beginner-finetuning-torchvision-models-tutorial-py) for `IN-sup`, and the [SCRL GitHub repository](https://github.com/kakaobrain/scrl) for `SCRL`. 172 | * To reproduce the `SCRL` results, add `--scrl_pretrained_path ` to the training command. 
173 | 174 | | Method | ρ | AP(IN-sup) | AP(SCRL) | AP(gain) | Download | 175 | |:-----------:|:---:|:-----------:|:--------:|:--------:|:--------:| 176 | | Sparse DETR | 10% | 45.3 | 46.9 | +1.6 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_scrl_10.pth) | 177 | | | 20% | 45.6 | 47.2 | +1.7 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_scrl_20.pth) | 178 | | | 30% | 46.0 | 47.4 | +1.4 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_scrl_30.pth) | 179 | | | 40% | 46.2 | 47.7 | +1.5 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_scrl_40.pth) | 180 | | | 50% | 46.3 | 47.9 | +1.6 | [link](https://twg.kakaocdn.net/brainrepo/sparse_detr/sparse_detr_r50_scrl_50.pth) | 181 | 182 | 183 | # Citation 184 | If you find Sparse DETR useful in your research, please consider citing: 185 | ```bibtex 186 | @inproceedings{roh2022sparse, 187 | title={Sparse DETR: Efficient End-to-End Object Detection with Learnable Sparsity}, 188 | author={Roh, Byungseok and Shin, JaeWoong and Shin, Wuhyun and Kim, Saehoon}, 189 | booktitle={ICLR}, 190 | year={2022} 191 | } 192 | ``` 193 | 194 | # License 195 | 196 | This project is released under the [Apache 2.0 license](./LICENSE). 197 | Copyright 2021 [Kakao Brain Corp](https://www.kakaobrain.com). All Rights Reserved. 198 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 KAKAO BRAIN Corp. All Rights Reserved. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | 204 | 205 | Deformable DETR 206 | 207 | Copyright 2020 SenseTime 208 | 209 | Licensed under the Apache License, Version 2.0 (the "License"); 210 | you may not use this file except in compliance with the License. 211 | You may obtain a copy of the License at 212 | 213 | http://www.apache.org/licenses/LICENSE-2.0 214 | 215 | Unless required by applicable law or agreed to in writing, software 216 | distributed under the License is distributed on an "AS IS" BASIS, 217 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
218 | See the License for the specific language governing permissions and 219 | limitations under the License. 220 | 221 | 222 | 223 | DETR 224 | 225 | Copyright 2020 - present, Facebook, Inc 226 | 227 | Licensed under the Apache License, Version 2.0 (the "License"); 228 | you may not use this file except in compliance with the License. 229 | You may obtain a copy of the License at 230 | 231 | http://www.apache.org/licenses/LICENSE-2.0 232 | 233 | Unless required by applicable law or agreed to in writing, software 234 | distributed under the License is distributed on an "AS IS" BASIS, 235 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 236 | See the License for the specific language governing permissions and 237 | limitations under the License. 238 | -------------------------------------------------------------------------------- /models/segmentation.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------ 2 | # Sparse DETR 3 | # Copyright (c) 2021 KakaoBrain. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------ 6 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 7 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 8 | # ------------------------------------------------------------------------------------ 9 | # Modified from DETR (https://github.com/facebookresearch/detr) 10 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 11 | # ------------------------------------------------------------------------------------ 12 | 13 | 14 | """ 15 | This file provides the definition of the convolutional heads used to predict masks, as well as the losses 16 | """ 17 | import io 18 | from collections import defaultdict 19 | 20 | import torch 21 | import torch.nn as nn 22 | import torch.nn.functional as F 23 | from PIL import Image 24 | 25 | import util.box_ops as box_ops 26 | from util.misc import NestedTensor, interpolate, nested_tensor_from_tensor_list 27 | 28 | try: 29 | from panopticapi.utils import id2rgb, rgb2id 30 | except ImportError: 31 | pass 32 | 33 | 34 | class DETRsegm(nn.Module): 35 | def __init__(self, detr, freeze_detr=False): 36 | super().__init__() 37 | self.detr = detr 38 | 39 | if freeze_detr: 40 | for p in self.parameters(): 41 | p.requires_grad_(False) 42 | 43 | hidden_dim, nheads = detr.transformer.d_model, detr.transformer.nhead 44 | self.bbox_attention = MHAttentionMap(hidden_dim, hidden_dim, nheads, dropout=0) 45 | self.mask_head = MaskHeadSmallConv(hidden_dim + nheads, [1024, 512, 256], hidden_dim) 46 | 47 | def forward(self, samples: NestedTensor): 48 | if not isinstance(samples, NestedTensor): 49 | samples = nested_tensor_from_tensor_list(samples) 50 | features, pos = self.detr.backbone(samples) 51 | 52 | bs = features[-1].tensors.shape[0] 53 | 54 | src, mask = features[-1].decompose() 55 | src_proj = self.detr.input_proj(src) 56 | hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1]) 57 | 58 | outputs_class = self.detr.class_embed(hs) 59 | outputs_coord = self.detr.bbox_embed(hs).sigmoid() 60 | out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]} 61 | if self.detr.aux_loss: 62 | out["aux_outputs"] = [ 63 | {"pred_logits": a, "pred_boxes": b} for a, b in 
zip(outputs_class[:-1], outputs_coord[:-1]) 64 | ] 65 | 66 | # FIXME h_boxes takes the last one computed, keep this in mind 67 | bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask) 68 | 69 | seg_masks = self.mask_head(src_proj, bbox_mask, [features[2].tensors, features[1].tensors, features[0].tensors]) 70 | outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) 71 | 72 | out["pred_masks"] = outputs_seg_masks 73 | return out 74 | 75 | 76 | class MaskHeadSmallConv(nn.Module): 77 | """ 78 | Simple convolutional head, using group norm. 79 | Upsampling is done using a FPN approach 80 | """ 81 | 82 | def __init__(self, dim, fpn_dims, context_dim): 83 | super().__init__() 84 | 85 | inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] 86 | self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1) 87 | self.gn1 = torch.nn.GroupNorm(8, dim) 88 | self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1) 89 | self.gn2 = torch.nn.GroupNorm(8, inter_dims[1]) 90 | self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) 91 | self.gn3 = torch.nn.GroupNorm(8, inter_dims[2]) 92 | self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) 93 | self.gn4 = torch.nn.GroupNorm(8, inter_dims[3]) 94 | self.lay5 = torch.nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) 95 | self.gn5 = torch.nn.GroupNorm(8, inter_dims[4]) 96 | self.out_lay = torch.nn.Conv2d(inter_dims[4], 1, 3, padding=1) 97 | 98 | self.dim = dim 99 | 100 | self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1) 101 | self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1) 102 | self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1) 103 | 104 | for m in self.modules(): 105 | if isinstance(m, nn.Conv2d): 106 | nn.init.kaiming_uniform_(m.weight, a=1) 107 | nn.init.constant_(m.bias, 0) 108 | 109 | def forward(self, x, bbox_mask, fpns): 110 | def expand(tensor, length): 111 | return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) 112 | 113 | x = torch.cat([expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) 114 | 115 | x = self.lay1(x) 116 | x = self.gn1(x) 117 | x = F.relu(x) 118 | x = self.lay2(x) 119 | x = self.gn2(x) 120 | x = F.relu(x) 121 | 122 | cur_fpn = self.adapter1(fpns[0]) 123 | if cur_fpn.size(0) != x.size(0): 124 | cur_fpn = expand(cur_fpn, x.size(0) / cur_fpn.size(0)) 125 | x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") 126 | x = self.lay3(x) 127 | x = self.gn3(x) 128 | x = F.relu(x) 129 | 130 | cur_fpn = self.adapter2(fpns[1]) 131 | if cur_fpn.size(0) != x.size(0): 132 | cur_fpn = expand(cur_fpn, x.size(0) / cur_fpn.size(0)) 133 | x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") 134 | x = self.lay4(x) 135 | x = self.gn4(x) 136 | x = F.relu(x) 137 | 138 | cur_fpn = self.adapter3(fpns[2]) 139 | if cur_fpn.size(0) != x.size(0): 140 | cur_fpn = expand(cur_fpn, x.size(0) / cur_fpn.size(0)) 141 | x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") 142 | x = self.lay5(x) 143 | x = self.gn5(x) 144 | x = F.relu(x) 145 | 146 | x = self.out_lay(x) 147 | return x 148 | 149 | 150 | class MHAttentionMap(nn.Module): 151 | """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" 152 | 153 | def __init__(self, query_dim, hidden_dim, num_heads, dropout=0, bias=True): 154 | super().__init__() 155 | self.num_heads = num_heads 156 | self.hidden_dim = 
hidden_dim 157 | self.dropout = nn.Dropout(dropout) 158 | 159 | self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) 160 | self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) 161 | 162 | nn.init.zeros_(self.k_linear.bias) 163 | nn.init.zeros_(self.q_linear.bias) 164 | nn.init.xavier_uniform_(self.k_linear.weight) 165 | nn.init.xavier_uniform_(self.q_linear.weight) 166 | self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 167 | 168 | def forward(self, q, k, mask=None): 169 | q = self.q_linear(q) 170 | k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) 171 | qh = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) 172 | kh = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) 173 | weights = torch.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) 174 | 175 | if mask is not None: 176 | weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) 177 | weights = F.softmax(weights.flatten(2), dim=-1).view_as(weights) 178 | weights = self.dropout(weights) 179 | return weights 180 | 181 | 182 | def dice_loss(inputs, targets, num_boxes): 183 | """ 184 | Compute the DICE loss, similar to generalized IOU for masks 185 | Args: 186 | inputs: A float tensor of arbitrary shape. 187 | The predictions for each example. 188 | targets: A float tensor with the same shape as inputs. Stores the binary 189 | classification label for each element in inputs 190 | (0 for the negative class and 1 for the positive class). 191 | """ 192 | inputs = inputs.sigmoid() 193 | inputs = inputs.flatten(1) 194 | numerator = 2 * (inputs * targets).sum(1) 195 | denominator = inputs.sum(-1) + targets.sum(-1) 196 | loss = 1 - (numerator + 1) / (denominator + 1) 197 | return loss.sum() / num_boxes 198 | 199 | 200 | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, idx=None): 201 | """ 202 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 203 | Args: 204 | inputs: A float tensor of arbitrary shape. 205 | The predictions for each example. 206 | targets: A float tensor with the same shape as inputs. Stores the binary 207 | classification label for each element in inputs 208 | (0 for the negative class and 1 for the positive class). 209 | alpha: (optional) Weighting factor in range (0,1) to balance 210 | positive vs negative examples. Default = -1 (no weighting). 211 | gamma: Exponent of the modulating factor (1 - p_t) to 212 | balance easy vs hard examples. 
213 | Returns: 214 | Loss tensor 215 | """ 216 | prob = inputs.sigmoid() 217 | ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") 218 | p_t = prob * targets + (1 - prob) * (1 - targets) 219 | loss = ce_loss * ((1 - p_t) ** gamma) 220 | 221 | if alpha >= 0: 222 | alpha_t = alpha * targets + (1 - alpha) * (1 - targets) 223 | loss = alpha_t * loss 224 | if idx is not None: 225 | return loss[idx].mean(1).sum() / num_boxes 226 | return loss.mean(1).sum() / num_boxes 227 | 228 | 229 | class PostProcessSegm(nn.Module): 230 | def __init__(self, threshold=0.5): 231 | super().__init__() 232 | self.threshold = threshold 233 | 234 | @torch.no_grad() 235 | def forward(self, results, outputs, orig_target_sizes, max_target_sizes): 236 | assert len(orig_target_sizes) == len(max_target_sizes) 237 | max_h, max_w = max_target_sizes.max(0)[0].tolist() 238 | outputs_masks = outputs["pred_masks"].squeeze(2) 239 | outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False) 240 | outputs_masks = (outputs_masks.sigmoid() > self.threshold).cpu() 241 | 242 | for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): 243 | img_h, img_w = t[0], t[1] 244 | results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) 245 | results[i]["masks"] = F.interpolate( 246 | results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" 247 | ).byte() 248 | 249 | return results 250 | 251 | 252 | class PostProcessPanoptic(nn.Module): 253 | """This class converts the output of the model to the final panoptic result, in the format expected by the 254 | coco panoptic API """ 255 | 256 | def __init__(self, is_thing_map, threshold=0.85): 257 | """ 258 | Parameters: 259 | is_thing_map: This is a whose keys are the class ids, and the values a boolean indicating whether 260 | the class is a thing (True) or a stuff (False) class 261 | threshold: confidence threshold: segments with confidence lower than this will be deleted 262 | """ 263 | super().__init__() 264 | self.threshold = threshold 265 | self.is_thing_map = is_thing_map 266 | 267 | def forward(self, outputs, processed_sizes, target_sizes=None): 268 | """ This function computes the panoptic prediction from the model's predictions. 269 | Parameters: 270 | outputs: This is a dict coming directly from the model. See the model doc for the content. 271 | processed_sizes: This is a list of tuples (or torch tensors) of sizes of the images that were passed to the 272 | model, ie the size after data augmentation but before batching. 273 | target_sizes: This is a list of tuples (or torch tensors) corresponding to the requested final size 274 | of each prediction. 
If left to None, it will default to the processed_sizes 275 | """ 276 | if target_sizes is None: 277 | target_sizes = processed_sizes 278 | assert len(processed_sizes) == len(target_sizes) 279 | out_logits, raw_masks, raw_boxes = outputs["pred_logits"], outputs["pred_masks"], outputs["pred_boxes"] 280 | assert len(out_logits) == len(raw_masks) == len(target_sizes) 281 | preds = [] 282 | 283 | def to_tuple(tup): 284 | if isinstance(tup, tuple): 285 | return tup 286 | return tuple(tup.cpu().tolist()) 287 | 288 | for cur_logits, cur_masks, cur_boxes, size, target_size in zip( 289 | out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes 290 | ): 291 | # we filter empty queries and detection below threshold 292 | scores, labels = cur_logits.softmax(-1).max(-1) 293 | keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (scores > self.threshold) 294 | cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) 295 | cur_scores = cur_scores[keep] 296 | cur_classes = cur_classes[keep] 297 | cur_masks = cur_masks[keep] 298 | cur_masks = interpolate(cur_masks[None], to_tuple(size), mode="bilinear").squeeze(0) 299 | cur_boxes = box_ops.box_cxcywh_to_xyxy(cur_boxes[keep]) 300 | 301 | h, w = cur_masks.shape[-2:] 302 | assert len(cur_boxes) == len(cur_classes) 303 | 304 | # It may be that we have several predicted masks for the same stuff class. 305 | # In the following, we track the list of masks ids for each stuff class (they are merged later on) 306 | cur_masks = cur_masks.flatten(1) 307 | stuff_equiv_classes = defaultdict(lambda: []) 308 | for k, label in enumerate(cur_classes): 309 | if not self.is_thing_map[label.item()]: 310 | stuff_equiv_classes[label.item()].append(k) 311 | 312 | def get_ids_area(masks, scores, dedup=False): 313 | # This helper function creates the final panoptic segmentation image 314 | # It also returns the area of the masks that appears on the image 315 | 316 | m_id = masks.transpose(0, 1).softmax(-1) 317 | 318 | if m_id.shape[-1] == 0: 319 | # We didn't detect any mask :( 320 | m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) 321 | else: 322 | m_id = m_id.argmax(-1).view(h, w) 323 | 324 | if dedup: 325 | # Merge the masks corresponding to the same stuff class 326 | for equiv in stuff_equiv_classes.values(): 327 | if len(equiv) > 1: 328 | for eq_id in equiv: 329 | m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) 330 | 331 | final_h, final_w = to_tuple(target_size) 332 | 333 | seg_img = Image.fromarray(id2rgb(m_id.view(h, w).cpu().numpy())) 334 | seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) 335 | 336 | np_seg_img = ( 337 | torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())).view(final_h, final_w, 3).numpy() 338 | ) 339 | m_id = torch.from_numpy(rgb2id(np_seg_img)) 340 | 341 | area = [] 342 | for i in range(len(scores)): 343 | area.append(m_id.eq(i).sum().item()) 344 | return area, seg_img 345 | 346 | area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) 347 | if cur_classes.numel() > 0: 348 | # We know filter empty masks as long as we find some 349 | while True: 350 | filtered_small = torch.as_tensor( 351 | [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device 352 | ) 353 | if filtered_small.any().item(): 354 | cur_scores = cur_scores[~filtered_small] 355 | cur_classes = cur_classes[~filtered_small] 356 | cur_masks = cur_masks[~filtered_small] 357 | area, seg_img = get_ids_area(cur_masks, cur_scores) 358 | else: 359 | break 360 | 361 | else: 362 | cur_classes = 
torch.ones(1, dtype=torch.long, device=cur_classes.device) 363 | 364 | segments_info = [] 365 | for i, a in enumerate(area): 366 | cat = cur_classes[i].item() 367 | segments_info.append({"id": i, "isthing": self.is_thing_map[cat], "category_id": cat, "area": a}) 368 | del cur_classes 369 | 370 | with io.BytesIO() as out: 371 | seg_img.save(out, format="PNG") 372 | predictions = {"png_string": out.getvalue(), "segments_info": segments_info} 373 | preds.append(predictions) 374 | return preds 375 | --------------------------------------------------------------------------------
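
As a quick sanity check of the two mask losses defined in `models/segmentation.py` above, the sketch below (not part of the repository) feeds them dummy, already-flattened mask logits and binary targets of shape `[num_boxes, H*W]`; the import path assumes the repo root is on `PYTHONPATH` and its dependencies are installed.

```python
# Minimal sketch under the assumptions stated above; tensor sizes are arbitrary.
import torch

from models.segmentation import dice_loss, sigmoid_focal_loss

num_boxes, hw = 4, 64 * 64                               # 4 matched queries, 64x64 masks
logits = torch.randn(num_boxes, hw)                      # raw (pre-sigmoid) mask predictions
targets = torch.randint(0, 2, (num_boxes, hw)).float()   # binary ground-truth masks

print(dice_loss(logits, targets, num_boxes))             # scalar tensor
print(sigmoid_focal_loss(logits, targets, num_boxes))    # scalar tensor
```

Both calls return scalar tensors: each loss is computed per query, then summed and normalized by `num_boxes`, matching the reductions visible in the code above.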