├── adet ├── utils │ ├── __init__.py │ ├── comm.py │ ├── visualizer.py │ └── measures.py ├── structures │ ├── __init__.py │ └── beziers.py ├── __init__.py ├── modeling │ ├── osformer │ │ ├── __init__.py │ │ ├── instance_fusion.py │ │ ├── trans_utils.py │ │ ├── feed_forward.py │ │ ├── position_encoding.py │ │ ├── trans_encoder.py │ │ ├── loss.py │ │ └── trans_decoder.py │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── fpn.py │ │ ├── lpf.py │ │ ├── resnet_interval.py │ │ └── mobilenet.py │ └── ops │ │ ├── make.sh │ │ ├── modules │ │ ├── __init__.py │ │ └── ms_deform_attn.py │ │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ │ ├── src │ │ ├── vision.cpp │ │ ├── cuda │ │ │ ├── ms_deform_attn_cuda.h │ │ │ └── ms_deform_attn_cuda.cu │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.h │ │ │ └── ms_deform_attn_cpu.cpp │ │ └── ms_deform_attn.h │ │ ├── setup.py │ │ └── test.py ├── config │ ├── __init__.py │ ├── config.py │ └── defaults.py ├── checkpoint │ ├── __init__.py │ └── adet_checkpoint.py ├── evaluation │ └── __init__.py ├── layers │ ├── csrc │ │ ├── cuda_version.cu │ │ ├── ml_nms │ │ │ ├── ml_nms.h │ │ │ └── ml_nms.cu │ │ ├── vision.cpp │ │ ├── DefROIAlign │ │ │ └── DefROIAlign.h │ │ └── BezierAlign │ │ │ └── BezierAlign.h │ ├── __init__.py │ ├── ml_nms.py │ ├── conv_with_kaiming_uniform.py │ ├── iou_loss.py │ ├── gcn.py │ ├── naive_group_norm.py │ ├── bezier_align.py │ ├── def_roi_align.py │ └── deform_conv.py └── data │ ├── __init__.py │ ├── datasets │ └── cis.py │ ├── builtin.py │ ├── detection_utils.py │ ├── augmentation.py │ └── dataset_mapper.py ├── docs ├── 488.gif ├── 4126.gif ├── OSFormer.png ├── logo_osformer.png ├── COD10K-CAM-3-Flying-65-Owl-4620.gif ├── COD10K-CAM-3-Flying-53-Bird-3024.gif ├── .gitattributes └── faq.md ├── .gitignore ├── configs ├── CIS_R101.yaml ├── CIS_R50.yaml ├── CIS_RT.yaml ├── CIS_PVTv2B2Li.yaml ├── CIS_SWINT.yaml └── Base-CIS.yaml ├── requirements.txt ├── tools ├── img2gif.py ├── compare_ap.py ├── csv2txt.py ├── eval_single.py ├── combine_vis.py ├── plot_utils.py ├── visualize_data.py ├── visualize_feat.py └── train_net.py ├── demo ├── vis_pred_json.py └── demo.py ├── setup.py └── README.md /adet/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /adet/structures/__init__.py: -------------------------------------------------------------------------------- 1 | from .beziers import Beziers -------------------------------------------------------------------------------- /adet/__init__.py: -------------------------------------------------------------------------------- 1 | from adet import modeling 2 | 3 | __version__ = "0.1.1" 4 | -------------------------------------------------------------------------------- /adet/modeling/osformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .osformer import OSFormer 2 | 3 | -------------------------------------------------------------------------------- /docs/488.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PJLallen/OSFormer/HEAD/docs/488.gif -------------------------------------------------------------------------------- /docs/4126.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PJLallen/OSFormer/HEAD/docs/4126.gif 
-------------------------------------------------------------------------------- /docs/OSFormer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PJLallen/OSFormer/HEAD/docs/OSFormer.png -------------------------------------------------------------------------------- /docs/logo_osformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PJLallen/OSFormer/HEAD/docs/logo_osformer.png -------------------------------------------------------------------------------- /adet/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import get_cfg 2 | 3 | __all__ = [ 4 | "get_cfg", 5 | ] 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea 3 | build 4 | output 5 | *.ipynb 6 | *.egg-info 7 | .ipynb_checkpoints 8 | *.so 9 | -------------------------------------------------------------------------------- /adet/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | from .adet_checkpoint import AdetCheckpointer 2 | 3 | __all__ = ["AdetCheckpointer"] 4 | -------------------------------------------------------------------------------- /docs/COD10K-CAM-3-Flying-65-Owl-4620.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PJLallen/OSFormer/HEAD/docs/COD10K-CAM-3-Flying-65-Owl-4620.gif -------------------------------------------------------------------------------- /docs/COD10K-CAM-3-Flying-53-Bird-3024.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PJLallen/OSFormer/HEAD/docs/COD10K-CAM-3-Flying-53-Bird-3024.gif -------------------------------------------------------------------------------- /adet/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .text_evaluation import TextEvaluator 2 | from .text_eval_script import text_eval_main 3 | from . 
import rrc_evaluation_funcs -------------------------------------------------------------------------------- /configs/CIS_R101.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CIS.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | -------------------------------------------------------------------------------- /configs/CIS_R50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CIS.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | -------------------------------------------------------------------------------- /adet/layers/csrc/cuda_version.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace adet { 4 | int get_cudart_version() { 5 | return CUDART_VERSION; 6 | } 7 | } // namespace adet 8 | -------------------------------------------------------------------------------- /configs/CIS_RT.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CIS_R50.yaml" 2 | INPUT: 3 | MIN_SIZE_TEST: 550 4 | MIN_SIZE_TRAIN: (550,) 5 | MODEL: 6 | OSFormer: 7 | ENC_LAYERS: 3 8 | DEC_LAYERS: 3 9 | -------------------------------------------------------------------------------- /adet/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import builtin # ensure the builtin datasets are registered 2 | from .dataset_mapper import DatasetMapperWithBasis 3 | 4 | 5 | __all__ = ["DatasetMapperWithBasis"] 6 | -------------------------------------------------------------------------------- /adet/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_fcos_resnet_fpn_backbone 2 | from .osformer import OSFormer 3 | 4 | _EXCLUDE = {"torch", "ShapeSpec"} 5 | __all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")] 6 | -------------------------------------------------------------------------------- /configs/CIS_PVTv2B2Li.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CIS.yaml" 2 | MODEL: 3 | WEIGHTS: "" # download weights from https://github.com/whai362/PVT 4 | BACKBONE: 5 | NAME: "build_pvt_v2_b2_li" 6 | PVTV2: 7 | OUT_FEATURES: [ "res2", "res3", "res4", "res5" ] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools==59.5.0 2 | termcolor>=1.1 3 | Pillow>=6.0 4 | yacs>=0.1.6 5 | tabulate 6 | cloudpickle 7 | matplotlib 8 | tqdm>4.29.0 9 | tensorboard 10 | python-Levenshtein 11 | Polygon3 12 | shapely 13 | kornia==0.6.8 14 | opencv-python 15 | timm 16 | -------------------------------------------------------------------------------- /adet/config/config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode 2 | 3 | 4 | def get_cfg() -> CfgNode: 5 | """ 6 | Get a copy of the default config. 7 | 8 | Returns: 9 | a detectron2 CfgNode instance. 
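Typical usage (mirroring tools/eval_single.py): cfg = get_cfg(); cfg.merge_from_file(config_file); cfg.freeze().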
10 | """ 11 | from .defaults import _C 12 | 13 | return _C.clone() 14 | -------------------------------------------------------------------------------- /adet/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .fpn import build_fcos_resnet_fpn_backbone 2 | from .dla import build_fcos_dla_fpn_backbone 3 | from .resnet_lpf import build_resnet_lpf_backbone 4 | from .bifpn import build_fcos_resnet_bifpn_backbone 5 | from .pvt_v2 import build_pvt_v2_b2_li, build_pvt_v2_fpn_b2_li 6 | from .swin import D2SwinTransformer 7 | -------------------------------------------------------------------------------- /adet/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .deform_conv import DFConv2d 2 | from .ml_nms import ml_nms 3 | from .iou_loss import IOULoss 4 | from .conv_with_kaiming_uniform import conv_with_kaiming_uniform 5 | from .bezier_align import BezierAlign 6 | from .def_roi_align import DefROIAlign 7 | from .naive_group_norm import NaiveGroupNorm 8 | from .gcn import GCN 9 | 10 | __all__ = [k for k in globals().keys() if not k.startswith("_")] -------------------------------------------------------------------------------- /adet/modeling/ops/make.sh: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | 8 | python setup.py build install 9 | -------------------------------------------------------------------------------- /adet/modeling/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------\ 7 | 8 | from .ms_deform_attn import MSDeformAttn 9 | -------------------------------------------------------------------------------- /adet/modeling/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | 8 | from .ms_deform_attn_func import MSDeformAttnFunction 9 | 10 | -------------------------------------------------------------------------------- /configs/CIS_SWINT.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CIS.yaml" 2 | MODEL: 3 | WEIGHTS: "" # download weights from https://github.com/microsoft/Swin-Transformer 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 96 8 | DEPTHS: [ 2, 2, 6, 2 ] 9 | NUM_HEADS: [ 3, 6, 12, 24 ] 10 | WINDOW_SIZE: 7 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 15 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 16 | INPUT: 17 | MIN_SIZE_TRAIN: (800,) 18 | MIN_SIZE_TEST: 800 -------------------------------------------------------------------------------- /docs/.gitattributes: -------------------------------------------------------------------------------- 1 | faq.md filter=lfs diff=lfs merge=lfs -text 2 | logo_osformer.png filter=lfs diff=lfs merge=lfs -text 3 | OSFormer.png filter=lfs diff=lfs merge=lfs -text 4 | 4126.gif filter=lfs diff=lfs merge=lfs -text 5 | COD10K-CAM-3-Flying-53-Bird-3024.gif filter=lfs diff=lfs merge=lfs -text 6 | ECCV'22[[:space:]]poster-1317.pdf filter=lfs diff=lfs merge=lfs -text 7 | 488.gif filter=lfs diff=lfs merge=lfs -text 8 | COD10K-CAM-3-Flying-65-Owl-4620.gif filter=lfs diff=lfs merge=lfs -text 9 | ECCV'22[[:space:]]video-1317.mp4 filter=lfs diff=lfs merge=lfs -text 10 | -------------------------------------------------------------------------------- /adet/modeling/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | Taken from https://github.com/fundamentalvision/Deformable-DETR 4 | ************************************************************************************************** 5 | * Deformable DETR 6 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
7 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /adet/layers/csrc/ml_nms/ml_nms.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace adet { 5 | 6 | 7 | #ifdef WITH_CUDA 8 | at::Tensor ml_nms_cuda( 9 | const at::Tensor dets, 10 | const float threshold); 11 | #endif 12 | 13 | at::Tensor ml_nms(const at::Tensor& dets, 14 | const at::Tensor& scores, 15 | const at::Tensor& labels, 16 | const float threshold) { 17 | 18 | if (dets.type().is_cuda()) { 19 | #ifdef WITH_CUDA 20 | // TODO raise error if not compiled with CUDA 21 | if (dets.numel() == 0) 22 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 23 | auto b = at::cat({dets, scores.unsqueeze(1), labels.unsqueeze(1)}, 1); 24 | return ml_nms_cuda(b, threshold); 25 | #else 26 | AT_ERROR("Not compiled with GPU support"); 27 | #endif 28 | } 29 | AT_ERROR("CPU version not implemented"); 30 | } 31 | 32 | } // namespace adet 33 | -------------------------------------------------------------------------------- /adet/layers/ml_nms.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import batched_nms 2 | 3 | 4 | def ml_nms(boxlist, nms_thresh, max_proposals=-1, 5 | score_field="scores", label_field="labels"): 6 | """ 7 | Performs non-maximum suppression on a boxlist, with scores specified 8 | in a boxlist field via score_field. 9 | 10 | Args: 11 | boxlist (detectron2.structures.Boxes): 12 | nms_thresh (float): 13 | max_proposals (int): if > 0, then only the top max_proposals are kept 14 | after non-maximum suppression 15 | score_field (str): 16 | """ 17 | if nms_thresh <= 0: 18 | return boxlist 19 | boxes = boxlist.pred_boxes.tensor 20 | scores = boxlist.scores 21 | labels = boxlist.pred_classes 22 | keep = batched_nms(boxes, scores, labels, nms_thresh) 23 | if max_proposals > 0: 24 | keep = keep[: max_proposals] 25 | boxlist = boxlist[keep] 26 | return boxlist 27 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Dataset settings 4 | 5 | Following [SINet](https://github.com/DengPingFan/SINet) and other previous COD works, the original COD10K contains both camouflaged and non-camouflaged images. We only use camouflaged images with instance-level labels for training (3,040) and testing (2,026). 6 | We have uploaded the 3,040 training and 2,026 testing images of COD10K to Baidu/[Google](https://drive.google.com/file/d/1YGa3v-MiXy-3MMJDkidLXPt0KQwygt-Z/view?usp=sharing)/[Quark](https://pan.quark.cn/s/07ba3258b777).
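If you want to verify the counts in the table below on your own copy, a minimal check with pycocotools works; the JSON paths here are the defaults hard-coded in `adet/data/datasets/cis.py`, so adjust them to wherever you unpacked the dataset:

```python
# Count images and instances in the CIS annotation files
# (paths follow the defaults in adet/data/datasets/cis.py).
from pycocotools.coco import COCO

for split, json_file in [
    ("COD10K-Train", "COD10K-v3/annotations/train_instance.json"),
    ("COD10K-Test", "COD10K-v3/annotations/test2026.json"),
]:
    coco = COCO(json_file)
    print(split, len(coco.getImgIds()), "images,", len(coco.getAnnIds()), "instances")
```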
7 | 8 | | Dataset | CAM (instance-level) | NonCAM | Total | 9 | | ---- | ---- | ---- | ---- | 10 | | COD10K-Train | 3040 | 2960 | 6000 | 11 | | COD10K-Test | 2026 | 1974 | 4000 | 12 | 13 | ## Paper link 14 | 15 | Arxiv: https://arxiv.org/abs/2207.02255 16 | 17 | ## Initial weights for PVT and Swin 18 | 19 | To fit detectron2 framework, we add prefix to the key of pth. See https://github.com/PJLallen/OSFormer/issues/4 for details. 20 | -------------------------------------------------------------------------------- /tools/img2gif.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from PIL import Image 4 | 5 | STEP = 8 6 | BEZEL = 5 # pixel numbers 7 | DURATION = 200 8 | 9 | def img2gif(img_og, img_pred, out_name): 10 | im1 = cv2.imread(img_og) 11 | im2 = cv2.imread(img_pred) 12 | 13 | h, w, _ = im1.shape 14 | step_size = w // STEP 15 | res_list = [] 16 | 17 | for i in range(STEP): 18 | tmp = np.ones_like(im1) * 255 19 | tmp[:, :(i + 1) * step_size - BEZEL] = im1[:, :(i + 1) * step_size - BEZEL] 20 | tmp[:, (i + 1) * step_size:] = im2[:, (i + 1) * step_size:] 21 | res_list.append(Image.fromarray(tmp[...,::-1].astype('uint8')).convert('RGB')) 22 | 23 | img = res_list[0] # extract first image from iterator 24 | img.save(fp=out_name, format='GIF', append_images=res_list, 25 | save_all=True, duration=200, loop=0) 26 | 27 | 28 | if __name__ == '__main__': 29 | img2gif( 30 | '../test_00000003.jpg', 31 | '../test_00000003.png', 32 | '../output.gif') 33 | -------------------------------------------------------------------------------- /adet/modeling/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | Taken from https://github.com/fundamentalvision/Deformable-DETR 4 | ************************************************************************************************** 5 | * Deformable DETR 6 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /adet/modeling/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | Taken from https://github.com/fundamentalvision/Deformable-DETR 4 | ************************************************************************************************** 5 | * Deformable DETR 6 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
7 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /configs/Base-CIS.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "OSFormer" 3 | MASK_ON: True 4 | BACKBONE: 5 | NAME: "build_resnet_backbone" 6 | RESNETS: 7 | OUT_FEATURES: [ "res2", "res3", "res4", "res5" ] 8 | BASIS_MODULE: 9 | NUM_CLASSES: 1 10 | OSFormer: 11 | NUM_CLASSES: 1 12 | INSTANCE_IN_FEATURES: [ "res3", "res4", "res5" ] 13 | MASK_IN_FEATURES: [ "res2", "trans3", "trans4", "trans5" ] 14 | FEAT_INSTANCE_STRIDES: [ 8, 16, 32 ] 15 | FEAT_SCALE_RANGES: ((1, 192), (96, 384), (192, 2048)) 16 | NUM_GRIDS: [ 36, 24, 16 ] 17 | INS_FUSION: 'camin' 18 | DCIN_NORM: False 19 | NOFPN: True 20 | SEM_LOSS: True 21 | C2F_MASK: True 22 | ENC_LAYERS: 6 23 | DEC_LAYERS: 3 24 | LOSS: 25 | SEM_WEIGHT: 1.0 26 | SEM_TYPE: 'dice' 27 | SEM_SEG_HEAD: 28 | NUM_CLASSES: 1 29 | DATASETS: 30 | TRAIN: ("my_data_train_coco_cod_style",) 31 | TEST: ("my_data_test_coco_cod_style", "my_data_test_coco_nc4k_style") 32 | SOLVER: 33 | BASE_LR: 0.00025 34 | WEIGHT_DECAY: 0.0001 35 | STEPS: (60000, 80000) 36 | MAX_ITER: 90000 37 | IMS_PER_BATCH: 2 38 | WARMUP_FACTOR: 0.01 39 | WARMUP_ITERS: 1000 40 | INPUT: 41 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 42 | MIN_SIZE_TEST: 800 43 | MASK_FORMAT: "bitmask" 44 | VERSION: 2 45 | 46 | 47 | -------------------------------------------------------------------------------- /adet/modeling/osformer/instance_fusion.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class DCIN(nn.Module): 5 | def __init__(self, num_kernels, norm): 6 | super(DCIN, self).__init__() 7 | 8 | self.affine_scale = nn.Linear(num_kernels, num_kernels, bias=True) 9 | self.affine_bias = nn.Linear(num_kernels, 1, bias=True) 10 | self.norm = norm 11 | 12 | def forward(self, mask_features, kernel_features): 13 | """ 14 | mask_features: shape of (1, c, w, h) 15 | kernel_features: shape of (n, c) 16 | 17 | return: shape of (1, n, w, h) 18 | """ 19 | kernel_w = self.affine_scale(kernel_features) # (n, c) 20 | kernel_b = self.affine_bias(kernel_features) # (n, 1) 21 | bs, c, w, h = mask_features.shape 22 | x = mask_features.view((bs, c, -1)) # (bs, c, k) 23 | if self.norm: 24 | x_mean = x.mean(2, keepdim=True) # (bs, c, 1) 25 | x_centered = x - x_mean # (bs, c, k) 26 | # add 1e-10 to avoid NaN 27 | x_std_rev = ((x_centered * x_centered).mean(2, keepdim=True) + 1e-10).rsqrt() # (bs, c, 1) 28 | x_norm = x_centered * x_std_rev # (bs, c, k) 29 | else: 30 | x_norm = x 31 | 32 | return (kernel_w.matmul(x_norm) + kernel_b).view((bs, -1, w, h)) 33 | 
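A quick, self-contained shape check for DCIN (not part of the repo; the sizes are arbitrary, and note that num_kernels must equal the channel dimension c of the mask features), confirming the (1, n, w, h) output described in the docstring above:

import torch
from adet.modeling.osformer.instance_fusion import DCIN

c, n, w, h = 256, 5, 32, 32                # channels, instances, spatial size
dcin = DCIN(num_kernels=c, norm=True)
mask_feats = torch.randn(1, c, w, h)       # (1, c, w, h) unified mask features
kernel_feats = torch.randn(n, c)           # (n, c) dynamic instance kernels
out = dcin(mask_feats, kernel_feats)       # one mask logit map per kernel
print(out.shape)                           # torch.Size([1, 5, 32, 32])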
-------------------------------------------------------------------------------- /adet/modeling/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | Taken from https://github.com/fundamentalvision/Deformable-DETR 4 | ************************************************************************************************** 5 | * Deformable DETR 6 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /adet/modeling/osformer/trans_utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | from torch import nn 4 | 5 | 6 | def _get_clones(module, n): 7 | return nn.ModuleList([copy.deepcopy(module) for _ in range(n)]) 8 | 9 | 10 | def with_pos_embed(tensor, pos): 11 | return tensor if pos is None else tensor + pos 12 | 13 | 14 | def get_reference_points(spatial_shapes, batch_size, device): 15 | reference_points_list = [] # [(2, 7832, 2), (2, 1980, 2), (2, 506, 2), (2, 132, 2)] 16 | for lvl, (H_, W_) in enumerate(spatial_shapes): 17 | # (H_, W_), (H_, W_) 18 | ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), 19 | torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)) 20 | ref_y = ref_y.reshape(-1).expand((batch_size, H_ * W_)) / H_ # (2, H_ * W_) 21 | ref_x = ref_x.reshape(-1).expand((batch_size, H_ * W_)) / W_ # (2, H_ * W_) 22 | ref = torch.stack((ref_x, ref_y), -1) 23 | reference_points_list.append(ref) # (2, H_ * W_, 2) 24 | reference_points = torch.cat(reference_points_list, 1) # (2, 10540, 2) 25 | # reference_points[:, :, None] (2, 10450, 1, 2) valid_ratios[:, None] (2, 1, 4, 2) 26 | _, xx, yy = reference_points.shape 27 | reference_points = reference_points[:, :, None].expand((batch_size, xx, lvl + 1, yy)) # (2, 10540, 4, 2) 28 | return reference_points 29 | -------------------------------------------------------------------------------- /tools/compare_ap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from collections import defaultdict, OrderedDict 5 | 6 | 7 | basedir = 'SOTA' 8 | base_method = 'osformer' 9 | datasets = ['cod', 'nc4k'] 10 | file_template = 'my_data_test_coco_{}_style_ap.json' 11 | 12 | 13 | for dataset in datasets: 14 | 15 | with open(os.path.join(basedir, base_method, 
file_template.format(dataset)), 'r') as f: 16 | ours = json.load(f) 17 | print(base_method, len(ours.keys())) 18 | 19 | delta_dict = defaultdict(list) 20 | 21 | for method in os.listdir(basedir): 22 | if method == base_method or not os.path.isdir(os.path.join(basedir, method)): 23 | continue 24 | 25 | with open(os.path.join(basedir, method, file_template.format(dataset)), 'r') as f: 26 | other = json.load(f) 27 | print(method, len(other.keys())) 28 | 29 | for k, v in ours.items(): 30 | our_ap = float(ours[k]['AP']) 31 | other_ap = float(other[k]['AP']) if other.get(k) else 0 32 | if np.isnan(other_ap): 33 | other_ap = 0 34 | delta_dict[k].append(our_ap - other_ap) 35 | 36 | od = [] 37 | for k, v in delta_dict.items(): 38 | print(k, v) 39 | od.append((k, np.mean(v))) 40 | 41 | od.sort(key=lambda x: x[1], reverse=True) 42 | res = OrderedDict() 43 | for elem in od: 44 | res[elem[0]] = elem[1] 45 | 46 | with open(os.path.join(basedir, 'desc_res_{}.json').format(dataset), 'w') as f: 47 | json.dump(res, f, indent=4) 48 | 49 | -------------------------------------------------------------------------------- /tools/csv2txt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from collections import OrderedDict 4 | 5 | 6 | def csv2tex(csv_path, out_path=None): 7 | if out_path is None: 8 | out_path = csv_path.replace('csv', '.txt') 9 | res = OrderedDict() 10 | res_og = OrderedDict() 11 | with open(csv_path, 'r') as f: 12 | flag = False 13 | for idx, line in enumerate(f): 14 | data_list = line.replace('\n', '').split(',') 15 | for ydx, elem in enumerate(data_list): 16 | if data_list[ydx] != '': 17 | try: 18 | if 'Max' in data_list[ydx]: 19 | flt = float(data_list[ydx].replace('Max', '')) 20 | data_list[ydx] = '\\textbf{' + '{:>4.1f}'.format(flt) + '}' 21 | else: 22 | flt = float(data_list[ydx]) 23 | data_list[ydx] = '{:>4.1f}'.format(flt) 24 | except Exception as e: 25 | pass 26 | if data_list[0] == '': 27 | if flag is False: 28 | flag = True 29 | continue 30 | break 31 | flag = False 32 | res[data_list[0]] = ' & '.join(data_list[1:]) 33 | res_og[data_list[0]] = data_list[1:] 34 | 35 | with open(out_path, 'w') as f: 36 | for k, v in res.items(): 37 | f.write('{:<15} & '.format(k) + v + ' \\\\ \n') 38 | 39 | 40 | if __name__ == '__main__': 41 | csv2tex('OSFormer-NoVal-Attribute.csv') 42 | 43 | -------------------------------------------------------------------------------- /adet/utils/comm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.distributed as dist 4 | 5 | from detectron2.utils.comm import get_world_size 6 | 7 | 8 | def reduce_sum(tensor): 9 | world_size = get_world_size() 10 | if world_size < 2: 11 | return tensor 12 | tensor = tensor.clone() 13 | dist.all_reduce(tensor, op=dist.ReduceOp.SUM) 14 | return tensor 15 | 16 | 17 | def aligned_bilinear(tensor, factor): 18 | assert tensor.dim() == 4 19 | assert factor >= 1 20 | assert int(factor) == factor 21 | 22 | if factor == 1: 23 | return tensor 24 | 25 | h, w = tensor.size()[2:] 26 | tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate") 27 | oh = factor * h + 1 28 | ow = factor * w + 1 29 | tensor = F.interpolate( 30 | tensor, size=(oh, ow), 31 | mode='bilinear', 32 | align_corners=True 33 | ) 34 | tensor = F.pad( 35 | tensor, pad=(factor // 2, 0, factor // 2, 0), 36 | mode="replicate" 37 | ) 38 | 39 | return tensor[:, :, :oh - 1, :ow - 1] 40 | 41 | 42 | def 
compute_locations(h, w, stride, device): 43 | shifts_x = torch.arange( 44 | 0, w * stride, step=stride, 45 | dtype=torch.float32, device=device 46 | ) 47 | shifts_y = torch.arange( 48 | 0, h * stride, step=stride, 49 | dtype=torch.float32, device=device 50 | ) 51 | shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) 52 | shift_x = shift_x.reshape(-1) 53 | shift_y = shift_y.reshape(-1) 54 | locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2 55 | return locations 56 | -------------------------------------------------------------------------------- /adet/checkpoint/adet_checkpoint.py: -------------------------------------------------------------------------------- 1 | import pickle, os 2 | from fvcore.common.file_io import PathManager 3 | from detectron2.checkpoint import DetectionCheckpointer 4 | 5 | 6 | class AdetCheckpointer(DetectionCheckpointer): 7 | """ 8 | Same as :class:`DetectronCheckpointer`, but is able to convert models 9 | in AdelaiDet, such as LPF backbone. 10 | """ 11 | def _load_file(self, filename): 12 | if filename.endswith(".pkl"): 13 | with PathManager.open(filename, "rb") as f: 14 | data = pickle.load(f, encoding="latin1") 15 | if "model" in data and "__author__" in data: 16 | # file is in Detectron2 model zoo format 17 | self.logger.info("Reading a file from '{}'".format(data["__author__"])) 18 | return data 19 | else: 20 | # assume file is from Caffe2 / Detectron1 model zoo 21 | if "blobs" in data: 22 | # Detection models have "blobs", but ImageNet models don't 23 | data = data["blobs"] 24 | data = {k: v for k, v in data.items() if not k.endswith("_momentum")} 25 | if "weight_order" in data: 26 | del data["weight_order"] 27 | return {"model": data, "__author__": "Caffe2", "matching_heuristics": True} 28 | 29 | loaded = super()._load_file(filename) # load native pth checkpoint 30 | if "model" not in loaded: 31 | loaded = {"model": loaded} 32 | 33 | basename = os.path.basename(filename).lower() 34 | if "lpf" in basename or "dla" in basename: 35 | loaded["matching_heuristics"] = True 36 | return loaded 37 | -------------------------------------------------------------------------------- /adet/data/datasets/cis.py: -------------------------------------------------------------------------------- 1 | import os 2 | from detectron2.data.datasets.coco import load_coco_json 3 | from detectron2.data import MetadataCatalog, DatasetCatalog 4 | 5 | DATASET_ROOT = 'COD10K-v3' 6 | ANN_ROOT = os.path.join(DATASET_ROOT, 'annotations') 7 | TRAIN_PATH = os.path.join(DATASET_ROOT, 'Train/Image') 8 | TEST_PATH = os.path.join(DATASET_ROOT, 'Test/Image') 9 | TRAIN_JSON = os.path.join(ANN_ROOT, 'train_instance.json') 10 | TEST_JSON = os.path.join(ANN_ROOT, 'test2026.json') 11 | 12 | NC4K_ROOT = 'NC4K' 13 | NC4K_PATH = os.path.join(NC4K_ROOT, 'Imgs') 14 | NC4K_JSON = os.path.join(NC4K_ROOT, 'nc4k_test.json') 15 | 16 | CLASS_NAMES = ["foreground"] 17 | 18 | PREDEFINED_SPLITS_DATASET = { 19 | "my_data_train_coco_cod_style": (TRAIN_PATH, TRAIN_JSON), 20 | "my_data_test_coco_cod_style": (TEST_PATH, TEST_JSON), 21 | "my_data_test_coco_nc4k_style": (NC4K_PATH, NC4K_JSON), 22 | } 23 | 24 | 25 | def register_dataset(): 26 | """ 27 | purpose: register all splits of dataset with PREDEFINED_SPLITS_DATASET 28 | """ 29 | for key, (image_root, json_file) in PREDEFINED_SPLITS_DATASET.items(): 30 | register_dataset_instances(name=key, 31 | json_file=json_file, 32 | image_root=image_root) 33 | 34 | 35 | def register_dataset_instances(name, json_file, image_root): 36 | """ 37 | purpose: register 
dataset to DatasetCatalog, 38 | register metadata to MetadataCatalog and set attribute 39 | """ 40 | DatasetCatalog.register(name, lambda: load_coco_json(json_file, image_root, name)) 41 | MetadataCatalog.get(name).set(json_file=json_file, 42 | image_root=image_root, 43 | evaluator_type="coco") -------------------------------------------------------------------------------- /adet/layers/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "ml_nms/ml_nms.h" 3 | #include "DefROIAlign/DefROIAlign.h" 4 | #include "BezierAlign/BezierAlign.h" 5 | 6 | namespace adet { 7 | 8 | #ifdef WITH_CUDA 9 | extern int get_cudart_version(); 10 | #endif 11 | 12 | std::string get_cuda_version() { 13 | #ifdef WITH_CUDA 14 | std::ostringstream oss; 15 | 16 | // copied from 17 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 18 | auto printCudaStyleVersion = [&](int v) { 19 | oss << (v / 1000) << "." << (v / 10 % 100); 20 | if (v % 10 != 0) { 21 | oss << "." << (v % 10); 22 | } 23 | }; 24 | printCudaStyleVersion(get_cudart_version()); 25 | return oss.str(); 26 | #else 27 | return std::string("not available"); 28 | #endif 29 | } 30 | 31 | // similar to 32 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp 33 | std::string get_compiler_version() { 34 | std::ostringstream ss; 35 | #if defined(__GNUC__) 36 | #ifndef __clang__ 37 | { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } 38 | #endif 39 | #endif 40 | 41 | #if defined(__clang_major__) 42 | { 43 | ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." 44 | << __clang_patchlevel__; 45 | } 46 | #endif 47 | 48 | #if defined(_MSC_VER) 49 | { ss << "MSVC " << _MSC_FULL_VER; } 50 | #endif 51 | return ss.str(); 52 | } 53 | 54 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 55 | m.def("ml_nms", &ml_nms, "Multi-Label NMS"); 56 | m.def("def_roi_align_forward", &DefROIAlign_forward, "def_roi_align_forward"); 57 | m.def("def_roi_align_backward", &DefROIAlign_backward, "def_roi_align_backward"); 58 | m.def("bezier_align_forward", &BezierAlign_forward, "bezier_align_forward"); 59 | m.def("bezier_align_backward", &BezierAlign_backward, "bezier_align_backward"); 60 | } 61 | 62 | } // namespace adet 63 | -------------------------------------------------------------------------------- /adet/structures/beziers.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import torch 3 | 4 | 5 | class Beziers: 6 | """ 7 | This structure stores a list of bezier curves as a Nx16 torch.Tensor. 8 | It will support some common methods about bezier shapes 9 | (`area`, `clip`, `nonempty`, etc), 10 | and also behaves like a Tensor 11 | (support indexing, `to(device)`, `.device`, and iteration over all beziers) 12 | 13 | Attributes: 14 | tensor (torch.Tensor): float matrix of Nx16. Each row stores the 8 control points (x1, y1, ..., x8, y8) of a bezier curve. 15 | """ 16 | 17 | def __init__(self, tensor: torch.Tensor): 18 | """ 19 | Args: 20 | tensor (Tensor[float]): a Nx16 matrix. Each row is the 8 control points (x1, y1, ..., x8, y8) of a bezier curve.
21 | """ 22 | device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu") 23 | tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) 24 | if tensor.numel() == 0: 25 | # Use reshape, so we don't end up creating a new tensor that does not depend on 26 | # the inputs (and consequently confuses jit) 27 | tensor = tensor.reshape((0, 16)).to(dtype=torch.float32, device=device) 28 | assert tensor.dim() == 2 and tensor.size(-1) == 16, tensor.size() 29 | 30 | self.tensor = tensor 31 | 32 | def to(self, device: str) -> "Beziers": 33 | return Beziers(self.tensor.to(device)) 34 | 35 | def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Beziers": 36 | """ 37 | Returns: 38 | Beziers: Create a new :class:`Beziers` by indexing. 39 | """ 40 | if isinstance(item, int): 41 | return Beziers(self.tensor[item].view(1, -1)) 42 | b = self.tensor[item] 43 | assert b.dim() == 2, "Indexing on Boxes with {} failed to return a matrix!".format(item) 44 | return Beziers(b) -------------------------------------------------------------------------------- /adet/layers/conv_with_kaiming_uniform.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from detectron2.layers import Conv2d 4 | from .deform_conv import DFConv2d 5 | from detectron2.layers.batch_norm import get_norm 6 | 7 | 8 | def conv_with_kaiming_uniform( 9 | norm=None, activation=None, 10 | use_deformable=False, use_sep=False): 11 | def make_conv( 12 | in_channels, out_channels, kernel_size, stride=1, dilation=1 13 | ): 14 | if use_deformable: 15 | conv_func = DFConv2d 16 | else: 17 | conv_func = Conv2d 18 | if use_sep: 19 | assert in_channels == out_channels 20 | groups = in_channels 21 | else: 22 | groups = 1 23 | conv = conv_func( 24 | in_channels, 25 | out_channels, 26 | kernel_size=kernel_size, 27 | stride=stride, 28 | padding=dilation * (kernel_size - 1) // 2, 29 | dilation=dilation, 30 | groups=groups, 31 | bias=(norm is None) 32 | ) 33 | if not use_deformable: 34 | # Caffe2 implementation uses XavierFill, which in fact 35 | # corresponds to kaiming_uniform_ in PyTorch 36 | nn.init.kaiming_uniform_(conv.weight, a=1) 37 | if norm is None: 38 | nn.init.constant_(conv.bias, 0) 39 | module = [conv,] 40 | if norm is not None and len(norm) > 0: 41 | if norm == "GN": 42 | norm_module = nn.GroupNorm(32, out_channels) 43 | else: 44 | norm_module = get_norm(norm, out_channels) 45 | module.append(norm_module) 46 | if activation is not None: 47 | module.append(nn.ReLU(inplace=True)) 48 | if len(module) > 1: 49 | return nn.Sequential(*module) 50 | return conv 51 | 52 | return make_conv 53 | -------------------------------------------------------------------------------- /tools/eval_single.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import tqdm 4 | import json 5 | from collections import OrderedDict 6 | from detectron2.engine.defaults import DefaultPredictor 7 | from detectron2.evaluation.coco_evaluation import COCOEvaluator 8 | 9 | from adet.config import get_cfg 10 | from adet.data.datasets.cis import register_dataset 11 | from tools.train_net import Trainer 12 | 13 | 14 | def setup_cfg(config_file, model_weights, confidence_threshold): 15 | cfg = get_cfg() 16 | cfg.merge_from_file(config_file) 17 | # Set score_threshold for builtin models 18 | cfg.MODEL.WEIGHTS = model_weights 19 | cfg.MODEL.RETINANET.SCORE_THRESH_TEST = confidence_threshold 20 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 
confidence_threshold 21 | cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = confidence_threshold 22 | cfg.freeze() 23 | 24 | return cfg 25 | 26 | 27 | def main(config_file, model_weights, dataset_name, output_dir=None, confidence_threshold=0.3): 28 | cfg = setup_cfg(config_file, model_weights, confidence_threshold) 29 | 30 | if output_dir is None: 31 | output_dir = os.path.dirname(model_weights) 32 | 33 | if not os.path.exists(output_dir): 34 | os.makedirs(output_dir) 35 | 36 | predictor = DefaultPredictor(cfg) 37 | model = predictor.model 38 | data_loader = Trainer.build_test_loader(cfg, dataset_name) 39 | coco_eval = COCOEvaluator(dataset_name, output_dir=output_dir, tasks=('segm',)) 40 | 41 | eval_res = OrderedDict() 42 | for elem in tqdm.tqdm(data_loader): 43 | predictions = model(elem) 44 | coco_eval.reset() 45 | coco_eval.process(elem, predictions) 46 | eval_cur = coco_eval.evaluate() 47 | filename = os.path.basename(elem[0]['file_name']) 48 | eval_res[filename] = eval_cur['segm'] 49 | 50 | with open(os.path.join(output_dir, '{}_ap.json'.format(dataset_name)), 'w') as f: 51 | json.dump(eval_res, f, indent=4) 52 | -------------------------------------------------------------------------------- /adet/modeling/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | Taken from https://github.com/fundamentalvision/Deformable-DETR 4 | ************************************************************************************************** 5 | * Deformable DETR 6 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /adet/modeling/osformer/feed_forward.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 
| class FeedForwardNetwork(nn.Module): 6 | def __init__(self, d_model): 7 | super().__init__() 8 | self.d_model = d_model 9 | self.ffn = nn.Sequential( 10 | nn.Conv2d(d_model, d_model, 3, padding=1), 11 | nn.GroupNorm(8, d_model), 12 | nn.GELU(), 13 | nn.Conv2d(d_model, d_model, 3, padding=1) 14 | ) 15 | 16 | def forward(self, src, spatial_shapes, *args): 17 | split_list = [(w * h) for (w, h) in spatial_shapes] 18 | feat_levels = [] 19 | for memory, (w, h) in zip(src.split(split_list, 1), spatial_shapes): 20 | memory = memory.view(-1, w, h, self.d_model).permute(0, 3, 1, 2) 21 | memory = self.ffn(memory) 22 | feat_levels.append(memory.flatten(2).transpose(1, 2)) 23 | return torch.cat(feat_levels, 1) 24 | 25 | 26 | class VanillaFeedForwardNetwork(nn.Module): 27 | def __init__(self, d_model): 28 | super().__init__() 29 | self.ffn = nn.Sequential( 30 | nn.Conv1d(d_model, d_model, 3, padding=1, bias=False), 31 | nn.GroupNorm(8, d_model), 32 | nn.GELU(), 33 | nn.Conv1d(d_model, d_model, 3, padding=1, bias=False) 34 | ) 35 | 36 | def forward(self, src, *args): 37 | return self.ffn(src.permute(0, 2, 1)).permute(0, 2, 1) 38 | 39 | 40 | class StdFeedForwardNetwork(nn.Module): 41 | def __init__(self, d_model): 42 | super().__init__() 43 | self.ffn = nn.Sequential( 44 | nn.Linear(d_model, d_model), 45 | nn.ReLU(), 46 | nn.LayerNorm(d_model) 47 | ) 48 | self.norm = nn.LayerNorm(d_model) 49 | 50 | def forward(self, src, *args): 51 | return self.norm(src + self.ffn(src)) 52 | 53 | 54 | def get_ffn(d_model, ffn_type): 55 | if ffn_type == 'std': 56 | return StdFeedForwardNetwork(d_model) 57 | if ffn_type == 'vanilla': 58 | return VanillaFeedForwardNetwork(d_model) 59 | return FeedForwardNetwork(d_model) 60 | -------------------------------------------------------------------------------- /adet/data/builtin.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from detectron2.data.datasets.register_coco import register_coco_instances 4 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 5 | 6 | from .datasets.text import register_text_instances 7 | 8 | # register plane reconstruction 9 | 10 | _PREDEFINED_SPLITS_PIC = { 11 | "pic_person_train": ("pic/image/train", "pic/annotations/train_person.json"), 12 | "pic_person_val": ("pic/image/val", "pic/annotations/val_person.json"), 13 | } 14 | 15 | metadata_pic = { 16 | "thing_classes": ["person"] 17 | } 18 | 19 | _PREDEFINED_SPLITS_TEXT = { 20 | "totaltext_train": ("totaltext/train_images", "totaltext/train.json"), 21 | "totaltext_val": ("totaltext/test_images", "totaltext/test.json"), 22 | "ctw1500_word_train": ("CTW1500/ctwtrain_text_image", "CTW1500/annotations/train_ctw1500_maxlen100_v2.json"), 23 | "ctw1500_word_test": ("CTW1500/ctwtest_text_image","CTW1500/annotations/test_ctw1500_maxlen100.json"), 24 | "syntext1_train": ("syntext1/images", "syntext1/annotations/train.json"), 25 | "syntext2_train": ("syntext2/images", "syntext2/annotations/train.json"), 26 | "mltbezier_word_train": ("mlt2017/images","mlt2017/annotations/train.json"), 27 | } 28 | 29 | metadata_text = { 30 | "thing_classes": ["text"] 31 | } 32 | 33 | 34 | def register_all_coco(root="datasets"): 35 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_PIC.items(): 36 | # Assume pre-defined datasets live in `./datasets`. 
37 | register_coco_instances( 38 | key, 39 | metadata_pic, 40 | os.path.join(root, json_file) if "://" not in json_file else json_file, 41 | os.path.join(root, image_root), 42 | ) 43 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_TEXT.items(): 44 | # Assume pre-defined datasets live in `./datasets`. 45 | register_text_instances( 46 | key, 47 | metadata_text, 48 | os.path.join(root, json_file) if "://" not in json_file else json_file, 49 | os.path.join(root, image_root), 50 | ) 51 | 52 | 53 | register_all_coco() -------------------------------------------------------------------------------- /tools/combine_vis.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import cv2 4 | import numpy as np 5 | 6 | 7 | BASEDIR = 'SOTA' 8 | DATASETS = { 9 | 'cod': {'image': 'COD10K-v3/Test/Image/', 10 | 'gt': 'COD10K-v3/Test/GT_Instance/'}, 11 | 'nc4k': {'image': 'NC4K/Imgs/', 12 | 'gt': 'NC4K/Instance/'}} 13 | ORDER_JSON = 'SOTA/desc_res_{}.json' 14 | OUTPUT_DIR = 'Combined' 15 | if not os.path.exists(OUTPUT_DIR): 16 | os.makedirs(OUTPUT_DIR) 17 | 18 | 19 | # 4 rows 3 columns 20 | def combine_a_image(filename, dataset, score, order, nums_per_row=3, nums_per_col=4): 21 | print(filename) 22 | gt = cv2.imread(os.path.join(DATASETS[dataset]['gt'], filename.replace('.jpg', '.png'))) 23 | cv2.putText(gt, score, (60, 60), cv2.FONT_HERSHEY_PLAIN, 5.0, (0, 0, 255), 4) 24 | shape = gt.shape 25 | im = cv2.imread(os.path.join(DATASETS[dataset]['image'], filename)) 26 | target_shape = [shape[0] * nums_per_col, shape[1] * nums_per_row, 3] 27 | result = np.zeros(target_shape) 28 | result[:shape[0], :shape[1], :] = gt 29 | 30 | i = 0 31 | for method in os.listdir(BASEDIR): 32 | if not os.path.isdir(os.path.join(BASEDIR, method)): 33 | continue 34 | 35 | print(method) 36 | i += 1 37 | row = i // nums_per_row 38 | col = i % nums_per_row 39 | 40 | vis_map = cv2.imread(os.path.join(BASEDIR, method, 'vis', filename)) 41 | if vis_map is None: 42 | vis_map = im.copy() 43 | if vis_map.shape != shape: 44 | vis_map = cv2.resize(vis_map, (shape[1], shape[0])) 45 | cv2.putText(vis_map, method, (60, 60), cv2.FONT_HERSHEY_PLAIN, 5.0, (0, 0, 255), 4) 46 | result[row * shape[0]: (row + 1) * shape[0], 47 | col * shape[1]: (col + 1) * shape[1], :] = vis_map 48 | cv2.imwrite(os.path.join(OUTPUT_DIR, order + filename), result) 49 | print('Save {} successfully!'.format(os.path.join(OUTPUT_DIR, order + filename))) 50 | 51 | 52 | for dataset_ in ['nc4k']: # DATASETS.keys(): 53 | with open(ORDER_JSON.format(dataset_), 'r') as f: 54 | order_dict = json.load(f) 55 | 56 | for idx, (filename_, score_delta) in enumerate(order_dict.items()): 57 | score_fmt = '{:.2f}'.format(score_delta * 100) 58 | combine_a_image(filename_, dataset_, score_fmt, '{:04}_'.format(idx)) 59 | -------------------------------------------------------------------------------- /adet/modeling/osformer/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/facebookresearch/detr 2 | 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | import torch 8 | from torch import nn 9 | 10 | 11 | class PositionEmbeddingSine(nn.Module): 12 | """ 13 | This is a more standard version of the position embedding, very similar to the one 14 | used by the Attention is all you need paper, generalized to work on images. 
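Concretely, each normalized coordinate p along an axis is mapped to sin(p / temperature ** (2i / num_pos_feats)) and cos(p / temperature ** (2i / num_pos_feats)) over channel pairs i, and the y- and x-axis encodings are concatenated along the channel dimension.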
15 | """ 16 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 17 | super().__init__() 18 | self.num_pos_feats = num_pos_feats 19 | self.temperature = temperature 20 | self.normalize = normalize 21 | if scale is not None and normalize is False: 22 | raise ValueError("normalize should be True if scale is passed") 23 | if scale is None: 24 | scale = 2 * math.pi 25 | self.scale = scale 26 | 27 | def forward(self, tensor_list): 28 | x = tensor_list 29 | bs, _, w, h = x.shape 30 | not_mask = torch.zeros((bs, w, h), dtype=torch.bool, device=x.device) 31 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 32 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 33 | if self.normalize: 34 | eps = 1e-6 35 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 36 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 37 | 38 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 39 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) #dim_t (128) 40 | 41 | # x_embed[:, :, :, None] (bs, w, h, 1) 42 | pos_x = x_embed[:, :, :, None] / dim_t 43 | # pos_x.shape (ba, w, h, dim_t) 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 46 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 47 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 48 | return pos 49 | 50 | 51 | def build_position_encoding(hidden_dim): 52 | N_steps = hidden_dim // 2 53 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 54 | 55 | return position_embedding 56 | -------------------------------------------------------------------------------- /adet/layers/iou_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class IOULoss(nn.Module): 6 | """ 7 | Intersetion Over Union (IoU) loss which supports three 8 | different IoU computations: 9 | 10 | * IoU 11 | * Linear IoU 12 | * gIoU 13 | """ 14 | def __init__(self, loc_loss_type='iou'): 15 | super(IOULoss, self).__init__() 16 | self.loc_loss_type = loc_loss_type 17 | 18 | def forward(self, pred, target, weight=None): 19 | """ 20 | Args: 21 | pred: Nx4 predicted bounding boxes 22 | target: Nx4 target bounding boxes 23 | weight: N loss weight for each instance 24 | """ 25 | pred_left = pred[:, 0] 26 | pred_top = pred[:, 1] 27 | pred_right = pred[:, 2] 28 | pred_bottom = pred[:, 3] 29 | 30 | target_left = target[:, 0] 31 | target_top = target[:, 1] 32 | target_right = target[:, 2] 33 | target_bottom = target[:, 3] 34 | 35 | target_aera = (target_left + target_right) * \ 36 | (target_top + target_bottom) 37 | pred_aera = (pred_left + pred_right) * \ 38 | (pred_top + pred_bottom) 39 | 40 | w_intersect = torch.min(pred_left, target_left) + \ 41 | torch.min(pred_right, target_right) 42 | h_intersect = torch.min(pred_bottom, target_bottom) + \ 43 | torch.min(pred_top, target_top) 44 | 45 | g_w_intersect = torch.max(pred_left, target_left) + \ 46 | torch.max(pred_right, target_right) 47 | g_h_intersect = torch.max(pred_bottom, target_bottom) + \ 48 | torch.max(pred_top, target_top) 49 | ac_uion = g_w_intersect * g_h_intersect 50 | 51 | area_intersect = w_intersect * h_intersect 52 | area_union = target_aera + pred_aera - area_intersect 53 | 54 | ious = (area_intersect + 1.0) / (area_union + 1.0) 55 | gious = ious - (ac_uion - area_union) / 
ac_uion 56 | if self.loc_loss_type == 'iou': 57 | losses = -torch.log(ious) 58 | elif self.loc_loss_type == 'linear_iou': 59 | losses = 1 - ious 60 | elif self.loc_loss_type == 'giou': 61 | losses = 1 - gious 62 | else: 63 | raise NotImplementedError 64 | 65 | if weight is not None: 66 | return (losses * weight).sum() 67 | else: 68 | return losses.sum() 69 | -------------------------------------------------------------------------------- /adet/modeling/ops/setup.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | 8 | import os 9 | import glob 10 | 11 | import torch 12 | 13 | from torch.utils.cpp_extension import CUDA_HOME 14 | from torch.utils.cpp_extension import CppExtension 15 | from torch.utils.cpp_extension import CUDAExtension 16 | 17 | from setuptools import find_packages 18 | from setuptools import setup 19 | 20 | requirements = ["torch", "torchvision"] 21 | 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('Cuda is not availabel') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /adet/layers/gcn.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Conv2D(nn.Module): 7 | def __init__(self, in_channels, out_channels, kernel_size, padding='same', 8 | stride=1, dilation=1, groups=1): 9 | super(Conv2D, self).__init__() 10 | 11 | assert type(kernel_size) in [int, tuple], 
"Allowed kernel type [int or tuple], not {}".format(type(kernel_size)) 12 | assert padding == 'same', "Allowed padding type {}, not {}".format('same', padding) 13 | 14 | self.kernel_size = kernel_size 15 | if isinstance(kernel_size, tuple): 16 | self.h_kernel = kernel_size[0] 17 | self.w_kernel = kernel_size[1] 18 | else: 19 | self.h_kernel = kernel_size 20 | self.w_kernel = kernel_size 21 | 22 | self.padding = padding 23 | self.stride = stride 24 | self.dilation = dilation 25 | self.groups = groups 26 | self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 27 | stride=self.stride, dilation=self.dilation, groups=self.groups) 28 | 29 | def forward(self, x): 30 | 31 | if self.padding == 'same': 32 | 33 | height, width = x.shape[2:] 34 | 35 | h_pad_need = max(0, (height - 1) * self.stride + self.h_kernel - height) 36 | w_pad_need = max(0, (width - 1) * self.stride + self.w_kernel - width) 37 | 38 | pad_left = w_pad_need // 2 39 | pad_right = w_pad_need - pad_left 40 | pad_top = h_pad_need // 2 41 | pad_bottom = h_pad_need - pad_top 42 | 43 | padding = (pad_left, pad_right, pad_top, pad_bottom) 44 | 45 | x = F.pad(x, padding, 'constant', 0) 46 | 47 | x = self.conv(x) 48 | 49 | return x 50 | 51 | 52 | class GCN(nn.Module): 53 | """ 54 | Large Kernel Matters -- https://arxiv.org/abs/1703.02719 55 | """ 56 | def __init__(self, in_channels, out_channels, k=3): 57 | super(GCN, self).__init__() 58 | 59 | self.conv_l1 = Conv2D(in_channels=in_channels, out_channels=out_channels, kernel_size=(k, 1), padding='same') 60 | self.conv_l2 = Conv2D(in_channels=out_channels, out_channels=out_channels, kernel_size=(1, k), padding='same') 61 | 62 | self.conv_r1 = Conv2D(in_channels=in_channels, out_channels=out_channels, kernel_size=(1, k), padding='same') 63 | self.conv_r2 = Conv2D(in_channels=out_channels, out_channels=out_channels, kernel_size=(k, 1), padding='same') 64 | 65 | def forward(self, x): 66 | x1 = self.conv_l1(x) 67 | x1 = self.conv_l2(x1) 68 | 69 | x2 = self.conv_r1(x) 70 | x2 = self.conv_r2(x2) 71 | 72 | out = x1 + x2 73 | 74 | return out 75 | -------------------------------------------------------------------------------- /adet/modeling/backbone/fpn.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch.nn.functional as F 3 | import fvcore.nn.weight_init as weight_init 4 | 5 | from detectron2.modeling.backbone import FPN, build_resnet_backbone 6 | from detectron2.layers import ShapeSpec 7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 8 | 9 | from .resnet_lpf import build_resnet_lpf_backbone 10 | from .resnet_interval import build_resnet_interval_backbone 11 | from .mobilenet import build_mnv2_backbone 12 | 13 | 14 | class LastLevelP6P7(nn.Module): 15 | """ 16 | This module is used in RetinaNet and FCOS to generate extra layers, P6 and P7 from 17 | C5 or P5 feature. 
18 | """ 19 | 20 | def __init__(self, in_channels, out_channels, in_features="res5"): 21 | super().__init__() 22 | self.num_levels = 2 23 | self.in_feature = in_features 24 | self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) 25 | self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) 26 | for module in [self.p6, self.p7]: 27 | weight_init.c2_xavier_fill(module) 28 | 29 | def forward(self, x): 30 | p6 = self.p6(x) 31 | p7 = self.p7(F.relu(p6)) 32 | return [p6, p7] 33 | 34 | 35 | class LastLevelP6(nn.Module): 36 | """ 37 | This module is used in FCOS to generate extra layers 38 | """ 39 | 40 | def __init__(self, in_channels, out_channels, in_features="res5"): 41 | super().__init__() 42 | self.num_levels = 1 43 | self.in_feature = in_features 44 | self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) 45 | for module in [self.p6]: 46 | weight_init.c2_xavier_fill(module) 47 | 48 | def forward(self, x): 49 | p6 = self.p6(x) 50 | return [p6] 51 | 52 | 53 | @BACKBONE_REGISTRY.register() 54 | def build_fcos_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): 55 | """ 56 | Args: 57 | cfg: a detectron2 CfgNode 58 | 59 | Returns: 60 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 61 | """ 62 | if cfg.MODEL.BACKBONE.ANTI_ALIAS: 63 | bottom_up = build_resnet_lpf_backbone(cfg, input_shape) 64 | elif cfg.MODEL.RESNETS.DEFORM_INTERVAL > 1: 65 | bottom_up = build_resnet_interval_backbone(cfg, input_shape) 66 | elif cfg.MODEL.MOBILENET: 67 | bottom_up = build_mnv2_backbone(cfg, input_shape) 68 | else: 69 | bottom_up = build_resnet_backbone(cfg, input_shape) 70 | in_features = cfg.MODEL.FPN.IN_FEATURES 71 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 72 | top_levels = cfg.MODEL.FCOS.TOP_LEVELS 73 | in_channels_top = out_channels 74 | if top_levels == 2: 75 | top_block = LastLevelP6P7(in_channels_top, out_channels, "p5") 76 | if top_levels == 1: 77 | top_block = LastLevelP6(in_channels_top, out_channels, "p5") 78 | elif top_levels == 0: 79 | top_block = None 80 | backbone = FPN( 81 | bottom_up=bottom_up, 82 | in_features=in_features, 83 | out_channels=out_channels, 84 | norm=cfg.MODEL.FPN.NORM, 85 | top_block=top_block, 86 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 87 | ) 88 | return backbone 89 | -------------------------------------------------------------------------------- /adet/layers/csrc/DefROIAlign/DefROIAlign.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace adet { 5 | 6 | #ifdef WITH_CUDA 7 | at::Tensor DefROIAlign_forward_cuda( 8 | const at::Tensor& input, 9 | const at::Tensor& rois, 10 | const at::Tensor& offsets, // def added 11 | const float spatial_scale, 12 | const int pooled_height, 13 | const int pooled_width, 14 | const int sampling_ratio, 15 | const float trans_std, // def added 16 | bool aligned); 17 | 18 | at::Tensor DefROIAlign_backward_cuda( 19 | const at::Tensor& input, // def added 20 | const at::Tensor& grad, 21 | const at::Tensor& rois, 22 | const at::Tensor& offsets, // def added 23 | const at::Tensor& grad_offsets, // def added 24 | const float spatial_scale, 25 | const int pooled_height, 26 | const int pooled_width, 27 | const int batch_size, 28 | const int channels, 29 | const int height, 30 | const int width, 31 | const int sampling_ratio, 32 | const float trans_std, // def added 33 | bool aligned); 34 | #endif 35 | 36 | // Interface for Python 37 | inline at::Tensor DefROIAlign_forward( 38 | const at::Tensor& input, 39 | const at::Tensor& rois, 40 | const 
at::Tensor& offsets, // def added 41 | const float spatial_scale, 42 | const int pooled_height, 43 | const int pooled_width, 44 | const int sampling_ratio, 45 | const float trans_std, // def added 46 | bool aligned) { 47 | if (input.type().is_cuda()) { 48 | #ifdef WITH_CUDA 49 | return DefROIAlign_forward_cuda( 50 | input, 51 | rois, 52 | offsets, 53 | spatial_scale, 54 | pooled_height, 55 | pooled_width, 56 | sampling_ratio, 57 | trans_std, 58 | aligned); 59 | #else 60 | AT_ERROR("Not compiled with GPU support"); 61 | #endif 62 | } 63 | AT_ERROR("CPU version not supported"); 64 | } 65 | 66 | inline at::Tensor DefROIAlign_backward( 67 | const at::Tensor& input, // def added 68 | const at::Tensor& grad, 69 | const at::Tensor& rois, 70 | const at::Tensor& offsets, // def added 71 | const at::Tensor& grad_offsets, // def added 72 | const float spatial_scale, 73 | const int pooled_height, 74 | const int pooled_width, 75 | const int batch_size, 76 | const int channels, 77 | const int height, 78 | const int width, 79 | const int sampling_ratio, 80 | const float trans_std, // def added 81 | bool aligned) { 82 | if (grad.type().is_cuda()) { 83 | #ifdef WITH_CUDA 84 | return DefROIAlign_backward_cuda( 85 | input, // def added 86 | grad, 87 | rois, 88 | offsets, // def added 89 | grad_offsets, // def added 90 | spatial_scale, 91 | pooled_height, 92 | pooled_width, 93 | batch_size, 94 | channels, 95 | height, 96 | width, 97 | sampling_ratio, 98 | trans_std, // def added 99 | aligned); 100 | #else 101 | AT_ERROR("Not compiled with GPU support"); 102 | #endif 103 | } 104 | AT_ERROR("CPU version not supported"); 105 | } 106 | 107 | } // namespace adet 108 | -------------------------------------------------------------------------------- /adet/layers/naive_group_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Module, Parameter 3 | from torch.nn import init 4 | 5 | 6 | class NaiveGroupNorm(Module): 7 | r"""NaiveGroupNorm implements Group Normalization with the high-level matrix operations in PyTorch. 8 | It is a temporary solution to export GN by ONNX before the official GN can be exported by ONNX. 9 | The usage of NaiveGroupNorm is exactly the same as the official :class:`torch.nn.GroupNorm`. 10 | Args: 11 | num_groups (int): number of groups to separate the channels into 12 | num_channels (int): number of channels expected in input 13 | eps: a value added to the denominator for numerical stability. Default: 1e-5 14 | affine: a boolean value that when set to ``True``, this module 15 | has learnable per-channel affine parameters initialized to ones (for weights) 16 | and zeros (for biases). Default: ``True``. 17 | 18 | Shape: 19 | - Input: :math:`(N, C, *)` where :math:`C=\text{num\_channels}` 20 | - Output: :math:`(N, C, *)` (same shape as input) 21 | 22 | Examples:: 23 | 24 | >>> input = torch.randn(20, 6, 10, 10) 25 | >>> # Separate 6 channels into 3 groups 26 | >>> m = NaiveGroupNorm(3, 6) 27 | >>> # Separate 6 channels into 6 groups (equivalent with InstanceNorm) 28 | >>> m = NaiveGroupNorm(6, 6) 29 | >>> # Put all 6 channels into a single group (equivalent with LayerNorm) 30 | >>> m = NaiveGroupNorm(1, 6) 31 | >>> # Activating the module 32 | >>> output = m(input) 33 | 34 | .. 
_`Group Normalization`: https://arxiv.org/abs/1803.08494 35 | """ 36 | __constants__ = ['num_groups', 'num_channels', 'eps', 'affine', 'weight', 37 | 'bias'] 38 | 39 | def __init__(self, num_groups, num_channels, eps=1e-5, affine=True): 40 | super(NaiveGroupNorm, self).__init__() 41 | self.num_groups = num_groups 42 | self.num_channels = num_channels 43 | self.eps = eps 44 | self.affine = affine 45 | if self.affine: 46 | self.weight = Parameter(torch.Tensor(num_channels)) 47 | self.bias = Parameter(torch.Tensor(num_channels)) 48 | else: 49 | self.register_parameter('weight', None) 50 | self.register_parameter('bias', None) 51 | self.reset_parameters() 52 | 53 | def reset_parameters(self): 54 | if self.affine: 55 | init.ones_(self.weight) 56 | init.zeros_(self.bias) 57 | 58 | def forward(self, input): 59 | N, C, H, W = input.size() 60 | assert C % self.num_groups == 0 61 | input = input.reshape(N, self.num_groups, -1) 62 | mean = input.mean(dim=-1, keepdim=True) 63 | var = (input ** 2).mean(dim=-1, keepdim=True) - mean ** 2 64 | std = torch.sqrt(var + self.eps) 65 | 66 | input = (input - mean) / std 67 | input = input.reshape(N, C, H, W) 68 | if self.affine: 69 | input = input * self.weight.reshape(1, C, 1, 1) + self.bias.reshape(1, C, 1, 1) 70 | return input 71 | 72 | def extra_repr(self): 73 | return '{num_groups}, {num_channels}, eps={eps}, ' \ 74 | 'affine={affine}'.format(**self.__dict__) 75 | -------------------------------------------------------------------------------- /adet/modeling/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
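# Illustrative aside, not part of the original file: the NaiveGroupNorm layer listed above is
# documented as a drop-in replacement for torch.nn.GroupNorm (both initialise weight=1 and
# bias=0), so a quick numerical comparison is a natural sanity check.
import torch
from adet.layers.naive_group_norm import NaiveGroupNorm

x = torch.randn(2, 32, 8, 8)
naive = NaiveGroupNorm(num_groups=8, num_channels=32)
ref = torch.nn.GroupNorm(num_groups=8, num_channels=32)
print(torch.allclose(naive(x), ref(x), atol=1e-4))  # True, up to floating-point error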
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | 8 | from __future__ import absolute_import 9 | from __future__ import print_function 10 | from __future__ import division 11 | 12 | import torch 13 | import torch.nn.functional as F 14 | from torch.autograd import Function 15 | from torch.autograd.function import once_differentiable 16 | 17 | import MultiScaleDeformableAttention as MSDA 18 | 19 | 20 | class MSDeformAttnFunction(Function): 21 | @staticmethod 22 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 23 | ctx.im2col_step = im2col_step 24 | output = MSDA.ms_deform_attn_forward( 25 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 26 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 27 | return output 28 | 29 | @staticmethod 30 | @once_differentiable 31 | def backward(ctx, grad_output): 32 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 33 | grad_value, grad_sampling_loc, grad_attn_weight = \ 34 | MSDA.ms_deform_attn_backward( 35 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 36 | 37 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 38 | 39 | 40 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 41 | # for debug and test only, 42 | # need to use cuda version instead 43 | N_, S_, M_, D_ = value.shape 44 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 45 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 46 | sampling_grids = 2 * sampling_locations - 1 47 | sampling_value_list = [] 48 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 49 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 50 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 51 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 52 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 53 | # N_*M_, D_, Lq_, P_ 54 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 55 | mode='bilinear', padding_mode='zeros', align_corners=False) 56 | sampling_value_list.append(sampling_value_l_) 57 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 58 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 59 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 60 | return output.transpose(1, 2).contiguous() 61 | -------------------------------------------------------------------------------- /adet/utils/visualizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from detectron2.utils.visualizer import Visualizer 4 | 5 | 6 | class TextVisualizer(Visualizer): 7 | def draw_instance_predictions(self, predictions): 8 | beziers = predictions.beziers.numpy() 9 | scores = predictions.scores.tolist() 10 | recs = predictions.recs 11 | 12 | self.overlay_instances(beziers, recs, scores) 13 | 14 | return self.output 15 | 16 | def _bezier_to_poly(self, bezier): 17 
| # bezier to polygon 18 | u = np.linspace(0, 1, 20) 19 | bezier = bezier.reshape(2, 4, 2).transpose(0, 2, 1).reshape(4, 4) 20 | points = np.outer((1 - u) ** 3, bezier[:, 0]) \ 21 | + np.outer(3 * u * ((1 - u) ** 2), bezier[:, 1]) \ 22 | + np.outer(3 * (u ** 2) * (1 - u), bezier[:, 2]) \ 23 | + np.outer(u ** 3, bezier[:, 3]) 24 | points = np.concatenate((points[:, :2], points[:, 2:]), axis=0) 25 | 26 | return points 27 | 28 | def _decode_recognition(self, rec): 29 | CTLABELS = [' ','!','"','#','$','%','&','\'','(',')','*','+',',','-','.','/','0','1','2','3','4','5','6','7','8','9',':',';','<','=','>','?','@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_','`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','{','|','}','~'] 30 | 31 | s = '' 32 | for c in rec: 33 | c = int(c) 34 | if c < 95: 35 | s += CTLABELS[c] 36 | elif c == 95: 37 | s += u'口' 38 | return s 39 | 40 | def _ctc_decode_recognition(self, rec): 41 | CTLABELS = [' ','!','"','#','$','%','&','\'','(',')','*','+',',','-','.','/','0','1','2','3','4','5','6','7','8','9',':',';','<','=','>','?','@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_','`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','{','|','}','~'] 42 | 43 | # ctc decoding 44 | last_char = False 45 | s = '' 46 | for c in rec: 47 | c = int(c) 48 | if c < 95: 49 | if last_char != c: 50 | s += CTLABELS[c] 51 | last_char = c 52 | elif c == 95: 53 | s += u'口' 54 | else: 55 | last_char = False 56 | return s 57 | 58 | def overlay_instances(self, beziers, recs, scores, alpha=0.5): 59 | color = (0.1, 0.2, 0.5) 60 | 61 | for bezier, rec, score in zip(beziers, recs, scores): 62 | polygon = self._bezier_to_poly(bezier) 63 | self.draw_polygon(polygon, color, alpha=alpha) 64 | 65 | # draw text in the top left corner 66 | # text = self._decode_recognition(rec) 67 | # text = "{:.3f}: {}".format(score, text) 68 | # lighter_color = self._change_color_brightness(color, brightness_factor=0.7) 69 | # text_pos = polygon[0] 70 | # horiz_align = "left" 71 | # font_size = self._default_font_size 72 | # 73 | # self.draw_text( 74 | # text, 75 | # text_pos, 76 | # color=lighter_color, 77 | # horizontal_alignment=horiz_align, 78 | # font_size=font_size, 79 | # ) -------------------------------------------------------------------------------- /adet/data/detection_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from detectron2.data import transforms as T 7 | from detectron2.data.detection_utils import \ 8 | annotations_to_instances as d2_anno_to_inst 9 | from detectron2.data.detection_utils import \ 10 | transform_instance_annotations as d2_transform_inst_anno 11 | 12 | 13 | def transform_instance_annotations( 14 | annotation, transforms, image_size, *, keypoint_hflip_indices=None 15 | ): 16 | 17 | annotation = d2_transform_inst_anno( 18 | annotation, 19 | transforms, 20 | image_size, 21 | keypoint_hflip_indices=keypoint_hflip_indices, 22 | ) 23 | 24 | if "beziers" in annotation: 25 | beziers = transform_beziers_annotations(annotation["beziers"], transforms) 26 | annotation["beziers"] = beziers 27 | return annotation 28 | 29 | 30 | def transform_beziers_annotations(beziers, transforms): 31 | """ 32 | Transform keypoint 
annotations of an image. 33 | 34 | Args: 35 | beziers (list[float]): Nx16 float in Detectron2 Dataset format. 36 | transforms (TransformList): 37 | """ 38 | # (N*2,) -> (N, 2) 39 | beziers = np.asarray(beziers, dtype="float64").reshape(-1, 2) 40 | beziers = transforms.apply_coords(beziers).reshape(-1) 41 | 42 | # This assumes that HorizFlipTransform is the only one that does flip 43 | do_hflip = ( 44 | sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1 45 | ) 46 | if do_hflip: 47 | raise ValueError("Flipping text data is not supported (also disencouraged).") 48 | 49 | return beziers 50 | 51 | 52 | def annotations_to_instances(annos, image_size, mask_format="polygon"): 53 | instance = d2_anno_to_inst(annos, image_size, mask_format) 54 | 55 | if not annos: 56 | return instance 57 | 58 | # add attributes 59 | if "beziers" in annos[0]: 60 | beziers = [obj.get("beziers", []) for obj in annos] 61 | instance.beziers = torch.as_tensor(beziers, dtype=torch.float32) 62 | 63 | if "rec" in annos[0]: 64 | text = [obj.get("rec", []) for obj in annos] 65 | instance.text = torch.as_tensor(text, dtype=torch.int32) 66 | 67 | return instance 68 | 69 | 70 | def build_augmentation(cfg, is_train): 71 | """ 72 | With option to don't use hflip 73 | 74 | Returns: 75 | list[Augmentation] 76 | """ 77 | if is_train: 78 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 79 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 80 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 81 | else: 82 | min_size = cfg.INPUT.MIN_SIZE_TEST 83 | max_size = cfg.INPUT.MAX_SIZE_TEST 84 | sample_style = "choice" 85 | if sample_style == "range": 86 | assert ( 87 | len(min_size) == 2 88 | ), "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size)) 89 | 90 | logger = logging.getLogger(__name__) 91 | 92 | augmentation = [] 93 | augmentation.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 94 | if is_train: 95 | if cfg.INPUT.HFLIP_TRAIN: 96 | augmentation.append(T.RandomFlip()) 97 | logger.info("Augmentations used in training: " + str(augmentation)) 98 | return augmentation 99 | 100 | 101 | build_transform_gen = build_augmentation 102 | """ 103 | Alias for backward-compatibility. 
104 | """ 105 | -------------------------------------------------------------------------------- /adet/layers/csrc/BezierAlign/BezierAlign.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace adet { 5 | 6 | at::Tensor BezierAlign_forward_cpu( 7 | const at::Tensor& input, 8 | const at::Tensor& rois, 9 | const float spatial_scale, 10 | const int pooled_height, 11 | const int pooled_width, 12 | const int sampling_ratio, 13 | bool aligned); 14 | 15 | at::Tensor BezierAlign_backward_cpu( 16 | const at::Tensor& grad, 17 | const at::Tensor& rois, 18 | const float spatial_scale, 19 | const int pooled_height, 20 | const int pooled_width, 21 | const int batch_size, 22 | const int channels, 23 | const int height, 24 | const int width, 25 | const int sampling_ratio, 26 | bool aligned); 27 | 28 | #ifdef WITH_CUDA 29 | at::Tensor BezierAlign_forward_cuda( 30 | const at::Tensor& input, 31 | const at::Tensor& rois, 32 | const float spatial_scale, 33 | const int pooled_height, 34 | const int pooled_width, 35 | const int sampling_ratio, 36 | bool aligned); 37 | 38 | at::Tensor BezierAlign_backward_cuda( 39 | const at::Tensor& grad, 40 | const at::Tensor& rois, 41 | const float spatial_scale, 42 | const int pooled_height, 43 | const int pooled_width, 44 | const int batch_size, 45 | const int channels, 46 | const int height, 47 | const int width, 48 | const int sampling_ratio, 49 | bool aligned); 50 | #endif 51 | 52 | // Interface for Python 53 | inline at::Tensor BezierAlign_forward( 54 | const at::Tensor& input, 55 | const at::Tensor& rois, 56 | const float spatial_scale, 57 | const int pooled_height, 58 | const int pooled_width, 59 | const int sampling_ratio, 60 | bool aligned) { 61 | if (input.type().is_cuda()) { 62 | #ifdef WITH_CUDA 63 | return BezierAlign_forward_cuda( 64 | input, 65 | rois, 66 | spatial_scale, 67 | pooled_height, 68 | pooled_width, 69 | sampling_ratio, 70 | aligned); 71 | #else 72 | AT_ERROR("Not compiled with GPU support"); 73 | #endif 74 | } 75 | return BezierAlign_forward_cpu( 76 | input, 77 | rois, 78 | spatial_scale, 79 | pooled_height, 80 | pooled_width, 81 | sampling_ratio, 82 | aligned); 83 | } 84 | 85 | inline at::Tensor BezierAlign_backward( 86 | const at::Tensor& grad, 87 | const at::Tensor& rois, 88 | const float spatial_scale, 89 | const int pooled_height, 90 | const int pooled_width, 91 | const int batch_size, 92 | const int channels, 93 | const int height, 94 | const int width, 95 | const int sampling_ratio, 96 | bool aligned) { 97 | if (grad.type().is_cuda()) { 98 | #ifdef WITH_CUDA 99 | return BezierAlign_backward_cuda( 100 | grad, 101 | rois, 102 | spatial_scale, 103 | pooled_height, 104 | pooled_width, 105 | batch_size, 106 | channels, 107 | height, 108 | width, 109 | sampling_ratio, 110 | aligned); 111 | #else 112 | AT_ERROR("Not compiled with GPU support"); 113 | #endif 114 | } 115 | return BezierAlign_backward_cpu( 116 | grad, 117 | rois, 118 | spatial_scale, 119 | pooled_height, 120 | pooled_width, 121 | batch_size, 122 | channels, 123 | height, 124 | width, 125 | sampling_ratio, 126 | aligned); 127 | } 128 | 129 | } // namespace detectron2 130 | -------------------------------------------------------------------------------- /tools/plot_utils.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/detr 2 | import os 3 | import pandas as pd 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 
6 | from pathlib import Path, PurePath 7 | 8 | 9 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), 10 | ewm_col=0, log_name='metrics.json'): 11 | """ 12 | Function to plot specific fields from training log(s). Plots both training and test results. 13 | 14 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file 15 | - fields = which results to plot from each log file - plots both training and test for each field. 16 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots 17 | - log_name = optional, name of log file if different than default 'log.txt'. 18 | 19 | :: Outputs - matplotlib plots of results in fields, color coded for each log file. 20 | - solid lines are training results, dashed lines are test results. 21 | 22 | """ 23 | func_name = "plot_utils.py::plot_logs" 24 | 25 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, 26 | # convert single Path to list to avoid 'not iterable' error 27 | 28 | if not isinstance(logs, list): 29 | if isinstance(logs, PurePath): 30 | logs = [logs] 31 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") 32 | else: 33 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ 34 | Expect list[Path] or single Path obj, received {type(logs)}") 35 | 36 | # Quality checks - verify valid dir(s), that every item in list is Path object, and that log_name exists in each dir 37 | for i, dir in enumerate(logs): 38 | if not isinstance(dir, PurePath): 39 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") 40 | if not dir.exists(): 41 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") 42 | # verify log_name exists 43 | fn = Path(dir / log_name) 44 | if not fn.exists(): 45 | print(f"-> missing {log_name}. 
Have you gotten to Epoch 1 in training?") 46 | print(f"--> full path of missing log file: {fn}") 47 | return 48 | 49 | # load log file(s) and plot 50 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] 51 | 52 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) 53 | 54 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): 55 | for j, field in enumerate(fields): 56 | df.rolling(60).mean()[::60].interpolate().ewm(com=ewm_col).mean().plot( 57 | y=field, 58 | ax=axs[j], 59 | color=color, 60 | style='-') 61 | for ax, field in zip(axs, fields): 62 | ax.legend([Path(p).name for p in logs]) 63 | ax.set_title(field) 64 | 65 | return fig, axs 66 | 67 | 68 | def save_plot(log_path_list, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), save_name='result.png'): 69 | if isinstance(log_path_list, list): 70 | log = [Path(log_path) for log_path in log_path_list] 71 | else: 72 | log = Path(log_path_list) 73 | fig, _ = plot_logs(log, fields) 74 | fig.savefig(os.path.join(log_path_list[0], save_name)) 75 | 76 | 77 | if __name__ == '__main__': 78 | save_plot(['OSFormer', 79 | 'OSFormer-ZEROLIKE', 80 | 'OSFormer-NNEMBEDDING'], 81 | fields=('total_loss', 'loss_ins')) 82 | -------------------------------------------------------------------------------- /adet/layers/bezier_align.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.autograd import Function 3 | from torch.autograd.function import once_differentiable 4 | from torch.nn.modules.utils import _pair 5 | 6 | from adet import _C 7 | 8 | 9 | class _BezierAlign(Function): 10 | @staticmethod 11 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio, aligned): 12 | ctx.save_for_backward(roi) 13 | ctx.output_size = _pair(output_size) 14 | ctx.spatial_scale = spatial_scale 15 | ctx.sampling_ratio = sampling_ratio 16 | ctx.input_shape = input.size() 17 | ctx.aligned = aligned 18 | output = _C.bezier_align_forward( 19 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned 20 | ) 21 | return output 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | rois, = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | sampling_ratio = ctx.sampling_ratio 30 | bs, ch, h, w = ctx.input_shape 31 | grad_input = _C.bezier_align_backward( 32 | grad_output, 33 | rois, 34 | spatial_scale, 35 | output_size[0], 36 | output_size[1], 37 | bs, 38 | ch, 39 | h, 40 | w, 41 | sampling_ratio, 42 | ctx.aligned, 43 | ) 44 | return grad_input, None, None, None, None, None 45 | 46 | 47 | bezier_align = _BezierAlign.apply 48 | 49 | 50 | class BezierAlign(nn.Module): 51 | def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True): 52 | """ 53 | Args: 54 | output_size (tuple): h, w 55 | spatial_scale (float): scale the input boxes by this number 56 | sampling_ratio (int): number of inputs samples to take for each output 57 | sample. 0 to take samples densely. 58 | aligned (bool): if False, use the legacy implementation in 59 | Detectron. If True, align the results more perfectly. 60 | 61 | Note: 62 | The meaning of aligned=True: 63 | 64 | With `aligned=True`, 65 | we first appropriately scale the ROI and then shift it by -0.5 66 | prior to calling bezier_align. This produces the correct neighbors; see 67 | adet/tests/test_bezier_align.py for verification. 
68 | 69 | The difference does not make a difference to the model's performance if 70 | ROIAlign is used together with conv layers. 71 | """ 72 | super(BezierAlign, self).__init__() 73 | self.output_size = output_size 74 | self.spatial_scale = spatial_scale 75 | self.sampling_ratio = sampling_ratio 76 | self.aligned = aligned 77 | 78 | def forward(self, input, rois): 79 | """ 80 | Args: 81 | input: NCHW images 82 | rois: Bx17 boxes. First column is the index into N. The other 16 columns are [xy]x8. 83 | """ 84 | assert rois.dim() == 2 and rois.size(1) == 17 85 | return bezier_align( 86 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned 87 | ) 88 | 89 | def __repr__(self): 90 | tmpstr = self.__class__.__name__ + "(" 91 | tmpstr += "output_size=" + str(self.output_size) 92 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 93 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 94 | tmpstr += ", aligned=" + str(self.aligned) 95 | tmpstr += ")" 96 | return tmpstr 97 | -------------------------------------------------------------------------------- /demo/vis_pred_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import json 4 | from collections import defaultdict, OrderedDict 5 | 6 | import torch 7 | import pycocotools.mask as mask_util 8 | from pycocotools.coco import COCO 9 | from detectron2.structures import Instances 10 | from detectron2.utils.visualizer import Visualizer, ColorMode 11 | from detectron2.evaluation.coco_evaluation import COCOEvaluator 12 | from adet.data.datasets.cis import register_dataset 13 | 14 | 15 | def pre_process_json(json_file, score_threshold=0.3): 16 | with open(json_file, 'r') as f: 17 | data = json.load(f) 18 | 19 | data_filtered = defaultdict(list) 20 | for d in data: 21 | if d['score'] > score_threshold: 22 | data_filtered[d['image_id']].append(d) 23 | 24 | return data_filtered 25 | 26 | 27 | def data2instance(data): 28 | instances = {} 29 | for k in data.keys(): 30 | if len(data[k]) == 0: 31 | instances[k] = None; continue  # skip images with no kept detections 32 | results = Instances(data[k][0]['segmentation']['size']) 33 | scores = [] 34 | segms = [] 35 | for elem in data[k]: 36 | scores.append(elem['score']) 37 | segms.append(mask_util.decode(elem['segmentation'])) 38 | results.scores = torch.as_tensor(scores) 39 | results.pred_masks = torch.as_tensor(segms) 40 | instances[k] = results 41 | 42 | return instances 43 | 44 | 45 | def vis_single_image(img_info, ins, img_dir, output_dir, instance_mode=ColorMode.IMAGE): 46 | img_filename = img_info['file_name'] 47 | if ins is None: 48 | ins = Instances((img_info['height'], img_info['width'])) 49 | 50 | im = cv2.imread(os.path.join(img_dir, img_filename))[:, :, ::-1] 51 | visualizer = Visualizer(im, instance_mode=instance_mode) 52 | vis_output = visualizer.draw_instance_predictions(predictions=ins) 53 | out_filename = os.path.join(output_dir, img_filename) 54 | print('Save {} successfully.'.format(out_filename)) 55 | vis_output.save(out_filename) 56 | 57 | 58 | def eval_single_image(coco_eval, prediction): 59 | coco_eval.reset() 60 | coco_eval._predictions.append(prediction) 61 | return coco_eval.evaluate() 62 | 63 | 64 | def vis(res_json, dataset_name, output_dir=None, score_threshold=0.3): 65 | if output_dir is None: 66 | output_dir = os.path.dirname(res_json) 67 | 68 | vis_dir = os.path.join(output_dir, 'vis') 69 | if not os.path.exists(vis_dir): 70 | os.makedirs(vis_dir) 71 | 72 | anno_json, img_dir = datasets[dataset_name] 73 | coco =
COCO(anno_json) 74 | coco_eval = COCOEvaluator(dataset_name, output_dir=output_dir, tasks=('segm',)) 75 | 76 | data = pre_process_json(res_json, score_threshold) 77 | instances = data2instance(data) 78 | print('Get instances successfully.') 79 | 80 | eval_res = OrderedDict() 81 | for img_id, ins in instances.items(): 82 | vis_single_image(coco.imgs[img_id], ins, img_dir, vis_dir) 83 | prediction = {"image_id": img_id, "instances": data[img_id]} 84 | eval_cur = eval_single_image(coco_eval, prediction) 85 | eval_res[coco.imgs[img_id]['file_name']] = eval_cur['segm'] 86 | 87 | with open(os.path.join(output_dir, '{}_ap.json'.format(dataset_name)), 'w') as f: 88 | json.dump(eval_res, f, indent=4) 89 | 90 | 91 | if __name__ == '__main__': 92 | datasets = { 93 | "my_data_test_coco_cod_style": [ 94 | 'COD10K-v3/annotations/test2026.json', 95 | 'COD10K-v3/Test/Image/' 96 | ], 97 | "my_data_test_coco_nc4k_style": [ 98 | 'NC4K/nc4k_test.json', 99 | 'NC4K/Imgs/' 100 | ] 101 | } 102 | 103 | register_dataset() 104 | -------------------------------------------------------------------------------- /adet/layers/def_roi_align.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Function 4 | from torch.autograd.function import once_differentiable 5 | from torch.nn.modules.utils import _pair 6 | 7 | from adet import _C 8 | 9 | 10 | class _DefROIAlign(Function): 11 | @staticmethod 12 | def forward(ctx, input, roi, offsets, output_size, spatial_scale, sampling_ratio, trans_std, aligned): 13 | ctx.save_for_backward(input, roi, offsets) 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.sampling_ratio = sampling_ratio 17 | ctx.trans_std = trans_std 18 | ctx.input_shape = input.size() 19 | ctx.aligned = aligned 20 | output = _C.def_roi_align_forward( 21 | input, roi, offsets, spatial_scale, output_size[0], output_size[1], 22 | sampling_ratio, trans_std, aligned 23 | ) 24 | return output 25 | 26 | @staticmethod 27 | @once_differentiable 28 | def backward(ctx, grad_output): 29 | data, rois, offsets = ctx.saved_tensors 30 | output_size = ctx.output_size 31 | spatial_scale = ctx.spatial_scale 32 | sampling_ratio = ctx.sampling_ratio 33 | trans_std = ctx.trans_std 34 | bs, ch, h, w = ctx.input_shape 35 | grad_offsets = torch.zeros_like(offsets) 36 | 37 | grad_input = _C.def_roi_align_backward( 38 | data, 39 | grad_output, 40 | rois, 41 | offsets, 42 | grad_offsets, 43 | spatial_scale, 44 | output_size[0], 45 | output_size[1], 46 | bs, 47 | ch, 48 | h, 49 | w, 50 | sampling_ratio, 51 | trans_std, 52 | ctx.aligned, 53 | ) 54 | return grad_input, None, grad_offsets, None, None, None, None, None 55 | 56 | 57 | def_roi_align = _DefROIAlign.apply 58 | 59 | 60 | class DefROIAlign(nn.Module): 61 | def __init__(self, output_size, spatial_scale, 62 | sampling_ratio, trans_std, aligned=True): 63 | """ 64 | Args: 65 | output_size (tuple): h, w 66 | spatial_scale (float): scale the input boxes by this number 67 | sampling_ratio (int): number of inputs samples to take for each output 68 | sample. 0 to take samples densely. 69 | trans_std (float): offset scale according to the normalized roi size 70 | aligned (bool): if False, use the legacy implementation in 71 | Detectron. If True, align the results more perfectly. 
72 | """ 73 | super(DefROIAlign, self).__init__() 74 | self.output_size = output_size 75 | self.spatial_scale = spatial_scale 76 | self.sampling_ratio = sampling_ratio 77 | self.trans_std = trans_std 78 | self.aligned = aligned 79 | 80 | def forward(self, input, rois, offsets): 81 | """ 82 | Args: 83 | input: NCHW images 84 | rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy. 85 | """ 86 | assert rois.dim() == 2 and rois.size(1) == 5 87 | return def_roi_align( 88 | input, rois, offsets, self.output_size, 89 | self.spatial_scale, self.sampling_ratio, 90 | self.trans_std, self.aligned 91 | ) 92 | 93 | def __repr__(self): 94 | tmpstr = self.__class__.__name__ + "(" 95 | tmpstr += "output_size=" + str(self.output_size) 96 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 97 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 98 | tmpstr += ", trans_std=" + str(self.trans_std) 99 | tmpstr += ", aligned=" + str(self.aligned) 100 | tmpstr += ")" 101 | return tmpstr 102 | -------------------------------------------------------------------------------- /adet/data/augmentation.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from fvcore.transforms import transform as T 5 | 6 | from detectron2.data.transforms import RandomCrop, StandardAugInput 7 | from detectron2.structures import BoxMode 8 | 9 | 10 | def gen_crop_transform_with_instance(crop_size, image_size, instances, crop_box=True): 11 | """ 12 | Generate a CropTransform so that the cropping region contains 13 | the center of the given instance. 14 | 15 | Args: 16 | crop_size (tuple): h, w in pixels 17 | image_size (tuple): h, w 18 | instance (dict): an annotation dict of one instance, in Detectron2's 19 | dataset format. 20 | """ 21 | bbox = random.choice(instances) 22 | crop_size = np.asarray(crop_size, dtype=np.int32) 23 | center_yx = (bbox[1] + bbox[3]) * 0.5, (bbox[0] + bbox[2]) * 0.5 24 | assert ( 25 | image_size[0] >= center_yx[0] and image_size[1] >= center_yx[1] 26 | ), "The annotation bounding box is outside of the image!" 27 | assert ( 28 | image_size[0] >= crop_size[0] and image_size[1] >= crop_size[1] 29 | ), "Crop size is larger than image size!" 
30 | 31 | min_yx = np.maximum(np.floor(center_yx).astype(np.int32) - crop_size, 0) 32 | max_yx = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_size, 0) 33 | max_yx = np.minimum(max_yx, np.ceil(center_yx).astype(np.int32)) 34 | 35 | y0 = np.random.randint(min_yx[0], max_yx[0] + 1) 36 | x0 = np.random.randint(min_yx[1], max_yx[1] + 1) 37 | 38 | # if some instance is cropped extend the box 39 | if not crop_box: 40 | num_modifications = 0 41 | modified = True 42 | 43 | # convert crop_size to float 44 | crop_size = crop_size.astype(np.float32) 45 | while modified: 46 | modified, x0, y0, crop_size = adjust_crop(x0, y0, crop_size, instances) 47 | num_modifications += 1 48 | if num_modifications > 100: 49 | raise ValueError( 50 | "Cannot finished cropping adjustment within 100 tries (#instances {}).".format( 51 | len(instances) 52 | ) 53 | ) 54 | return T.CropTransform(0, 0, image_size[1], image_size[0]) 55 | 56 | return T.CropTransform(*map(int, (x0, y0, crop_size[1], crop_size[0]))) 57 | 58 | 59 | def adjust_crop(x0, y0, crop_size, instances, eps=1e-3): 60 | modified = False 61 | 62 | x1 = x0 + crop_size[1] 63 | y1 = y0 + crop_size[0] 64 | 65 | for bbox in instances: 66 | 67 | if bbox[0] < x0 - eps and bbox[2] > x0 + eps: 68 | crop_size[1] += x0 - bbox[0] 69 | x0 = bbox[0] 70 | modified = True 71 | 72 | if bbox[0] < x1 - eps and bbox[2] > x1 + eps: 73 | crop_size[1] += bbox[2] - x1 74 | x1 = bbox[2] 75 | modified = True 76 | 77 | if bbox[1] < y0 - eps and bbox[3] > y0 + eps: 78 | crop_size[0] += y0 - bbox[1] 79 | y0 = bbox[1] 80 | modified = True 81 | 82 | if bbox[1] < y1 - eps and bbox[3] > y1 + eps: 83 | crop_size[0] += bbox[3] - y1 84 | y1 = bbox[3] 85 | modified = True 86 | 87 | return modified, x0, y0, crop_size 88 | 89 | 90 | class RandomCropWithInstance(RandomCrop): 91 | """ Instance-aware cropping. 
92 | """ 93 | 94 | def __init__(self, crop_type, crop_size, crop_instance=True): 95 | """ 96 | Args: 97 | crop_instance (bool): if False, extend cropping boxes to avoid cropping instances 98 | """ 99 | super().__init__(crop_type, crop_size) 100 | self.crop_instance = crop_instance 101 | self.input_args = ("image", "boxes") 102 | 103 | def get_transform(self, img, boxes): 104 | image_size = img.shape[:2] 105 | crop_size = self.get_crop_size(image_size) 106 | return gen_crop_transform_with_instance( 107 | crop_size, image_size, boxes, crop_box=self.crop_instance 108 | ) 109 | -------------------------------------------------------------------------------- /tools/visualize_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import numpy as np 4 | import os 5 | from itertools import chain 6 | import cv2 7 | import tqdm 8 | from PIL import Image 9 | 10 | from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_train_loader 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data.build import filter_images_with_few_keypoints 13 | from detectron2.utils.logger import setup_logger 14 | from detectron2.utils.visualizer import Visualizer 15 | 16 | from adet.config import get_cfg 17 | from adet.data.dataset_mapper import DatasetMapperWithBasis 18 | 19 | 20 | def setup(args): 21 | cfg = get_cfg() 22 | if args.config_file: 23 | cfg.merge_from_file(args.config_file) 24 | cfg.merge_from_list(args.opts) 25 | cfg.freeze() 26 | return cfg 27 | 28 | 29 | def parse_args(in_args=None): 30 | parser = argparse.ArgumentParser(description="Visualize ground-truth data") 31 | parser.add_argument( 32 | "--source", 33 | choices=["annotation", "dataloader"], 34 | required=True, 35 | help="visualize the annotations or the data loader (with pre-processing)", 36 | ) 37 | parser.add_argument("--config-file", metavar="FILE", help="path to config file") 38 | parser.add_argument("--output-dir", default="./", help="path to output directory") 39 | parser.add_argument("--show", action="store_true", help="show output in a window") 40 | parser.add_argument( 41 | "--opts", 42 | help="Modify config options using the command-line", 43 | default=[], 44 | nargs=argparse.REMAINDER, 45 | ) 46 | return parser.parse_args(in_args) 47 | 48 | 49 | if __name__ == "__main__": 50 | args = parse_args() 51 | logger = setup_logger() 52 | logger.info("Arguments: " + str(args)) 53 | cfg = setup(args) 54 | 55 | dirname = args.output_dir 56 | os.makedirs(dirname, exist_ok=True) 57 | metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]) 58 | 59 | def output(vis, fname): 60 | if args.show: 61 | print(fname) 62 | cv2.imshow("window", vis.get_image()[:, :, ::-1]) 63 | cv2.waitKey() 64 | else: 65 | filepath = os.path.join(dirname, fname) 66 | print("Saving to {} ...".format(filepath)) 67 | vis.save(filepath) 68 | 69 | scale = 2.0 if args.show else 1.0 70 | if args.source == "dataloader": 71 | mapper = DatasetMapperWithBasis(cfg, True) 72 | train_data_loader = build_detection_train_loader(cfg, mapper) 73 | for batch in train_data_loader: 74 | for per_image in batch: 75 | # Pytorch tensor is in (C, H, W) format 76 | img = per_image["image"].permute(1, 2, 0) 77 | if cfg.INPUT.FORMAT == "BGR": 78 | img = img[:, :, [2, 1, 0]] 79 | else: 80 | img = np.asarray(Image.fromarray(img, mode=cfg.INPUT.FORMAT).convert("RGB")) 81 | 82 | visualizer = Visualizer(img, metadata=metadata, scale=scale) 83 | target_fields = 
per_image["instances"].get_fields() 84 | labels = [metadata.thing_classes[i] for i in target_fields["gt_classes"]] 85 | vis = visualizer.overlay_instances( 86 | labels=labels, 87 | boxes=target_fields.get("gt_boxes", None), 88 | masks=target_fields.get("gt_masks", None), 89 | keypoints=target_fields.get("gt_keypoints", None), 90 | ) 91 | output(vis, str(per_image["image_id"]) + ".jpg") 92 | else: 93 | dicts = list(chain.from_iterable([DatasetCatalog.get(k) for k in cfg.DATASETS.TRAIN])) 94 | if cfg.MODEL.KEYPOINT_ON: 95 | dicts = filter_images_with_few_keypoints(dicts, 1) 96 | for dic in tqdm.tqdm(dicts): 97 | img = utils.read_image(dic["file_name"], "RGB") 98 | visualizer = Visualizer(img, metadata=metadata, scale=scale) 99 | vis = visualizer.draw_dataset_dict(dic) 100 | output(vis, os.path.basename(dic["file_name"])) -------------------------------------------------------------------------------- /adet/modeling/ops/test.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | 8 | from __future__ import absolute_import 9 | from __future__ import print_function 10 | from __future__ import division 11 | 12 | import time 13 | import torch 14 | import torch.nn as nn 15 | from torch.autograd import gradcheck 16 | 17 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 18 | 19 | 20 | N, M, D = 1, 2, 2 21 | Lq, L, P = 2, 2, 2 22 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 23 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 24 | S = sum([(H*W).item() for H, W in shapes]) 25 | 26 | 27 | torch.manual_seed(3) 28 | 29 | 30 | @torch.no_grad() 31 | def check_forward_equal_with_pytorch_double(): 32 | value = torch.rand(N, S, M, D).cuda() * 0.01 33 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 34 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 35 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 36 | im2col_step = 2 37 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 38 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 39 | fwdok = torch.allclose(output_cuda, output_pytorch) 40 | max_abs_err = (output_cuda - output_pytorch).abs().max() 41 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 42 | 43 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 44 | 45 | 46 | @torch.no_grad() 47 | def check_forward_equal_with_pytorch_float(): 48 | value = torch.rand(N, S, M, D).cuda() * 0.01 49 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 50 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 51 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 52 | im2col_step = 2 53 | output_pytorch = 
ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 54 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 55 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 56 | max_abs_err = (output_cuda - output_pytorch).abs().max() 57 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 58 | 59 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 60 | 61 | 62 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 63 | 64 | value = torch.rand(N, S, M, channels).cuda() * 0.01 65 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 66 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 67 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 68 | im2col_step = 2 69 | func = MSDeformAttnFunction.apply 70 | 71 | value.requires_grad = grad_value 72 | sampling_locations.requires_grad = grad_sampling_loc 73 | attention_weights.requires_grad = grad_attn_weight 74 | 75 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 76 | 77 | print(f'* {gradok} check_gradient_numerical(D={channels})') 78 | 79 | 80 | if __name__ == '__main__': 81 | check_forward_equal_with_pytorch_double() 82 | check_forward_equal_with_pytorch_float() 83 | 84 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 85 | check_gradient_numerical(channels, True, True, True) 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /adet/layers/deform_conv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from detectron2.layers import Conv2d 5 | 6 | 7 | class _NewEmptyTensorOp(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, x, new_shape): 10 | ctx.shape = x.shape 11 | return x.new_empty(new_shape) 12 | 13 | @staticmethod 14 | def backward(ctx, grad): 15 | shape = ctx.shape 16 | return _NewEmptyTensorOp.apply(grad, shape), None 17 | 18 | 19 | class DFConv2d(nn.Module): 20 | """ 21 | Deformable convolutional layer with configurable 22 | deformable groups, dilations and groups. 
23 | 24 | Code is from: 25 | https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/layers/misc.py 26 | 27 | 28 | """ 29 | def __init__( 30 | self, 31 | in_channels, 32 | out_channels, 33 | with_modulated_dcn=True, 34 | kernel_size=3, 35 | stride=1, 36 | groups=1, 37 | dilation=1, 38 | deformable_groups=1, 39 | bias=False, 40 | padding=None 41 | ): 42 | super(DFConv2d, self).__init__() 43 | if isinstance(kernel_size, (list, tuple)): 44 | assert isinstance(stride, (list, tuple)) 45 | assert isinstance(dilation, (list, tuple)) 46 | assert len(kernel_size) == 2 47 | assert len(stride) == 2 48 | assert len(dilation) == 2 49 | padding = ( 50 | dilation[0] * (kernel_size[0] - 1) // 2, 51 | dilation[1] * (kernel_size[1] - 1) // 2 52 | ) 53 | offset_base_channels = kernel_size[0] * kernel_size[1] 54 | else: 55 | padding = dilation * (kernel_size - 1) // 2 56 | offset_base_channels = kernel_size * kernel_size 57 | if with_modulated_dcn: 58 | from detectron2.layers.deform_conv import ModulatedDeformConv 59 | offset_channels = offset_base_channels * 3 # default: 27 60 | conv_block = ModulatedDeformConv 61 | else: 62 | from detectron2.layers.deform_conv import DeformConv 63 | offset_channels = offset_base_channels * 2 # default: 18 64 | conv_block = DeformConv 65 | self.offset = Conv2d( 66 | in_channels, 67 | deformable_groups * offset_channels, 68 | kernel_size=kernel_size, 69 | stride=stride, 70 | padding=padding, 71 | groups=1, 72 | dilation=dilation 73 | ) 74 | for l in [self.offset, ]: 75 | nn.init.kaiming_uniform_(l.weight, a=1) 76 | torch.nn.init.constant_(l.bias, 0.) 77 | self.conv = conv_block( 78 | in_channels, 79 | out_channels, 80 | kernel_size=kernel_size, 81 | stride=stride, 82 | padding=padding, 83 | dilation=dilation, 84 | groups=groups, 85 | deformable_groups=deformable_groups, 86 | bias=bias 87 | ) 88 | self.with_modulated_dcn = with_modulated_dcn 89 | self.kernel_size = kernel_size 90 | self.stride = stride 91 | self.padding = padding 92 | self.dilation = dilation 93 | self.offset_split = offset_base_channels * deformable_groups * 2 94 | 95 | def forward(self, x, return_offset=False): 96 | if x.numel() > 0: 97 | if not self.with_modulated_dcn: 98 | offset_mask = self.offset(x) 99 | x = self.conv(x, offset_mask) 100 | else: 101 | offset_mask = self.offset(x) 102 | offset = offset_mask[:, :self.offset_split, :, :] 103 | mask = offset_mask[:, self.offset_split:, :, :].sigmoid() 104 | x = self.conv(x, offset, mask) 105 | if return_offset: 106 | return x, offset_mask 107 | return x 108 | # get output shape 109 | output_shape = [ 110 | (i + 2 * p - (di * (k - 1) + 1)) // d + 1 111 | for i, p, di, k, d in zip( 112 | x.shape[-2:], 113 | self.padding, 114 | self.dilation, 115 | self.kernel_size, 116 | self.stride 117 | ) 118 | ] 119 | output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape 120 | return _NewEmptyTensorOp.apply(x, output_shape) 121 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import glob 4 | import os 5 | from setuptools import find_packages, setup 6 | import torch 7 | from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension 8 | 9 | torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] 10 | assert torch_ver >= [1, 3], "Requires PyTorch >= 1.3" 11 | 12 | 13 | def get_adet_extensions(): 14 | this_dir = os.path.dirname(os.path.abspath(__file__)) 
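# Illustrative aside, not part of the original file: minimal use of the DFConv2d layer from
# adet/layers/deform_conv.py (listed above). detectron2's (Modulated)DeformConv kernels are
# GPU-only, so this sketch assumes a CUDA build of detectron2.
import torch
from adet.layers.deform_conv import DFConv2d

dcn = DFConv2d(64, 128, with_modulated_dcn=True, kernel_size=3, stride=1).cuda()
x = torch.randn(2, 64, 32, 32, device="cuda")
y = dcn(x)      # the offset (and mask) maps are predicted by the internal offset conv
print(y.shape)  # torch.Size([2, 128, 32, 32])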
15 | extensions_dir = os.path.join(this_dir, "adet", "layers", "csrc") 16 | 17 | main_source = os.path.join(extensions_dir, "vision.cpp") 18 | sources = glob.glob(os.path.join(extensions_dir, "**", "*.cpp")) 19 | source_cuda = glob.glob(os.path.join(extensions_dir, "**", "*.cu")) + glob.glob( 20 | os.path.join(extensions_dir, "*.cu") 21 | ) 22 | 23 | sources = [main_source] + sources 24 | 25 | extension = CppExtension 26 | 27 | extra_compile_args = {"cxx": []} 28 | define_macros = [] 29 | 30 | if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": 31 | extension = CUDAExtension 32 | sources += source_cuda 33 | define_macros += [("WITH_CUDA", None)] 34 | extra_compile_args["nvcc"] = [ 35 | "-DCUDA_HAS_FP16=1", 36 | "-D__CUDA_NO_HALF_OPERATORS__", 37 | "-D__CUDA_NO_HALF_CONVERSIONS__", 38 | "-D__CUDA_NO_HALF2_OPERATORS__", 39 | ] 40 | 41 | # It's better if pytorch can do this by default .. 42 | CC = os.environ.get("CC", None) 43 | if CC is not None: 44 | extra_compile_args["nvcc"].append("-ccbin={}".format(CC)) 45 | 46 | sources = [os.path.join(extensions_dir, s) for s in sources] 47 | include_dirs = [extensions_dir] 48 | ext_modules = [ 49 | extension( 50 | "adet._C", 51 | sources, 52 | include_dirs=include_dirs, 53 | define_macros=define_macros, 54 | extra_compile_args=extra_compile_args, 55 | ) 56 | ] 57 | 58 | return ext_modules 59 | 60 | 61 | def get_deformable_extensions(): 62 | this_dir = os.path.dirname(os.path.abspath(__file__)) 63 | extensions_dir = os.path.join(this_dir, "adet", "modeling", "ops", "src") 64 | 65 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 66 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 67 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 68 | 69 | sources = main_file + source_cpu 70 | extension = CppExtension 71 | extra_compile_args = {"cxx": []} 72 | define_macros = [] 73 | 74 | if torch.cuda.is_available() and CUDA_HOME is not None: 75 | extension = CUDAExtension 76 | sources += source_cuda 77 | define_macros += [("WITH_CUDA", None)] 78 | extra_compile_args["nvcc"] = [ 79 | "-DCUDA_HAS_FP16=1", 80 | "-D__CUDA_NO_HALF_OPERATORS__", 81 | "-D__CUDA_NO_HALF_CONVERSIONS__", 82 | "-D__CUDA_NO_HALF2_OPERATORS__", 83 | ] 84 | else: 85 | raise NotImplementedError('CUDA is not available') 86 | 87 | sources = [os.path.join(extensions_dir, s) for s in sources] 88 | include_dirs = [extensions_dir] 89 | ext_modules = [ 90 | extension( 91 | "MultiScaleDeformableAttention", 92 | sources, 93 | include_dirs=include_dirs, 94 | define_macros=define_macros, 95 | extra_compile_args=extra_compile_args, 96 | ) 97 | ] 98 | return ext_modules 99 | 100 | 101 | setup( 102 | name="AdelaiDet", 103 | version="0.2.0", 104 | author="Adelaide Intelligent Machines", 105 | url="https://github.com/stanstarks/AdelaiDet", 106 | description="AdelaiDet is AIM's research " 107 | "platform for instance-level detection tasks based on Detectron2.", 108 | packages=find_packages(exclude=("configs", "tests")), 109 | python_requires=">=3.6", 110 | install_requires=[ 111 | "setuptools==59.5.0", 112 | "termcolor>=1.1", 113 | "Pillow>=6.0", 114 | "yacs>=0.1.6", 115 | "tabulate", 116 | "cloudpickle", 117 | "matplotlib", 118 | "tqdm>4.29.0", 119 | "tensorboard", 120 | "python-Levenshtein", 121 | "Polygon3", 122 | "shapely", 123 | "kornia==0.6.8", 124 | "opencv-python", 125 | "timm" 126 | ], 127 | extras_require={"all": ["psutil"]}, 128 | ext_modules=get_adet_extensions() + get_deformable_extensions(), 129 |
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 130 | ) 131 | 132 | -------------------------------------------------------------------------------- /adet/modeling/backbone/lpf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.parallel 3 | import numpy as np 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class Downsample(nn.Module): 9 | def __init__(self, pad_type='reflect', filt_size=3, stride=2, channels=None, pad_off=0): 10 | super(Downsample, self).__init__() 11 | self.filt_size = filt_size 12 | self.pad_off = pad_off 13 | self.pad_sizes = [int(1.*(filt_size-1)/2), int(np.ceil(1.*(filt_size-1)/2)), int(1.*(filt_size-1)/2), int(np.ceil(1.*(filt_size-1)/2))] 14 | self.pad_sizes = [pad_size+pad_off for pad_size in self.pad_sizes] 15 | self.stride = stride 16 | self.off = int((self.stride-1)/2.) 17 | self.channels = channels 18 | 19 | # print('Filter size [%i]'%filt_size) 20 | if(self.filt_size==1): 21 | a = np.array([1.,]) 22 | elif(self.filt_size==2): 23 | a = np.array([1., 1.]) 24 | elif(self.filt_size==3): 25 | a = np.array([1., 2., 1.]) 26 | elif(self.filt_size==4): 27 | a = np.array([1., 3., 3., 1.]) 28 | elif(self.filt_size==5): 29 | a = np.array([1., 4., 6., 4., 1.]) 30 | elif(self.filt_size==6): 31 | a = np.array([1., 5., 10., 10., 5., 1.]) 32 | elif(self.filt_size==7): 33 | a = np.array([1., 6., 15., 20., 15., 6., 1.]) 34 | 35 | filt = torch.Tensor(a[:,None]*a[None,:]) 36 | filt = filt/torch.sum(filt) 37 | self.register_buffer('filt', filt[None,None,:,:].repeat((self.channels,1,1,1))) 38 | 39 | self.pad = get_pad_layer(pad_type)(self.pad_sizes) 40 | 41 | def forward(self, inp): 42 | if(self.filt_size==1): 43 | if(self.pad_off==0): 44 | return inp[:,:,::self.stride,::self.stride] 45 | else: 46 | return self.pad(inp)[:,:,::self.stride,::self.stride] 47 | else: 48 | return F.conv2d(self.pad(inp), self.filt, stride=self.stride, groups=inp.shape[1]) 49 | 50 | def get_pad_layer(pad_type): 51 | if(pad_type in ['refl','reflect']): 52 | PadLayer = nn.ReflectionPad2d 53 | elif(pad_type in ['repl','replicate']): 54 | PadLayer = nn.ReplicationPad2d 55 | elif(pad_type=='zero'): 56 | PadLayer = nn.ZeroPad2d 57 | else: 58 | print('Pad type [%s] not recognized'%pad_type) 59 | return PadLayer 60 | 61 | 62 | class Downsample1D(nn.Module): 63 | def __init__(self, pad_type='reflect', filt_size=3, stride=2, channels=None, pad_off=0): 64 | super(Downsample1D, self).__init__() 65 | self.filt_size = filt_size 66 | self.pad_off = pad_off 67 | self.pad_sizes = [int(1. * (filt_size - 1) / 2), int(np.ceil(1. * (filt_size - 1) / 2))] 68 | self.pad_sizes = [pad_size + pad_off for pad_size in self.pad_sizes] 69 | self.stride = stride 70 | self.off = int((self.stride - 1) / 2.) 
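        # The kernel built below is a length-`filt_size` row of Pascal's triangle,
        # normalized to sum to 1, i.e. a binomial low-pass (blur) filter applied
        # before the strided subsampling in forward().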
71 | self.channels = channels 72 | 73 | # print('Filter size [%i]' % filt_size) 74 | if(self.filt_size == 1): 75 | a = np.array([1., ]) 76 | elif(self.filt_size == 2): 77 | a = np.array([1., 1.]) 78 | elif(self.filt_size == 3): 79 | a = np.array([1., 2., 1.]) 80 | elif(self.filt_size == 4): 81 | a = np.array([1., 3., 3., 1.]) 82 | elif(self.filt_size == 5): 83 | a = np.array([1., 4., 6., 4., 1.]) 84 | elif(self.filt_size == 6): 85 | a = np.array([1., 5., 10., 10., 5., 1.]) 86 | elif(self.filt_size == 7): 87 | a = np.array([1., 6., 15., 20., 15., 6., 1.]) 88 | 89 | filt = torch.Tensor(a) 90 | filt = filt / torch.sum(filt) 91 | self.register_buffer('filt', filt[None, None, :].repeat((self.channels, 1, 1))) 92 | 93 | self.pad = get_pad_layer_1d(pad_type)(self.pad_sizes) 94 | 95 | def forward(self, inp): 96 | if(self.filt_size == 1): 97 | if(self.pad_off == 0): 98 | return inp[:, :, ::self.stride] 99 | else: 100 | return self.pad(inp)[:, :, ::self.stride] 101 | else: 102 | return F.conv1d(self.pad(inp), self.filt, stride=self.stride, groups=inp.shape[1]) 103 | 104 | 105 | def get_pad_layer_1d(pad_type): 106 | if(pad_type in ['refl', 'reflect']): 107 | PadLayer = nn.ReflectionPad1d 108 | elif(pad_type in ['repl', 'replicate']): 109 | PadLayer = nn.ReplicationPad1d 110 | elif(pad_type == 'zero'): 111 | PadLayer = nn.ZeroPad1d 112 | else: 113 | print('Pad type [%s] not recognized' % pad_type) 114 | return PadLayer 115 | -------------------------------------------------------------------------------- /tools/visualize_feat.py: -------------------------------------------------------------------------------- 1 | from detectron2.utils.logger import setup_logger 2 | setup_logger() 3 | 4 | # import some common libraries 5 | import os, cv2 6 | import matplotlib.pyplot as plt 7 | 8 | # import some common detectron2 utilities 9 | from detectron2.engine import DefaultPredictor 10 | 11 | from adet.config import get_cfg 12 | 13 | YML_PATH = '' 14 | WEIGHTS = '' 15 | OUTPATH = '' 16 | if not os.path.exists(OUTPATH): 17 | os.makedirs(OUTPATH) 18 | 19 | 20 | def setup(yml_path, weights): 21 | cfg = get_cfg() 22 | cfg.merge_from_file(yml_path) 23 | cfg.MODEL.WEIGHTS = weights 24 | cfg.MODEL.OSFormer.UPDATE_THR = 0.5 25 | cfg.freeze() 26 | predictor = DefaultPredictor(cfg) 27 | return cfg, predictor 28 | 29 | 30 | def vis_features(feat): 31 | feat = feat.squeeze(0) 32 | return feat.square().sum(0) 33 | 34 | 35 | def visualize(im_path, cfg, predictor, out_path): 36 | im_name = os.path.basename(im_path).split('.')[0] 37 | 38 | model = predictor.model 39 | 40 | im = cv2.imread(im_path) 41 | im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) 42 | 43 | conv_features = [] 44 | trans_features = [] 45 | camin_features = [] 46 | mask_features = [] 47 | 48 | hooks = [ 49 | # backbone feature 50 | model.backbone.res2.register_forward_hook( 51 | lambda self, input, output: conv_features.append(output) 52 | ), 53 | model.backbone.res3.register_forward_hook( 54 | lambda self, input, output: conv_features.append(output) 55 | ), 56 | model.backbone.res4.register_forward_hook( 57 | lambda self, input, output: conv_features.append(output) 58 | ), 59 | model.backbone.res5.register_forward_hook( 60 | lambda self, input, output: conv_features.append(output) 61 | ), 62 | 63 | # trans feature 64 | model.cate_head.trans_encoder.encoder.layers[5].register_forward_hook( 65 | lambda self, input, output: trans_features.append(output) 66 | ), 67 | 68 | # mask head feature 69 | model.mask_head.register_forward_hook( 70 | lambda self, input, output: 
mask_features.append(output) 71 | ), 72 | 73 | # camin feature 74 | model.dcin.register_forward_hook( 75 | lambda self, input, output: camin_features.append(output) 76 | ) 77 | ] 78 | 79 | outputs = predictor(im) 80 | 81 | for hook in hooks: 82 | hook.remove() 83 | 84 | # save res feats, res2-res5 85 | spatial_shapes = [] 86 | spatial_sizes = [] 87 | for idx, elem in enumerate(conv_features): 88 | cur_feat = vis_features(elem).cpu().numpy() 89 | spatial_shapes.append(tuple(cur_feat.shape)) 90 | spatial_sizes.append(cur_feat.shape[0] * cur_feat.shape[1]) 91 | plt.axis('off') 92 | plt.imshow(cur_feat) 93 | plt.savefig(os.path.join(out_path, 'vis_res{}_{}.pdf'.format(idx + 2, im_name)), bbox_inches='tight', pad_inches=0.0) 94 | print(os.path.join(out_path, 'vis_res{}_{}.pdf'.format(idx + 2, im_name))) 95 | 96 | # save trans feats, trans3-trans5 97 | for idx, elem, (x, y) in zip(range(len(spatial_shapes) - 1), trans_features[0].split(spatial_sizes[1:], 1), spatial_shapes[1:]): 98 | feat = vis_features(elem.permute(0, 2, 1).view(1, -1, x, y)).cpu().numpy() 99 | plt.axis('off') 100 | plt.imshow(feat) 101 | plt.savefig(os.path.join(out_path, 'vis_trans{}_{}.pdf'.format(idx + 3, im_name)), bbox_inches='tight', pad_inches=0.0) 102 | print(os.path.join(out_path, 'vis_trans{}_{}.pdf'.format(idx + 3, im_name))) 103 | 104 | # save camin output features 105 | camin_feats = camin_features[0].squeeze(0).cpu().numpy() 106 | for i in range(camin_feats.shape[0]): 107 | feat = camin_feats[i] 108 | plt.cla() # ref https://stackoverflow.com/questions/8213522/when-to-use-cla-clf-or-close-for-clearing-a-plot-in-matplotlib 109 | plt.axis('off') 110 | plt.imshow(feat) 111 | plt.savefig(os.path.join(out_path, 'vis_dcin{}_{}.pdf'.format(i, im_name)), bbox_inches='tight', pad_inches=0.0) 112 | print(os.path.join(out_path, 'vis_dcin{}_{}.pdf'.format(i, im_name))) 113 | 114 | # save mask features 115 | mask_feats = vis_features(mask_features[0][0]).cpu().numpy() 116 | plt.imshow(mask_feats) 117 | plt.savefig(os.path.join(out_path, 'vis_maskhead_{}.pdf'.format(im_name)), bbox_inches='tight', pad_inches=0.0) 118 | print(os.path.join(out_path, 'vis_maskhead_{}.pdf'.format(im_name))) 119 | 120 | # save rea edges 121 | for i in range(len(mask_features[0][1])): 122 | feat = mask_features[0][1][i].squeeze().cpu().numpy() 123 | plt.axis('off') 124 | plt.imshow(feat) 125 | plt.savefig(os.path.join(out_path, 'vis_rea_edge{}_{}.pdf'.format(i, im_name)), bbox_inches='tight', pad_inches=0.0) 126 | print(os.path.join(out_path, 'vis_rea_edge{}_{}.pdf'.format(i, im_name))) 127 | -------------------------------------------------------------------------------- /adet/modeling/backbone/resnet_interval.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import FrozenBatchNorm2d 2 | from detectron2.modeling.backbone import BACKBONE_REGISTRY 3 | from detectron2.modeling.backbone.resnet import ( 4 | BasicStem, 5 | DeformBottleneckBlock, 6 | BottleneckBlock, 7 | ResNet, 8 | ) 9 | 10 | 11 | def make_stage_intervals(block_class, num_blocks, first_stride, **kwargs): 12 | """ 13 | Create a resnet stage by creating many blocks. 14 | Args: 15 | block_class (class): a subclass of ResNetBlockBase 16 | num_blocks (int): 17 | first_stride (int): the stride of the first block. The other blocks will have stride=1. 18 | A `stride` argument will be passed to the block constructor. 19 | kwargs: other arguments passed to the block constructor. 
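        deform_interval (int, optional): if provided via kwargs, every
            `deform_interval`-th block (indices 0, deform_interval, 2*deform_interval, ...)
            is built with `block_class`, and the remaining blocks fall back to a
            plain BottleneckBlock.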
20 | 21 | Returns: 22 | list[nn.Module]: a list of block module. 23 | """ 24 | blocks = [] 25 | conv_kwargs = {key: kwargs[key] for key in kwargs if "deform" not in key} 26 | deform_kwargs = {key: kwargs[key] for key in kwargs if key != "deform_interval"} 27 | deform_interval = kwargs.get("deform_interval", None) 28 | for i in range(num_blocks): 29 | if deform_interval and i % deform_interval == 0: 30 | blocks.append(block_class(stride=first_stride if i == 0 else 1, **deform_kwargs)) 31 | else: 32 | blocks.append(BottleneckBlock(stride=first_stride if i == 0 else 1, **conv_kwargs)) 33 | conv_kwargs["in_channels"] = conv_kwargs["out_channels"] 34 | deform_kwargs["in_channels"] = deform_kwargs["out_channels"] 35 | return blocks 36 | 37 | 38 | @BACKBONE_REGISTRY.register() 39 | def build_resnet_interval_backbone(cfg, input_shape): 40 | """ 41 | Create a ResNet instance from config. 42 | 43 | Returns: 44 | ResNet: a :class:`ResNet` instance. 45 | """ 46 | # need registration of new blocks/stems? 47 | norm = cfg.MODEL.RESNETS.NORM 48 | stem = BasicStem( 49 | in_channels=input_shape.channels, 50 | out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, 51 | norm=norm, 52 | ) 53 | freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT 54 | 55 | if freeze_at >= 1: 56 | for p in stem.parameters(): 57 | p.requires_grad = False 58 | stem = FrozenBatchNorm2d.convert_frozen_batchnorm(stem) 59 | 60 | # fmt: off 61 | out_features = cfg.MODEL.RESNETS.OUT_FEATURES 62 | depth = cfg.MODEL.RESNETS.DEPTH 63 | num_groups = cfg.MODEL.RESNETS.NUM_GROUPS 64 | width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP 65 | bottleneck_channels = num_groups * width_per_group 66 | in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS 67 | out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS 68 | stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 69 | res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION 70 | deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE 71 | deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED 72 | deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS 73 | deform_interval = cfg.MODEL.RESNETS.DEFORM_INTERVAL 74 | # fmt: on 75 | assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation) 76 | 77 | num_blocks_per_stage = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}[depth] 78 | 79 | stages = [] 80 | 81 | # Avoid creating variables without gradients 82 | # It consumes extra memory and may cause allreduce to fail 83 | out_stage_idx = [{"res2": 2, "res3": 3, "res4": 4, "res5": 5}[f] for f in out_features] 84 | max_stage_idx = max(out_stage_idx) 85 | for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)): 86 | dilation = res5_dilation if stage_idx == 5 else 1 87 | first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2 88 | stage_kargs = { 89 | "num_blocks": num_blocks_per_stage[idx], 90 | "first_stride": first_stride, 91 | "in_channels": in_channels, 92 | "bottleneck_channels": bottleneck_channels, 93 | "out_channels": out_channels, 94 | "num_groups": num_groups, 95 | "norm": norm, 96 | "stride_in_1x1": stride_in_1x1, 97 | "dilation": dilation, 98 | } 99 | if deform_on_per_stage[idx]: 100 | stage_kargs["block_class"] = DeformBottleneckBlock 101 | stage_kargs["deform_modulated"] = deform_modulated 102 | stage_kargs["deform_num_groups"] = deform_num_groups 103 | stage_kargs["deform_interval"] = deform_interval 104 | else: 105 | stage_kargs["block_class"] = BottleneckBlock 106 | blocks = make_stage_intervals(**stage_kargs) 107 | in_channels = out_channels 108 | out_channels 
*= 2 109 | bottleneck_channels *= 2 110 | 111 | if freeze_at >= stage_idx: 112 | for block in blocks: 113 | block.freeze() 114 | stages.append(blocks) 115 | return ResNet(stem, stages, out_features=out_features) 116 | -------------------------------------------------------------------------------- /adet/modeling/osformer/trans_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn.init import xavier_uniform_, constant_, normal_ 4 | 5 | from adet.modeling.ops.modules.ms_deform_attn import MSDeformAttn 6 | from .trans_utils import _get_clones, get_reference_points, with_pos_embed 7 | from .feed_forward import get_ffn 8 | 9 | 10 | class CISTransformerEncoder(nn.Module): 11 | def __init__(self, d_model=256, nhead=8, 12 | num_encoder_layers=6, dim_feedforward=1024, dropout=0.1, 13 | ffn_type="default", num_feature_levels=4, enc_n_points=4): 14 | super().__init__() 15 | 16 | self.d_model = d_model 17 | self.nhead = nhead 18 | 19 | encoder_layer = TransformerEncoderLayer(d_model, dim_feedforward, 20 | dropout, ffn_type, 21 | num_feature_levels, nhead, enc_n_points) 22 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers) 23 | self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) 24 | self.reference_points = nn.Linear(d_model, 2) 25 | 26 | self._reset_parameters() 27 | 28 | def _reset_parameters(self): 29 | for p in self.parameters(): 30 | if p.dim() > 1: 31 | nn.init.xavier_uniform_(p) 32 | for m in self.modules(): 33 | if isinstance(m, MSDeformAttn): 34 | m._reset_parameters() 35 | xavier_uniform_(self.reference_points.weight.data, gain=1.0) 36 | constant_(self.reference_points.bias.data, 0.) 37 | normal_(self.level_embed) 38 | 39 | def forward(self, srcs, pos_embeds): 40 | # prepare input for encoder 41 | src_flatten = [] 42 | lvl_pos_embed_flatten = [] 43 | spatial_shapes = [] 44 | for lvl, (src, pos_embed) in enumerate(zip(srcs, pos_embeds)): 45 | bs, c, h, w = src.shape 46 | spatial_shape = (h, w) 47 | spatial_shapes.append(spatial_shape) 48 | src = src.flatten(2).transpose(1, 2) 49 | pos_embed = pos_embed.flatten(2).transpose(1, 2) 50 | lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) 51 | lvl_pos_embed_flatten.append(lvl_pos_embed) 52 | src_flatten.append(src) 53 | src_flatten = torch.cat(src_flatten, 1) 54 | lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) 55 | spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) 56 | level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) 57 | 58 | # encoder 59 | memory = self.encoder(src_flatten, spatial_shapes, level_start_index, lvl_pos_embed_flatten) 60 | 61 | return memory, level_start_index 62 | 63 | 64 | class TransformerEncoderLayer(nn.Module): 65 | def __init__(self, 66 | d_model=256, d_ffn=1024, 67 | dropout=0.1, ffn_type="default", 68 | n_levels=4, n_heads=8, n_points=4): 69 | super().__init__() 70 | 71 | # self attention 72 | self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) 73 | self.dropout1 = nn.Dropout(dropout) 74 | self.norm1 = nn.LayerNorm(d_model) 75 | 76 | # ffn 77 | self.ffn = get_ffn(d_model, ffn_type) 78 | 79 | def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None): 80 | # self attention 81 | src2 = self.self_attn(with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask) 82 | src = 
src + self.dropout1(src2) 83 | src = self.norm1(src) # (bs, w*h, dim) 84 | 85 | # ffn 86 | src = self.ffn(src, spatial_shapes, level_start_index) 87 | 88 | return src 89 | 90 | 91 | class TransformerEncoder(nn.Module): 92 | def __init__(self, encoder_layer, num_layers): 93 | super().__init__() 94 | self.layers = _get_clones(encoder_layer, num_layers) 95 | self.num_layers = num_layers 96 | 97 | def forward(self, src, spatial_shapes, level_start_index, pos=None): 98 | output = src 99 | batch_size = src.shape[0] 100 | reference_points = get_reference_points(spatial_shapes, batch_size, device=src.device) 101 | for _, layer in enumerate(self.layers): 102 | output = layer(output, pos, reference_points, spatial_shapes, level_start_index) 103 | 104 | return output 105 | 106 | 107 | def build_transformer_encoder(cfg): 108 | return CISTransformerEncoder( 109 | d_model=cfg.MODEL.OSFormer.HIDDEN_DIM, 110 | nhead=cfg.MODEL.OSFormer.NHEAD, 111 | num_encoder_layers=cfg.MODEL.OSFormer.ENC_LAYERS, 112 | dim_feedforward=cfg.MODEL.OSFormer.DIM_FEEDFORWARD, 113 | dropout=0.1, 114 | ffn_type=cfg.MODEL.OSFormer.FFN, 115 | num_feature_levels=len(cfg.MODEL.OSFormer.FEAT_INSTANCE_STRIDES), 116 | enc_n_points=cfg.MODEL.OSFormer.ENC_POINTS) 117 | 118 | 119 | -------------------------------------------------------------------------------- /adet/modeling/osformer/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from fvcore.nn import sigmoid_focal_loss_jit 5 | 6 | 7 | def dice_loss(input, target): 8 | input = input.contiguous().view(input.size()[0], -1) 9 | target = target.contiguous().view(target.size()[0], -1).float() 10 | 11 | a = torch.sum(input * target, 1) 12 | b = torch.sum(input * input, 1) + 0.001 13 | c = torch.sum(target * target, 1) + 0.001 14 | d = (2 * a) / (b + c) 15 | return 1 - d 16 | 17 | def giou_loss(input, target): 18 | input = input.contiguous().view(input.size()[0], -1) 19 | target = target.contiguous().view(target.size()[0], -1).float() 20 | 21 | inters = torch.sum(input * target, 1) 22 | b = torch.sum(input * input, 1) + 0.001 23 | c = torch.sum(target * target, 1) + 0.001 24 | uni = b + c 25 | 26 | # ious 27 | ious = inters / uni 28 | loss = 1 - ious 29 | 30 | return loss 31 | 32 | 33 | def reduce_loss(loss, reduction): 34 | """Reduce loss as specified. 35 | Args: 36 | loss (Tensor): Elementwise loss tensor. 37 | reduction (str): Options are "none", "mean" and "sum". 38 | Return: 39 | Tensor: Reduced loss tensor. 40 | """ 41 | reduction_enum = F._Reduction.get_enum(reduction) 42 | # none: 0, elementwise_mean:1, sum: 2 43 | if reduction_enum == 0: 44 | return loss 45 | elif reduction_enum == 1: 46 | return loss.mean() 47 | elif reduction_enum == 2: 48 | return loss.sum() 49 | 50 | 51 | def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): 52 | """Apply element-wise weight and reduce loss. 53 | Args: 54 | loss (Tensor): Element-wise loss. 55 | weight (Tensor): Element-wise weights. 56 | reduction (str): Same as built-in losses of PyTorch. 57 | avg_factor (float): Avarage factor when computing the mean of losses. 58 | Returns: 59 | Tensor: Processed loss values. 
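        Note: `avg_factor` only takes effect together with reduction='mean'
        (the weighted loss is summed and divided by `avg_factor`); combining it
        with reduction='sum' raises a ValueError.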
60 | """ 61 | # if weight is specified, apply element-wise weight 62 | if weight is not None: 63 | loss = loss * weight 64 | 65 | # if avg_factor is not specified, just reduce the loss 66 | if avg_factor is None: 67 | loss = reduce_loss(loss, reduction) 68 | else: 69 | # if reduction is mean, then average the loss by avg_factor 70 | if reduction == 'mean': 71 | loss = loss.sum() / avg_factor 72 | # if reduction is 'none', then do nothing, otherwise raise an error 73 | elif reduction != 'none': 74 | raise ValueError('avg_factor can not be used with reduction="sum"') 75 | return loss 76 | 77 | 78 | def sigmoid_focal_loss(pred, 79 | target, 80 | weight=None, 81 | gamma=2.0, 82 | alpha=0.25, 83 | reduction='mean', 84 | avg_factor=None): 85 | # Function.apply does not accept keyword arguments, so the decorator 86 | # "weighted_loss" is not applicable 87 | loss = sigmoid_focal_loss_jit(pred, target, gamma=gamma, alpha=alpha) 88 | if weight is not None: 89 | if weight.shape != loss.shape: 90 | if weight.size(0) == loss.size(0): 91 | # For most cases, weight is of shape (num_priors, ), 92 | # which means it does not have the second axis num_class 93 | weight = weight.view(-1, 1) 94 | else: 95 | # Sometimes, weight per anchor per class is also needed. e.g. 96 | # in FSAF. But it may be flattened of shape 97 | # (num_priors x num_class, ), while loss is still of shape 98 | # (num_priors, num_class). 99 | assert weight.numel() == loss.numel() 100 | weight = weight.view(loss.size(0), -1) 101 | assert weight.ndim == loss.ndim 102 | loss = weight_reduce_loss(loss, weight, reduction, avg_factor) 103 | return loss 104 | 105 | 106 | class FocalLoss(nn.Module): 107 | 108 | def __init__(self, 109 | use_sigmoid=True, 110 | gamma=2.0, 111 | alpha=0.25, 112 | reduction='mean', 113 | loss_weight=1.0): 114 | super(FocalLoss, self).__init__() 115 | assert use_sigmoid is True, 'Only sigmoid focal loss supported now.' 
116 | self.use_sigmoid = use_sigmoid 117 | self.gamma = gamma 118 | self.alpha = alpha 119 | self.reduction = reduction 120 | self.loss_weight = loss_weight 121 | 122 | def forward(self, 123 | pred, 124 | target, 125 | weight=None, 126 | avg_factor=None, 127 | reduction_override=None): 128 | assert reduction_override in (None, 'none', 'mean', 'sum') 129 | reduction = ( 130 | reduction_override if reduction_override else self.reduction) 131 | if self.use_sigmoid: 132 | loss_cls = self.loss_weight * sigmoid_focal_loss( 133 | pred, 134 | target, 135 | weight, 136 | gamma=self.gamma, 137 | alpha=self.alpha, 138 | reduction=reduction, 139 | avg_factor=avg_factor) 140 | else: 141 | raise NotImplementedError 142 | return loss_cls 143 | -------------------------------------------------------------------------------- /adet/layers/csrc/ml_nms/ml_nms.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 10 | 11 | __device__ inline float devIoU(float const * const a, float const * const b) { 12 | if (a[5] != b[5]) { 13 | return 0.0; 14 | } 15 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 16 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 17 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 18 | float interS = width * height; 19 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 20 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 21 | return interS / (Sa + Sb - interS); 22 | } 23 | 24 | __global__ void ml_nms_kernel(const int n_boxes, const float nms_overlap_thresh, 25 | const float *dev_boxes, unsigned long long *dev_mask) { 26 | const int row_start = blockIdx.y; 27 | const int col_start = blockIdx.x; 28 | 29 | // if (row_start > col_start) return; 30 | 31 | const int row_size = 32 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 33 | const int col_size = 34 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 35 | 36 | __shared__ float block_boxes[threadsPerBlock * 6]; 37 | if (threadIdx.x < col_size) { 38 | block_boxes[threadIdx.x * 6 + 0] = 39 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0]; 40 | block_boxes[threadIdx.x * 6 + 1] = 41 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1]; 42 | block_boxes[threadIdx.x * 6 + 2] = 43 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2]; 44 | block_boxes[threadIdx.x * 6 + 3] = 45 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3]; 46 | block_boxes[threadIdx.x * 6 + 4] = 47 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4]; 48 | block_boxes[threadIdx.x * 6 + 5] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5]; 50 | } 51 | __syncthreads(); 52 | 53 | if (threadIdx.x < row_size) { 54 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 55 | const float *cur_box = dev_boxes + cur_box_idx * 6; 56 | int i = 0; 57 | unsigned long long t = 0; 58 | int start = 0; 59 | if (row_start == col_start) { 60 | start = threadIdx.x + 1; 61 | } 62 | for (i = start; i < col_size; i++) { 63 | if (devIoU(cur_box, block_boxes + i * 6) > nms_overlap_thresh) { 64 | t |= 1ULL << i; 65 | } 66 | } 67 | const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); 68 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 69 | } 70 | } 71 | 72 | namespace adet { 73 | 74 | // boxes is a N x 6 tensor 75 | at::Tensor 
ml_nms_cuda(const at::Tensor boxes, const float nms_overlap_thresh) { 76 | using scalar_t = float; 77 | AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); 78 | auto scores = boxes.select(1, 4); 79 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 80 | auto boxes_sorted = boxes.index_select(0, order_t); 81 | 82 | int boxes_num = boxes.size(0); 83 | 84 | const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); 85 | 86 | scalar_t* boxes_dev = boxes_sorted.data(); 87 | 88 | THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState 89 | 90 | unsigned long long* mask_dev = NULL; 91 | //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, 92 | // boxes_num * col_blocks * sizeof(unsigned long long))); 93 | 94 | mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); 95 | 96 | dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), 97 | THCCeilDiv(boxes_num, threadsPerBlock)); 98 | dim3 threads(threadsPerBlock); 99 | ml_nms_kernel<<>>(boxes_num, 100 | nms_overlap_thresh, 101 | boxes_dev, 102 | mask_dev); 103 | 104 | std::vector mask_host(boxes_num * col_blocks); 105 | THCudaCheck(cudaMemcpy(&mask_host[0], 106 | mask_dev, 107 | sizeof(unsigned long long) * boxes_num * col_blocks, 108 | cudaMemcpyDeviceToHost)); 109 | 110 | std::vector remv(col_blocks); 111 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 112 | 113 | at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); 114 | int64_t* keep_out = keep.data(); 115 | 116 | int num_to_keep = 0; 117 | for (int i = 0; i < boxes_num; i++) { 118 | int nblock = i / threadsPerBlock; 119 | int inblock = i % threadsPerBlock; 120 | 121 | if (!(remv[nblock] & (1ULL << inblock))) { 122 | keep_out[num_to_keep++] = i; 123 | unsigned long long *p = &mask_host[0] + i * col_blocks; 124 | for (int j = nblock; j < col_blocks; j++) { 125 | remv[j] |= p[j]; 126 | } 127 | } 128 | } 129 | 130 | THCudaFree(state, mask_dev); 131 | // TODO improve this part 132 | return std::get<0>(order_t.index({ 133 | keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( 134 | order_t.device(), keep.scalar_type()) 135 | }).sort(0, false)); 136 | } 137 | 138 | } // namespace adet -------------------------------------------------------------------------------- /adet/config/defaults.py: -------------------------------------------------------------------------------- 1 | from detectron2.config.defaults import _C 2 | from detectron2.config import CfgNode as CN 3 | 4 | # ---------------------------------------------------------------------------- # 5 | # Additional Configs 6 | # ---------------------------------------------------------------------------- # 7 | _C.MODEL.MOBILENET = False 8 | _C.MODEL.BACKBONE.ANTI_ALIAS = False 9 | _C.MODEL.RESNETS.DEFORM_INTERVAL = 1 10 | _C.INPUT.HFLIP_TRAIN = True 11 | _C.INPUT.CROP.CROP_INSTANCE = True 12 | 13 | # ---------------------------------------------------------------------------- # 14 | # Basis Module Options 15 | # ---------------------------------------------------------------------------- # 16 | _C.MODEL.BASIS_MODULE = CN() 17 | _C.MODEL.BASIS_MODULE.NAME = "ProtoNet" 18 | _C.MODEL.BASIS_MODULE.NUM_BASES = 4 19 | _C.MODEL.BASIS_MODULE.LOSS_ON = False 20 | _C.MODEL.BASIS_MODULE.ANN_SET = "coco" 21 | _C.MODEL.BASIS_MODULE.CONVS_DIM = 128 22 | _C.MODEL.BASIS_MODULE.IN_FEATURES = ["p3", "p4", "p5"] 23 | _C.MODEL.BASIS_MODULE.NORM = "SyncBN" 24 | _C.MODEL.BASIS_MODULE.NUM_CONVS = 
3 25 | _C.MODEL.BASIS_MODULE.COMMON_STRIDE = 8 26 | _C.MODEL.BASIS_MODULE.NUM_CLASSES = 80 27 | _C.MODEL.BASIS_MODULE.LOSS_WEIGHT = 0.3 28 | 29 | # ---------------------------------------------------------------------------- # 30 | # OSFormer Options 31 | # ---------------------------------------------------------------------------- # 32 | _C.MODEL.OSFormer = CN() 33 | 34 | # Instance hyper-parameters 35 | _C.MODEL.OSFormer.INSTANCE_IN_FEATURES = ["p2", "p3", "p4", "p5", "p6"] 36 | _C.MODEL.OSFormer.FEAT_INSTANCE_STRIDES = [8, 8, 16, 32, 32] 37 | _C.MODEL.OSFormer.FEAT_SCALE_RANGES = ((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)) 38 | _C.MODEL.OSFormer.SIGMA = 0.2 39 | # Channel size for the instance head. 40 | _C.MODEL.OSFormer.INSTANCE_IN_CHANNELS = 256 41 | _C.MODEL.OSFormer.INSTANCE_CHANNELS = 256 42 | # Convolutions to use in the instance head. 43 | _C.MODEL.OSFormer.NUM_INSTANCE_CONVS = 4 44 | _C.MODEL.OSFormer.USE_DCN_IN_INSTANCE = False 45 | _C.MODEL.OSFormer.TYPE_DCN = 'DCN' 46 | _C.MODEL.OSFormer.NUM_GRIDS = [40, 36, 24, 16, 12] 47 | # Number of foreground classes. 48 | _C.MODEL.OSFormer.NUM_CLASSES = 80 # COCO 49 | _C.MODEL.OSFormer.NUM_KERNELS = 256 50 | _C.MODEL.OSFormer.NORM = "GN" 51 | _C.MODEL.OSFormer.USE_COORD_CONV = True 52 | _C.MODEL.OSFormer.PRIOR_PROB = 0.01 53 | 54 | # Mask hyper-parameters. 55 | # Channel size for the mask tower. 56 | _C.MODEL.OSFormer.MASK_IN_FEATURES = ["p2", "p3", "p4", "p5"] 57 | _C.MODEL.OSFormer.MASK_IN_CHANNELS = 256 58 | _C.MODEL.OSFormer.MASK_CHANNELS = 128 59 | _C.MODEL.OSFormer.NUM_MASKS = 256 60 | 61 | # Test cfg. 62 | # _C.MODEL.OSFormer.CONFIDENCE_SCORE = 0.25 63 | _C.MODEL.OSFormer.NMS_PRE = 500 64 | _C.MODEL.OSFormer.SCORE_THR = 0.1 65 | _C.MODEL.OSFormer.UPDATE_THR = 0.05 66 | _C.MODEL.OSFormer.MASK_THR = 0.5 67 | _C.MODEL.OSFormer.MAX_PER_IMG = 100 68 | _C.MODEL.OSFormer.RESIZE_INPUT_FACTOR = 1 69 | # NMS type: matrix OR mask. 70 | _C.MODEL.OSFormer.NMS_TYPE = "matrix" 71 | # Matrix NMS kernel type: gaussian OR linear. 72 | _C.MODEL.OSFormer.NMS_KERNEL = "gaussian" 73 | _C.MODEL.OSFormer.NMS_SIGMA = 2 74 | 75 | # Loss cfg. 
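# FOCAL_* parameters feed the sigmoid focal loss and DICE_WEIGHT the dice loss defined in
# adet/modeling/osformer/loss.py; SEM_WEIGHT and INS_EDGE_WEIGHT scale the auxiliary semantic
# and instance-edge terms (see the SEM_LOSS / INS_EDGE switches in the structure options below).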
76 | _C.MODEL.OSFormer.LOSS = CN() 77 | _C.MODEL.OSFormer.LOSS.FOCAL_USE_SIGMOID = True 78 | _C.MODEL.OSFormer.LOSS.FOCAL_ALPHA = 0.25 79 | _C.MODEL.OSFormer.LOSS.FOCAL_GAMMA = 2.0 80 | _C.MODEL.OSFormer.LOSS.FOCAL_WEIGHT = 1.0 81 | _C.MODEL.OSFormer.LOSS.DICE_WEIGHT = 3.0 82 | _C.MODEL.OSFormer.LOSS.SEM_WEIGHT = 1.0 83 | _C.MODEL.OSFormer.LOSS.INS_EDGE_WEIGHT = 1.0 84 | _C.MODEL.OSFormer.LOSS.SEM_TYPE = 'dice' 85 | 86 | # Transformer cfg 87 | _C.MODEL.OSFormer.HIDDEN_DIM = 256 88 | _C.MODEL.OSFormer.NUMBER_FEATURE_LEVELS = 5 # P2 P3 P4 P5 P6 89 | _C.MODEL.OSFormer.NHEAD = 8 90 | _C.MODEL.OSFormer.ENC_LAYERS = 6 91 | _C.MODEL.OSFormer.DEC_LAYERS = 6 92 | _C.MODEL.OSFormer.DIM_FEEDFORWARD = 1024 93 | _C.MODEL.OSFormer.ENC_POINTS = 4 94 | 95 | # Structure cfg 96 | _C.MODEL.OSFormer.C2F_MASK = False 97 | _C.MODEL.OSFormer.NOFPN = False 98 | _C.MODEL.OSFormer.SEM_LOSS = False 99 | _C.MODEL.OSFormer.SINGLE_SEM = False 100 | _C.MODEL.OSFormer.INS_EDGE = False 101 | _C.MODEL.OSFormer.FFN = 'default' 102 | _C.MODEL.OSFormer.INS_FUSION = 'default' 103 | _C.MODEL.OSFormer.DCIN_NORM = True 104 | 105 | # Query selection cfg 106 | _C.MODEL.OSFormer.QS = CN() 107 | _C.MODEL.OSFormer.QS.ENABLE = True 108 | _C.MODEL.OSFormer.QS.INPUT = "ENC" # ENC/GRID 109 | _C.MODEL.OSFormer.QS.SHARE_HEAD = False 110 | _C.MODEL.OSFormer.QS.NUM_QUERIES = 300 111 | 112 | # ---------------------------------------------------------------------------- # 113 | # PVT Options 114 | # ---------------------------------------------------------------------------- # 115 | 116 | _C.MODEL.PVTV2 = CN() 117 | _C.MODEL.PVTV2.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 118 | 119 | # ---------------------------------------------------------------------------- # 120 | # SWIN Options 121 | # ---------------------------------------------------------------------------- # 122 | 123 | _C.MODEL.SWIN = CN() 124 | _C.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 125 | _C.MODEL.SWIN.PATCH_SIZE = 4 126 | _C.MODEL.SWIN.EMBED_DIM = 96 127 | _C.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 128 | _C.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 129 | _C.MODEL.SWIN.WINDOW_SIZE = 7 130 | _C.MODEL.SWIN.MLP_RATIO = 4.0 131 | _C.MODEL.SWIN.QKV_BIAS = True 132 | _C.MODEL.SWIN.QK_SCALE = None 133 | _C.MODEL.SWIN.DROP_RATE = 0.0 134 | _C.MODEL.SWIN.ATTN_DROP_RATE = 0.0 135 | _C.MODEL.SWIN.DROP_PATH_RATE = 0.3 136 | _C.MODEL.SWIN.APE = False 137 | _C.MODEL.SWIN.PATCH_NORM = True 138 | _C.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 139 | _C.MODEL.SWIN.USE_CHECKPOINT = False 140 | -------------------------------------------------------------------------------- /adet/modeling/backbone/mobilenet.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from torch import nn 4 | from torch.nn import BatchNorm2d 5 | #from detectron2.layers.batch_norm import NaiveSyncBatchNorm as BatchNorm2d 6 | from detectron2.layers import Conv2d 7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 8 | from detectron2.modeling.backbone import Backbone 9 | 10 | 11 | def conv_bn(inp, oup, stride): 12 | return nn.Sequential( 13 | Conv2d(inp, oup, 3, stride, 1, bias=False), 14 | BatchNorm2d(oup), 15 | nn.ReLU6(inplace=True) 16 | ) 17 | 18 | 19 | def conv_1x1_bn(inp, oup): 20 | return nn.Sequential( 21 | Conv2d(inp, oup, 1, 1, 0, bias=False), 22 | BatchNorm2d(oup), 23 | nn.ReLU6(inplace=True) 24 | ) 25 | 26 | 27 | class InvertedResidual(nn.Module): 28 | def __init__(self, inp, oup, stride, expand_ratio): 29 | super(InvertedResidual, self).__init__() 
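        # MobileNetV2 inverted residual: optional 1x1 expansion -> 3x3 depthwise conv ->
        # 1x1 linear projection; the residual shortcut is used only when stride == 1
        # and the input/output channel counts match.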
30 | self.stride = stride 31 | assert stride in [1, 2] 32 | 33 | hidden_dim = int(round(inp * expand_ratio)) 34 | self.use_res_connect = self.stride == 1 and inp == oup 35 | 36 | if expand_ratio == 1: 37 | self.conv = nn.Sequential( 38 | # dw 39 | Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), 40 | BatchNorm2d(hidden_dim), 41 | nn.ReLU6(inplace=True), 42 | # pw-linear 43 | Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 44 | BatchNorm2d(oup), 45 | ) 46 | else: 47 | self.conv = nn.Sequential( 48 | # pw 49 | Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), 50 | BatchNorm2d(hidden_dim), 51 | nn.ReLU6(inplace=True), 52 | # dw 53 | Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), 54 | BatchNorm2d(hidden_dim), 55 | nn.ReLU6(inplace=True), 56 | # pw-linear 57 | Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 58 | BatchNorm2d(oup), 59 | ) 60 | 61 | def forward(self, x): 62 | if self.use_res_connect: 63 | return x + self.conv(x) 64 | else: 65 | return self.conv(x) 66 | 67 | 68 | class MobileNetV2(Backbone): 69 | """ 70 | Should freeze bn 71 | """ 72 | def __init__(self, cfg, n_class=1000, input_size=224, width_mult=1.): 73 | super(MobileNetV2, self).__init__() 74 | block = InvertedResidual 75 | input_channel = 32 76 | interverted_residual_setting = [ 77 | # t, c, n, s 78 | [1, 16, 1, 1], 79 | [6, 24, 2, 2], 80 | [6, 32, 3, 2], 81 | [6, 64, 4, 2], 82 | [6, 96, 3, 1], 83 | [6, 160, 3, 2], 84 | [6, 320, 1, 1], 85 | ] 86 | 87 | # building first layer 88 | assert input_size % 32 == 0 89 | input_channel = int(input_channel * width_mult) 90 | self.return_features_indices = [3, 6, 13, 17] 91 | self.return_features_num_channels = [] 92 | self.features = nn.ModuleList([conv_bn(3, input_channel, 2)]) 93 | # building inverted residual blocks 94 | for t, c, n, s in interverted_residual_setting: 95 | output_channel = int(c * width_mult) 96 | for i in range(n): 97 | if i == 0: 98 | self.features.append(block(input_channel, output_channel, s, expand_ratio=t)) 99 | else: 100 | self.features.append(block(input_channel, output_channel, 1, expand_ratio=t)) 101 | input_channel = output_channel 102 | if len(self.features) - 1 in self.return_features_indices: 103 | self.return_features_num_channels.append(output_channel) 104 | 105 | self._initialize_weights() 106 | self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_AT) 107 | 108 | def _freeze_backbone(self, freeze_at): 109 | for layer_index in range(freeze_at): 110 | for p in self.features[layer_index].parameters(): 111 | p.requires_grad = False 112 | 113 | def forward(self, x): 114 | res = [] 115 | for i, m in enumerate(self.features): 116 | x = m(x) 117 | if i in self.return_features_indices: 118 | res.append(x) 119 | return {'res{}'.format(i + 2): r for i, r in enumerate(res)} 120 | 121 | def _initialize_weights(self): 122 | for m in self.modules(): 123 | if isinstance(m, Conv2d): 124 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 125 | m.weight.data.normal_(0, (2. / n) ** 0.5) 126 | if m.bias is not None: 127 | m.bias.data.zero_() 128 | elif isinstance(m, BatchNorm2d): 129 | m.weight.data.fill_(1) 130 | m.bias.data.zero_() 131 | elif isinstance(m, nn.Linear): 132 | n = m.weight.size(1) 133 | m.weight.data.normal_(0, 0.01) 134 | m.bias.data.zero_() 135 | 136 | @BACKBONE_REGISTRY.register() 137 | def build_mnv2_backbone(cfg, input_shape): 138 | """ 139 | Create a ResNet instance from config. 140 | 141 | Returns: 142 | ResNet: a :class:`ResNet` instance. 
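    In practice the returned backbone is a MobileNetV2 whose feature maps are
    exposed under the ResNet-style names res2-res5 below.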
143 | """ 144 | out_features = cfg.MODEL.RESNETS.OUT_FEATURES 145 | 146 | out_feature_channels = {"res2": 24, "res3": 32, 147 | "res4": 96, "res5": 320} 148 | out_feature_strides = {"res2": 4, "res3": 8, "res4": 16, "res5": 32} 149 | model = MobileNetV2(cfg) 150 | model._out_features = out_features 151 | model._out_feature_channels = out_feature_channels 152 | model._out_feature_strides = out_feature_strides 153 | return model 154 | -------------------------------------------------------------------------------- /adet/modeling/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | 8 | from __future__ import absolute_import 9 | from __future__ import print_function 10 | from __future__ import division 11 | 12 | import warnings 13 | import math 14 | 15 | import torch 16 | from torch import nn 17 | import torch.nn.functional as F 18 | from torch.nn.init import xavier_uniform_, constant_ 19 | 20 | from ..functions import MSDeformAttnFunction 21 | 22 | 23 | def _is_power_of_2(n): 24 | if (not isinstance(n, int)) or (n < 0): 25 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 26 | return (n & (n-1) == 0) and n != 0 27 | 28 | 29 | class MSDeformAttn(nn.Module): 30 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 31 | """ 32 | Multi-Scale Deformable Attention Module 33 | :param d_model hidden dimension 34 | :param n_levels number of feature levels 35 | :param n_heads number of attention heads 36 | :param n_points number of sampling points per attention head per feature level 37 | """ 38 | super().__init__() 39 | if d_model % n_heads != 0: 40 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 41 | _d_per_head = d_model // n_heads 42 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 43 | if not _is_power_of_2(_d_per_head): 44 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 45 | "which is more efficient in our CUDA implementation.") 46 | 47 | self.im2col_step = 64 48 | 49 | self.d_model = d_model 50 | self.n_levels = n_levels 51 | self.n_heads = n_heads 52 | self.n_points = n_points 53 | 54 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 55 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 56 | self.value_proj = nn.Linear(d_model, d_model) 57 | self.output_proj = nn.Linear(d_model, d_model) 58 | 59 | self._reset_parameters() 60 | 61 | def _reset_parameters(self): 62 | constant_(self.sampling_offsets.weight.data, 0.) 
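        # The bias initialization below places each head's initial sampling points on a
        # distinct direction around the reference point, with the k-th point pushed
        # outward by a factor of (k + 1).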
63 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 64 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 65 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 66 | for i in range(self.n_points): 67 | grid_init[:, :, i, :] *= i + 1 68 | with torch.no_grad(): 69 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 70 | constant_(self.attention_weights.weight.data, 0.) 71 | constant_(self.attention_weights.bias.data, 0.) 72 | xavier_uniform_(self.value_proj.weight.data) 73 | constant_(self.value_proj.bias.data, 0.) 74 | xavier_uniform_(self.output_proj.weight.data) 75 | constant_(self.output_proj.bias.data, 0.) 76 | 77 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 78 | """ 79 | :param query (N, Length_{query}, C) 80 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 81 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 82 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 83 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 84 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 85 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 86 | 87 | :return output (N, Length_{query}, C) 88 | """ 89 | N, Len_q, _ = query.shape 90 | N, Len_in, _ = input_flatten.shape 91 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 92 | 93 | value = self.value_proj(input_flatten) 94 | if input_padding_mask is not None: 95 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 96 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 97 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 98 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 99 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 100 | # N, Len_q, n_heads, n_levels, n_points, 2 101 | if reference_points.shape[-1] == 2: 102 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 103 | sampling_locations = reference_points[:, :, None, :, None, :] \ 104 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 105 | elif reference_points.shape[-1] == 4: 106 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 107 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 108 | else: 109 | raise ValueError( 110 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 111 | output = MSDeformAttnFunction.apply( 112 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 113 | output = self.output_proj(output) 114 | return output 115 | -------------------------------------------------------------------------------- /adet/modeling/osformer/trans_decoder.py: -------------------------------------------------------------------------------- 1 | 
import torch 2 | from torch import nn 3 | from torch.nn.init import xavier_uniform_, constant_, normal_ 4 | 5 | from adet.modeling.ops.modules.ms_deform_attn import MSDeformAttn 6 | from .trans_utils import _get_clones, get_reference_points, with_pos_embed 7 | from .feed_forward import get_ffn 8 | 9 | 10 | class CISTransformerDecoder(nn.Module): 11 | def __init__(self, d_model=256, nhead=8, 12 | num_encoder_layers=6, dim_feedforward=1024, dropout=0.1, 13 | ffn_type="default", num_feature_levels=4, enc_n_points=4): 14 | super().__init__() 15 | 16 | self.d_model = d_model 17 | self.nhead = nhead 18 | 19 | decoder_layer = TransformerDecoderLayer(d_model, dim_feedforward, 20 | dropout, ffn_type, 21 | num_feature_levels, nhead, enc_n_points) 22 | self.decoder = TransformerDecoder(decoder_layer, num_encoder_layers) 23 | self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) 24 | self.reference_points = nn.Linear(d_model, 2) 25 | 26 | self._reset_parameters() 27 | 28 | def _reset_parameters(self): 29 | for p in self.parameters(): 30 | if p.dim() > 1: 31 | nn.init.xavier_uniform_(p) 32 | for m in self.modules(): 33 | if isinstance(m, MSDeformAttn): 34 | m._reset_parameters() 35 | xavier_uniform_(self.reference_points.weight.data, gain=1.0) 36 | constant_(self.reference_points.bias.data, 0.) 37 | normal_(self.level_embed) 38 | 39 | def forward(self, srcs, pos_embeds, memorys=None, pos_memorys=None): 40 | 41 | # prepare input for decoder 42 | src_flatten = [] 43 | memory_flatten = [] 44 | lvl_pos_embed_flatten = [] 45 | lvl_pos_memory_flatten = [] 46 | spatial_shapes = [] 47 | spatial_shape_grids = [] 48 | for lvl, (src, pos_embed, memory, pos_memory) in enumerate(zip(srcs, pos_embeds, memorys, pos_memorys)): 49 | bs1, c1, h1, w1 = src.shape 50 | spatial_shape_src = (h1, w1) 51 | spatial_shape_grids.append(spatial_shape_src) 52 | bs, c, h, w = memory.shape 53 | spatial_shape = (h, w) 54 | spatial_shapes.append(spatial_shape) 55 | src = src.flatten(2).transpose(1, 2) 56 | memory = memory.flatten(2).transpose(1, 2) 57 | pos_embed = pos_embed.flatten(2).transpose(1, 2) 58 | pos_memory = pos_memory.flatten(2).transpose(1, 2) 59 | lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) 60 | lvl_pos_embed_flatten.append(lvl_pos_embed) 61 | lvl_pos_memory = pos_memory + self.level_embed[lvl].view(1, 1, -1) 62 | lvl_pos_memory_flatten.append(lvl_pos_memory) 63 | src_flatten.append(src) 64 | memory_flatten.append(memory) 65 | src_flatten = torch.cat(src_flatten, 1) 66 | memory_flatten = torch.cat(memory_flatten, 1) 67 | lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) 68 | lvl_pos_memory_flatten = torch.cat(lvl_pos_memory_flatten, 1) 69 | spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) 70 | level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) 71 | spatial_shape_grids = torch.as_tensor(spatial_shape_grids, dtype=torch.long, device=src_flatten.device) 72 | level_start_index_grid = torch.cat((spatial_shape_grids.new_zeros((1, )), spatial_shape_grids.prod(1).cumsum(0)[:-1])) 73 | 74 | # decoder 75 | memory = self.decoder(src_flatten, memory_flatten, spatial_shapes, spatial_shape_grids, level_start_index_grid, 76 | level_start_index, lvl_pos_embed_flatten, lvl_pos_memory_flatten) 77 | 78 | return memory, level_start_index 79 | 80 | 81 | class TransformerDecoderLayer(nn.Module): 82 | def __init__(self, 83 | d_model=256, d_ffn=1024, 84 | dropout=0.1, ffn_type="default", 85 | 
n_levels=4, n_heads=8, n_points=4): 86 | super().__init__() 87 | 88 | # self attention 89 | self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) 90 | self.dropout1 = nn.Dropout(dropout) 91 | self.norm1 = nn.LayerNorm(d_model) 92 | 93 | # ffn 94 | self.ffn = get_ffn(d_model, ffn_type) 95 | 96 | def forward(self, src, pos_embed, memorys, pos_memory, reference_points, spatial_shapes, 97 | level_start_index, spatial_shape_grids, level_start_index_grid): 98 | # self attention 99 | src2 = self.self_attn(with_pos_embed(src, pos_embed), reference_points, 100 | with_pos_embed(memorys, pos_memory), spatial_shapes, level_start_index) 101 | src = src + self.dropout1(src2) 102 | src = self.norm1(src) 103 | 104 | # ffn 105 | src = self.ffn(src, spatial_shape_grids, level_start_index_grid) 106 | 107 | return src 108 | 109 | 110 | class TransformerDecoder(nn.Module): 111 | def __init__(self, decoder_layer, num_layers): 112 | super().__init__() 113 | self.layers = _get_clones(decoder_layer, num_layers) 114 | self.num_layers = num_layers 115 | 116 | def forward(self, src, memorys, spatial_shapes, spatial_shape_grids, 117 | level_start_index_grid, level_start_index, pos_embed, pos_memory): 118 | output = src 119 | batch_size = src.shape[0] 120 | reference_points = get_reference_points(spatial_shape_grids, batch_size, device=src.device) 121 | for _, layer in enumerate(self.layers): 122 | output = layer(output, pos_embed, memorys, pos_memory, reference_points, spatial_shapes, 123 | level_start_index, spatial_shape_grids, level_start_index_grid) 124 | 125 | return output 126 | 127 | 128 | def build_transformer_decoder(cfg): 129 | return CISTransformerDecoder( 130 | d_model=cfg.MODEL.OSFormer.HIDDEN_DIM, 131 | nhead=cfg.MODEL.OSFormer.NHEAD, 132 | num_encoder_layers=cfg.MODEL.OSFormer.DEC_LAYERS, 133 | dim_feedforward=cfg.MODEL.OSFormer.DIM_FEEDFORWARD, 134 | dropout=0.1, 135 | ffn_type=cfg.MODEL.OSFormer.FFN, 136 | num_feature_levels=len(cfg.MODEL.OSFormer.FEAT_INSTANCE_STRIDES), 137 | enc_n_points=cfg.MODEL.OSFormer.ENC_POINTS) 138 | -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import multiprocessing as mp 4 | import os 5 | import time 6 | import cv2 7 | import tqdm 8 | 9 | from detectron2.data.detection_utils import read_image 10 | from detectron2.utils.logger import setup_logger 11 | from detectron2.utils.analysis import parameter_count 12 | 13 | from predictor import VisualizationDemo 14 | from adet.config import get_cfg 15 | 16 | # constants 17 | WINDOW_NAME = "COCO detections" 18 | 19 | 20 | def setup_cfg(args): 21 | # load config from file and command-line arguments 22 | cfg = get_cfg() 23 | cfg.merge_from_file(args.config_file) 24 | cfg.merge_from_list(args.opts) 25 | # Set score_threshold for builtin models 26 | cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold 27 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold 28 | cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold 29 | cfg.freeze() 30 | return cfg 31 | 32 | 33 | def get_parser(): 34 | parser = argparse.ArgumentParser(description="Detectron2 Demo") 35 | parser.add_argument( 36 | "--config-file", 37 | default="configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_inference_acc_test.yaml", 38 | metavar="FILE", 39 | help="path to config file", 40 | ) 41 | parser.add_argument("--webcam", action="store_true", 
help="Take inputs from webcam.") 42 | parser.add_argument("--video-input", help="Path to video file.") 43 | parser.add_argument("--input", nargs="+", help="A list of space separated input images") 44 | parser.add_argument( 45 | "--output", 46 | help="A file or directory to save output visualizations. " 47 | "If not given, will show output in an OpenCV window.", 48 | ) 49 | parser.add_argument( 50 | "--sod", 51 | action='store_true' 52 | ) 53 | 54 | parser.add_argument( 55 | "--confidence-threshold", 56 | type=float, 57 | default=0.3, 58 | help="Minimum score for instance predictions to be shown", 59 | ) 60 | parser.add_argument( 61 | "--opts", 62 | help="Modify config options using the command-line 'KEY VALUE' pairs", 63 | default=[], 64 | nargs=argparse.REMAINDER, 65 | ) 66 | return parser 67 | 68 | 69 | if __name__ == "__main__": 70 | mp.set_start_method("spawn", force=True) 71 | args = get_parser().parse_args() 72 | logger = setup_logger() 73 | logger.info("Arguments: " + str(args)) 74 | 75 | cfg = setup_cfg(args) 76 | 77 | demo = VisualizationDemo(cfg) 78 | print('total parameter:', parameter_count(demo.predictor.model)['']) 79 | 80 | if args.input: 81 | if os.path.isdir(args.input[0]): 82 | args.input = [os.path.join(args.input[0], fname) for fname in os.listdir(args.input[0])] 83 | elif len(args.input) == 1: 84 | args.input = glob.glob(os.path.expanduser(args.input[0])) 85 | assert args.input, "The input path(s) was not found" 86 | for path in tqdm.tqdm(args.input, disable=not args.output): 87 | # use PIL, to be consistent with evaluation 88 | img = read_image(path, format="BGR") 89 | start_time = time.time() 90 | if args.sod: 91 | sod = demo.run_on_image_sod(img) 92 | out_filename = os.path.join(args.output, os.path.basename(path)) 93 | cv2.imwrite(out_filename, sod) 94 | continue 95 | predictions, visualized_output = demo.run_on_image(img) 96 | logger.info( 97 | "{}: detected {} instances in {:.2f}s".format( 98 | path, len(predictions["instances"]), time.time() - start_time 99 | ) 100 | ) 101 | 102 | if args.output: 103 | if os.path.isdir(args.output): 104 | assert os.path.isdir(args.output), args.output 105 | out_filename = os.path.join(args.output, os.path.basename(path)) 106 | else: 107 | assert len(args.input) == 1, "Please specify a directory with args.output" 108 | out_filename = args.output 109 | visualized_output.save(out_filename) 110 | if cfg.MODEL.OSFormer.SEM_LOSS: 111 | cv2.imwrite( 112 | os.path.join(args.output, 'edge_' + os.path.basename(path)), 113 | predictions["pred_sems"].squeeze(0).cpu().numpy() * 255) 114 | else: 115 | cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) 116 | if cv2.waitKey(0) == 27: 117 | break # esc to quit 118 | elif args.webcam: 119 | assert args.input is None, "Cannot have both --input and --webcam!" 
120 | cam = cv2.VideoCapture(0) 121 | for vis in tqdm.tqdm(demo.run_on_video(cam)): 122 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 123 | cv2.imshow(WINDOW_NAME, vis) 124 | if cv2.waitKey(1) == 27: 125 | break # esc to quit 126 | cv2.destroyAllWindows() 127 | elif args.video_input: 128 | video = cv2.VideoCapture(args.video_input) 129 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) 130 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 131 | frames_per_second = video.get(cv2.CAP_PROP_FPS) 132 | num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) 133 | basename = os.path.basename(args.video_input) 134 | 135 | if args.output: 136 | if os.path.isdir(args.output): 137 | output_fname = os.path.join(args.output, basename) 138 | output_fname = os.path.splitext(output_fname)[0] + ".mkv" 139 | else: 140 | output_fname = args.output 141 | assert not os.path.isfile(output_fname), output_fname 142 | output_file = cv2.VideoWriter( 143 | filename=output_fname, 144 | # some installation of opencv may not support x264 (due to its license), 145 | # you can try other format (e.g. MPEG) 146 | fourcc=cv2.VideoWriter_fourcc(*"x264"), 147 | fps=float(frames_per_second), 148 | frameSize=(width, height), 149 | isColor=True, 150 | ) 151 | assert os.path.isfile(args.video_input) 152 | for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames): 153 | if args.output: 154 | output_file.write(vis_frame) 155 | else: 156 | cv2.namedWindow(basename, cv2.WINDOW_NORMAL) 157 | cv2.imshow(basename, vis_frame) 158 | if cv2.waitKey(1) == 27: 159 | break # esc to quit 160 | video.release() 161 | if args.output: 162 | output_file.release() 163 | else: 164 | cv2.destroyAllWindows() 165 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OSFormer: One-Stage Camouflaged Instance Segmentation with Transformers (ECCV 2022) 2 | 3 | ![OSFormer](docs/OSFormer.png) 4 | 5 | Official Implementation of "[OSFormer: One-Stage Camouflaged Instance Segmentation with Transformers](https://arxiv.org/abs/2207.02255)" 6 | 7 | [Jialun Pei*](https://scholar.google.com/citations?user=1lPivLsAAAAJ&hl=en), [Tianyang Cheng*](https://github.com/Patrickctyyx), [Deng-Ping Fan](https://dengpingfan.github.io/), [He Tang](https://scholar.google.com/citations?hl=en&user=70XLFUsAAAAJ), Chuanbo Chen, and [Luc Van Gool](https://ee.ethz.ch/the-department/faculty/professors/person-detail.OTAyMzM=.TGlzdC80MTEsMTA1ODA0MjU5.html) 8 | 9 | [[Paper]](https://arxiv.org/abs/2207.02255); [[Chinese Version]](https://dengpingfan.github.io/papers/[2022][ECCV]OSFormer_Chinese.pdf); [[Official Version]](https://link.springer.com/content/pdf/10.1007/978-3-031-19797-0_2.pdf); [[Project Page]](https://blog.patrickcty.cc/OSFormer-Homepage/) 10 | 11 | **Contact:** dengpfan@gmail.com, peijl@hust.edu.cn 12 | 13 | | *Sample 1* | *Sample 2* | *Sample 3* | *Sample 4* | 14 | | :------------------------------: | :-------------------------------: | :-------------------------------: | :-------------------------------: | 15 | | | | | | 16 | 17 | ## Environment preparation 18 | 19 | The code is tested on CUDA 11.1 and pytorch 1.9.0, change the versions below to your desired ones. 
20 | 21 | ```shell 22 | git clone https://github.com/PJLallen/OSFormer.git 23 | cd OSFormer 24 | conda create -n osformer python=3.8 -y 25 | conda activate osformer 26 | conda install pytorch==1.9.0 torchvision cudatoolkit=11.1 -c pytorch -c nvidia -y 27 | python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu111/torch1.9/index.html 28 | python setup.py build develop 29 | ``` 30 | 31 | ## Dataset preparation 32 | 33 | ### Download the datasets 34 | 35 | - **COD10K**: [Baidu](https://pan.baidu.com/s/1IPcPjdg1EJ-h9HPoU42nHA) (password:hust) / [Google](https://drive.google.com/file/d/1YGa3v-MiXy-3MMJDkidLXPt0KQwygt-Z/view?usp=sharing) / [Quark](https://pan.quark.cn/s/07ba3258b777); **JSON files:** [Baidu](https://pan.baidu.com/s/1kRawj-hzBDycCkZZfQjFhg) (password:hust) / [Google](https://drive.google.com/drive/folders/1Yvz63C8c7LOHFRgm06viUM9XupARRPif?usp=sharing) 36 | - **NC4K**: [Baidu](https://pan.baidu.com/s/1li4INx4klQ_j8ftODyw2Zg) (password:hust) / [Google](https://drive.google.com/file/d/1eK_oi-N4Rmo6IIxUNbYHBiNWuDDLGr_k/view?usp=sharing); **JSON files:** [Baidu](https://pan.baidu.com/s/1DBPFtAL2iEjefwiqXE_GWA) (password:hust) / [Google](https://drive.google.com/drive/folders/1LyK7tl2QVZBFiNaWI_n0ZVa0QiwF2B8e?usp=sharing) 37 | 38 | ### Register datasets 39 | 40 | 1. Generate COCO-format annotation files; you may refer to [the tutorial of mmdetection](https://github.com/open-mmlab/mmdetection/blob/master/docs/en/2_new_data_model.md) for some help 41 | 2. Change the dataset and annotation paths in `adet/data/datasets/cis.py`; please refer to [the docs of detectron2](https://detectron2.readthedocs.io/en/latest/) for more help 42 | 43 | ```python 44 | # adet/data/datasets/cis.py 45 | # change the paths 46 | DATASET_ROOT = 'COD10K-v3' 47 | ANN_ROOT = os.path.join(DATASET_ROOT, 'annotations') 48 | TRAIN_PATH = os.path.join(DATASET_ROOT, 'Train/Image') 49 | TEST_PATH = os.path.join(DATASET_ROOT, 'Test/Image') 50 | TRAIN_JSON = os.path.join(ANN_ROOT, 'train_instance.json') 51 | TEST_JSON = os.path.join(ANN_ROOT, 'test2026.json') 52 | 53 | NC4K_ROOT = 'NC4K' 54 | NC4K_PATH = os.path.join(NC4K_ROOT, 'Imgs') 55 | NC4K_JSON = os.path.join(NC4K_ROOT, 'nc4k_test.json') 56 | ``` 57 | 58 | ## Pre-trained models 59 | 60 | Model weights: [Baidu](https://pan.baidu.com/s/1Ao3Myqa6xiA9ymAkZgZOeQ) (password:l6vn) / [Google](https://drive.google.com/drive/folders/1pl9iM1NAfN5N6Voc03oPmlbKJ-YNldMF?usp=sharing) / [Quark](https://pan.quark.cn/s/6676592ff08b) 61 | 62 | | Model | Config | COD10K-test AP | NC4K-test AP | 63 | |:--------------|:------------------------------------------------ |:---------------|:-------------| 64 | | R50-550 | [configs/CIS_RT.yaml](configs/CIS_RT.yaml) | 36.0 | 41.4 | 65 | | R50 | [configs/CIS_R50.yaml](configs/CIS_R50.yaml) | 41.0 | 42.5 | 66 | | R101 | [configs/CIS_R101.yaml](configs/CIS_R101.yaml) | 42.0 | 44.4 | 67 | | PVTv2-B2-Li | [configs/CIS_PVTv2B2Li.yaml](configs/CIS_PVTv2B2Li.yaml) | 47.2 | 50.5 | 68 | | SWIN-T | [configs/CIS_SWINT.yaml](configs/CIS_SWINT.yaml) | 47.7 | 50.2 | 69 | 70 | ## Visualization results 71 | 72 | The visualization results are produced by our OSFormer with ResNet-50 trained on the COD10K training set.
73 | 74 | - Results on the COD10K test set: [Baidu](https://pan.baidu.com/s/16xH7coaGoOGiB5x1AXgy5w) (password:hust) / 75 | [Google](https://drive.google.com/open?id=16XMw6NaiCQdHG1By-1a7s8SmnyEqlmYD) 76 | - Results on the NC4K test set: [Baidu](https://pan.baidu.com/s/15Y-7fNcHRhu38Vjybx1HMg) (password:hust) / 77 | [Google](https://drive.google.com/file/d/1cRcwbD3Y3fMO3n7eTtA6VGZWKCWwJSU0/view?usp=sharing) 78 | 79 | ## Frequently asked questions 80 | 81 | [FAQ](https://github.com/PJLallen/OSFormer/blob/main/docs/faq.md) 82 | 83 | ## Usage 84 | 85 | ### Train 86 | 87 | ```shell 88 | python tools/train_net.py --config-file configs/CIS_R50.yaml --num-gpus 1 \ 89 | OUTPUT_DIR {PATH_TO_OUTPUT_DIR} 90 | ``` 91 | 92 | Please replace `{PATH_TO_OUTPUT_DIR}` with your own output directory 93 | 94 | ### Inference 95 | 96 | ```shell 97 | python tools/train_net.py --config-file configs/CIS_R50.yaml --eval-only \ 98 | MODEL.WEIGHTS {PATH_TO_PRE_TRAINED_WEIGHTS} 99 | ``` 100 | 101 | Please replace `{PATH_TO_PRE_TRAINED_WEIGHTS}` with the path to the pre-trained weights 102 | 103 | ### Eval 104 | 105 | ```shell 106 | python demo/demo.py --config-file configs/CIS_R50.yaml \ 107 | --input {PATH_TO_THE_IMG_DIR_OR_FILE} \ 108 | --output {PATH_TO_SAVE_DIR_OR_IMAGE_FILE} \ 109 | --opts MODEL.WEIGHTS {PATH_TO_PRE_TRAINED_WEIGHTS} 110 | ``` 111 | 112 | - `{PATH_TO_THE_IMG_DIR_OR_FILE}`: an image directory or one or more image paths 113 | - `{PATH_TO_SAVE_DIR_OR_IMAGE_FILE}`: the directory or file where the visualizations will be saved 114 | - `{PATH_TO_PRE_TRAINED_WEIGHTS}`: the path to the pre-trained weights (a concrete example invocation is sketched below, after the Acknowledgement section) 115 | 116 | 117 | ## Acknowledgement 118 | 119 | This work is based on: 120 | - [detectron2](https://github.com/facebookresearch/detectron2) 121 | - [AdelaiDet](https://github.com/aim-uofa/AdelaiDet) 122 | - [DETR](https://github.com/facebookresearch/detr) 123 | - [Deformable DETR](https://github.com/fundamentalvision/Deformable-DETR) 124 | 125 | We also got help from [mmdetection](https://github.com/open-mmlab/mmdetection). We thank them all for their great work!
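For concreteness, an end-to-end visualization run with the R50 model might look like the sketch below. The weight file `osformer_r50.pth` and the output directory `vis_out` are illustrative placeholders only (they are not shipped with this repo); the input directory follows the `COD10K-v3/Test/Image` layout assumed in `adet/data/datasets/cis.py`.

```shell
# Hypothetical paths for illustration only; adjust them to your own data and weights.
mkdir -p vis_out
python demo/demo.py --config-file configs/CIS_R50.yaml \
    --input COD10K-v3/Test/Image \
    --output vis_out \
    --confidence-threshold 0.3 \
    --opts MODEL.WEIGHTS osformer_r50.pth
```

When `--output` points to an existing directory, one visualization per input image is written there; without `--output`, results are shown in an OpenCV window instead.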
126 | 127 | ## Citation 128 | 129 | If this helps you, please cite this work: 130 | 131 | ``` 132 | @inproceedings{pei2022osformer, 133 | title={OSFormer: One-Stage Camouflaged Instance Segmentation with Transformers}, 134 | author={Pei, Jialun and Cheng, Tianyang and Fan, Deng-Ping and Tang, He and Chen, Chuanbo and Van Gool, Luc}, 135 | booktitle={European conference on computer vision}, 136 | year={2022}, 137 | organization={Springer} 138 | } 139 | ``` 140 | -------------------------------------------------------------------------------- /tools/train_net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from collections import OrderedDict 4 | 5 | import detectron2.utils.comm as comm 6 | from detectron2.data import MetadataCatalog, build_detection_train_loader 7 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch 8 | from detectron2.utils.events import EventStorage 9 | from detectron2.evaluation import ( 10 | COCOEvaluator, 11 | COCOPanopticEvaluator, 12 | DatasetEvaluators, 13 | LVISEvaluator, 14 | PascalVOCDetectionEvaluator, 15 | SemSegEvaluator, 16 | verify_results, 17 | ) 18 | from detectron2.modeling import GeneralizedRCNNWithTTA 19 | 20 | from detectron2.utils.logger import setup_logger 21 | from adet.data.dataset_mapper import DatasetMapperWithBasis 22 | from adet.config import get_cfg 23 | from adet.checkpoint import AdetCheckpointer 24 | from adet.evaluation import TextEvaluator 25 | from adet.data.datasets.cis import register_dataset 26 | 27 | 28 | class Trainer(DefaultTrainer): 29 | """ 30 | This is the same Trainer except that we rewrite the 31 | `build_train_loader` method. 32 | """ 33 | 34 | def train_loop(self, start_iter: int, max_iter: int): 35 | """ 36 | Args: 37 | start_iter, max_iter (int): See docs above 38 | """ 39 | logger = logging.getLogger("adet.trainer") 40 | logger.info("Starting training from iteration {}".format(start_iter)) 41 | 42 | self.iter = self.start_iter = start_iter 43 | self.max_iter = max_iter 44 | 45 | with EventStorage(start_iter) as self.storage: 46 | self.before_train() 47 | for self.iter in range(start_iter, max_iter): 48 | self.before_step() 49 | self.run_step() 50 | self.after_step() 51 | self.after_train() 52 | 53 | def train(self): 54 | """ 55 | Run training. 56 | 57 | Returns: 58 | OrderedDict of results, if evaluation is enabled. Otherwise None. 59 | """ 60 | self.train_loop(self.start_iter, self.max_iter) 61 | if hasattr(self, "_last_eval_results") and comm.is_main_process(): 62 | verify_results(self.cfg, self._last_eval_results) 63 | return self._last_eval_results 64 | 65 | @classmethod 66 | def build_train_loader(cls, cfg): 67 | """ 68 | Returns: 69 | iterable 70 | 71 | It calls :func:`detectron2.data.build_detection_train_loader` with a customized 72 | DatasetMapper, which adds categorical labels as a semantic mask. 73 | """ 74 | mapper = DatasetMapperWithBasis(cfg, True) 75 | return build_detection_train_loader(cfg, mapper=mapper) 76 | 77 | @classmethod 78 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 79 | """ 80 | Create evaluator(s) for a given dataset. 81 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 82 | For your own dataset, you can simply create an evaluator manually in your 83 | script and do not have to worry about the hacky if-else logic here. 
84 | """ 85 | if output_folder is None: 86 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 87 | evaluator_list = [] 88 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 89 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 90 | evaluator_list.append( 91 | SemSegEvaluator( 92 | dataset_name, 93 | distributed=True, 94 | num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 95 | ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 96 | output_dir=output_folder, 97 | ) 98 | ) 99 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 100 | evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder)) 101 | if evaluator_type == "coco_panoptic_seg": 102 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 103 | if evaluator_type == "pascal_voc": 104 | return PascalVOCDetectionEvaluator(dataset_name) 105 | if evaluator_type == "lvis": 106 | return LVISEvaluator(dataset_name, cfg, True, output_folder) 107 | if evaluator_type == "text": 108 | return TextEvaluator(dataset_name, cfg, True, output_folder) 109 | if len(evaluator_list) == 0: 110 | raise NotImplementedError( 111 | "no Evaluator for the dataset {} with the type {}".format( 112 | dataset_name, evaluator_type 113 | ) 114 | ) 115 | if len(evaluator_list) == 1: 116 | return evaluator_list[0] 117 | return DatasetEvaluators(evaluator_list) 118 | 119 | @classmethod 120 | def test_with_TTA(cls, cfg, model): 121 | logger = logging.getLogger("adet.trainer") 122 | # In the end of training, run an evaluation with TTA 123 | # Only support some R-CNN models. 124 | logger.info("Running inference with test-time augmentation ...") 125 | model = GeneralizedRCNNWithTTA(cfg, model) 126 | evaluators = [ 127 | cls.build_evaluator( 128 | cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") 129 | ) 130 | for name in cfg.DATASETS.TEST 131 | ] 132 | res = cls.test(cfg, model, evaluators) 133 | res = OrderedDict({k + "_TTA": v for k, v in res.items()}) 134 | return res 135 | 136 | 137 | def setup(args): 138 | """ 139 | Create configs and perform basic setups. 140 | """ 141 | cfg = get_cfg() 142 | cfg.merge_from_file(args.config_file) 143 | cfg.merge_from_list(args.opts) 144 | cfg.freeze() 145 | default_setup(cfg, args) 146 | 147 | rank = comm.get_rank() 148 | setup_logger(cfg.OUTPUT_DIR, distributed_rank=rank, name="adet") 149 | 150 | return cfg 151 | 152 | 153 | def main(args): 154 | 155 | cfg = setup(args) 156 | register_dataset() 157 | 158 | if args.eval_only: 159 | model = Trainer.build_model(cfg) 160 | AdetCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 161 | cfg.MODEL.WEIGHTS, resume=args.resume 162 | ) 163 | res = Trainer.test(cfg, model) # d2 defaults.py 164 | if comm.is_main_process(): 165 | verify_results(cfg, res) 166 | if cfg.TEST.AUG.ENABLED: 167 | res.update(Trainer.test_with_TTA(cfg, model)) 168 | return res 169 | 170 | """ 171 | If you'd like to do anything fancier than the standard training logic, 172 | consider writing your own training loop or subclassing the trainer. 
173 | """ 174 | trainer = Trainer(cfg) 175 | trainer.resume_or_load(resume=args.resume) 176 | if cfg.TEST.AUG.ENABLED: 177 | trainer.register_hooks( 178 | [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))] 179 | ) 180 | return trainer.train() 181 | 182 | 183 | if __name__ == "__main__": 184 | args = default_argument_parser().parse_args() 185 | print("Command Line Args:", args) 186 | launch( 187 | main, 188 | args.num_gpus, 189 | num_machines=args.num_machines, 190 | machine_rank=args.machine_rank, 191 | dist_url=args.dist_url, 192 | args=(args,), 193 | ) 194 | -------------------------------------------------------------------------------- /adet/modeling/ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | Taken from https://github.com/fundamentalvision/Deformable-DETR 4 | ************************************************************************************************** 5 | * Deformable DETR 6 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | #include "cuda/ms_deform_im2col_cuda.cuh" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | 20 | at::Tensor ms_deform_attn_cuda_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step) 27 | { 28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 33 | 34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 39 | 40 | const int batch = value.size(0); 41 | const int spatial_size = value.size(1); 42 | const int num_heads = value.size(2); 43 | const int channels = value.size(3); 44 | 45 | const int num_levels = spatial_shapes.size(0); 46 | 47 | const int num_query = sampling_loc.size(1); 48 | const int num_point = sampling_loc.size(4); 49 | 50 | const int im2col_step_ = std::min(batch, im2col_step); 51 | 52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 53 | 54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 55 | 56 | const int batch_n = im2col_step_; 57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 58 | auto per_value_size = spatial_size * num_heads * channels; 59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 60 | auto 
per_attn_weight_size = num_query * num_heads * num_levels * num_point; 61 | for (int n = 0; n < batch/im2col_step_; ++n) 62 | { 63 | auto columns = output_n.select(0, n); 64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 66 | value.data() + n * im2col_step_ * per_value_size, 67 | spatial_shapes.data(), 68 | level_start_index.data(), 69 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 70 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 72 | columns.data()); 73 | 74 | })); 75 | } 76 | 77 | output = output.view({batch, num_query, num_heads*channels}); 78 | 79 | return output; 80 | } 81 | 82 | 83 | std::vector ms_deform_attn_cuda_backward( 84 | const at::Tensor &value, 85 | const at::Tensor &spatial_shapes, 86 | const at::Tensor &level_start_index, 87 | const at::Tensor &sampling_loc, 88 | const at::Tensor &attn_weight, 89 | const at::Tensor &grad_output, 90 | const int im2col_step) 91 | { 92 | 93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 99 | 100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 106 | 107 | const int batch = value.size(0); 108 | const int spatial_size = value.size(1); 109 | const int num_heads = value.size(2); 110 | const int channels = value.size(3); 111 | 112 | const int num_levels = spatial_shapes.size(0); 113 | 114 | const int num_query = sampling_loc.size(1); 115 | const int num_point = sampling_loc.size(4); 116 | 117 | const int im2col_step_ = std::min(batch, im2col_step); 118 | 119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 120 | 121 | auto grad_value = at::zeros_like(value); 122 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 123 | auto grad_attn_weight = at::zeros_like(attn_weight); 124 | 125 | const int batch_n = im2col_step_; 126 | auto per_value_size = spatial_size * num_heads * channels; 127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 130 | 131 | for (int n = 0; n < batch/im2col_step_; ++n) 132 | { 133 | auto grad_output_g = grad_output_n.select(0, n); 134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 136 | 
grad_output_g.data(), 137 | value.data() + n * im2col_step_ * per_value_size, 138 | spatial_shapes.data(), 139 | level_start_index.data(), 140 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 141 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 143 | grad_value.data() + n * im2col_step_ * per_value_size, 144 | grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 145 | grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); 146 | 147 | })); 148 | } 149 | 150 | return { 151 | grad_value, grad_sampling_loc, grad_attn_weight 152 | }; 153 | } -------------------------------------------------------------------------------- /adet/utils/measures.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import unicode_literals 5 | from __future__ import print_function 6 | from __future__ import division 7 | 8 | import operator 9 | 10 | from functools import reduce 11 | 12 | 13 | def get_num_gen(gen): 14 | return sum(1 for x in gen) 15 | 16 | 17 | def is_pruned(layer): 18 | try: 19 | layer.mask 20 | return True 21 | except AttributeError: 22 | return False 23 | 24 | 25 | def is_leaf(model): 26 | return get_num_gen(model.children()) == 0 27 | 28 | 29 | def get_layer_info(layer): 30 | layer_str = str(layer) 31 | type_name = layer_str[:layer_str.find('(')].strip() 32 | return type_name 33 | 34 | 35 | def get_layer_param(model): 36 | return sum([reduce(operator.mul, i.size(), 1) for i in model.parameters()]) 37 | 38 | 39 | ### The input batch size should be 1 to call this function 40 | def measure_layer(layer, *args): 41 | global count_ops, count_params 42 | 43 | for x in args: 44 | delta_ops = 0 45 | delta_params = 0 46 | multi_add = 1 47 | type_name = get_layer_info(layer) 48 | 49 | ### ops_conv 50 | if type_name in ['Conv2d']: 51 | out_h = int((x.size()[2] + 2 * layer.padding[0] / layer.dilation[0] - layer.kernel_size[0]) / 52 | layer.stride[0] + 1) 53 | out_w = int((x.size()[3] + 2 * layer.padding[1] / layer.dilation[1] - layer.kernel_size[1]) / 54 | layer.stride[1] + 1) 55 | delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add 56 | delta_params = get_layer_param(layer) 57 | 58 | elif type_name in ['ConvTranspose2d']: 59 | _, _, in_h, in_w = x.size() 60 | out_h = int((in_h-1)*layer.stride[0] - 2 * layer.padding[0] + layer.kernel_size[0] + layer.output_padding[0]) 61 | out_w = int((in_w-1)*layer.stride[1] - 2 * layer.padding[1] + layer.kernel_size[1] + layer.output_padding[1]) 62 | delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * \ 63 | layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add 64 | delta_params = get_layer_param(layer) 65 | 66 | ### ops_learned_conv 67 | elif type_name in ['LearnedGroupConv']: 68 | measure_layer(layer.relu, x) 69 | measure_layer(layer.norm, x) 70 | conv = layer.conv 71 | out_h = int((x.size()[2] + 2 * conv.padding[0] - conv.kernel_size[0]) / 72 | conv.stride[0] + 1) 73 | out_w = int((x.size()[3] + 2 * conv.padding[1] - conv.kernel_size[1]) / 74 | conv.stride[1] + 1) 75 | delta_ops = conv.in_channels * conv.out_channels * conv.kernel_size[0] * conv.kernel_size[1] * out_h * out_w / layer.condense_factor * multi_add 76 | delta_params = get_layer_param(conv) / layer.condense_factor 77 | 78 | ### 
ops_nonlinearity 79 | elif type_name in ['ReLU', 'ReLU6']: 80 | delta_ops = x.numel() 81 | delta_params = get_layer_param(layer) 82 | 83 | ### ops_pooling 84 | elif type_name in ['AvgPool2d', 'MaxPool2d']: 85 | in_w = x.size()[2] 86 | kernel_ops = layer.kernel_size * layer.kernel_size 87 | out_w = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1) 88 | out_h = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1) 89 | delta_ops = x.size()[0] * x.size()[1] * out_w * out_h * kernel_ops 90 | delta_params = get_layer_param(layer) 91 | 92 | elif type_name in ['LastLevelMaxPool']: 93 | pass 94 | 95 | elif type_name in ['AdaptiveAvgPool2d']: 96 | delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3] 97 | delta_params = get_layer_param(layer) 98 | 99 | elif type_name in ['ZeroPad2d', 'RetinaNetPostProcessor']: 100 | pass 101 | #delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3] 102 | #delta_params = get_layer_param(layer) 103 | 104 | ### ops_linear 105 | elif type_name in ['Linear']: 106 | weight_ops = layer.weight.numel() * multi_add 107 | # bias_ops = layer.bias.numel() 108 | # delta_ops = x.size()[0] * (weight_ops + bias_ops) 109 | delta_ops = x.size()[0] * (weight_ops ) 110 | delta_params = get_layer_param(layer) 111 | 112 | ### ops_nothing 113 | elif type_name in ['BatchNorm2d', 'Dropout2d', 'DropChannel', 'Dropout', 'FrozenBatchNorm2d', 'GroupNorm']: 114 | delta_params = get_layer_param(layer) 115 | 116 | elif type_name in ['SumTwo']: 117 | delta_ops = x.numel() 118 | 119 | elif type_name in ['AggregateCell']: 120 | if not layer.pre_transform: 121 | delta_ops = 2 * x.numel() # twice for each input 122 | else: 123 | measure_layer(layer.branch_1, x) 124 | measure_layer(layer.branch_2, x) 125 | delta_params = get_layer_param(layer) 126 | 127 | elif type_name in ['Identity', 'Zero']: 128 | pass 129 | 130 | elif type_name in ['Scale']: 131 | delta_params = get_layer_param(layer) 132 | delta_ops = x.numel() 133 | 134 | elif type_name in ['FCOSPostProcessor', 'RPNPostProcessor', 'KeypointPostProcessor', 135 | 'ROIAlign', 'PostProcessor', 'KeypointRCNNPredictor', 136 | 'NaiveSyncBatchNorm', 'Upsample', 'Sequential']: 137 | pass 138 | 139 | elif type_name in ['DeformConv']: 140 | # don't count bilinear 141 | offset_conv = list(layer.parameters())[0] 142 | delta_ops = reduce(operator.mul, offset_conv.size(), x.size()[2] * x.size()[3]) 143 | out_h = int((x.size()[2] + 2 * layer.padding[0] / layer.dilation[0] 144 | - layer.kernel_size[0]) / layer.stride[0] + 1) 145 | out_w = int((x.size()[3] + 2 * layer.padding[1] / layer.dilation[1] 146 | - layer.kernel_size[1]) / layer.stride[1] + 1) 147 | delta_ops += layer.in_channels * layer.out_channels * layer.kernel_size[0] * layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add 148 | delta_params = get_layer_param(layer) 149 | 150 | ### unknown layer type 151 | else: 152 | raise TypeError('unknown layer type: %s' % type_name) 153 | 154 | count_ops += delta_ops 155 | count_params += delta_params 156 | return 157 | 158 | 159 | def measure_model(model, x): 160 | global count_ops, count_params 161 | count_ops = 0 162 | count_params = 0 163 | 164 | def should_measure(x): 165 | return is_leaf(x) or is_pruned(x) 166 | 167 | def modify_forward(model): 168 | for child in model.children(): 169 | if should_measure(child): 170 | def new_forward(m): 171 | def lambda_forward(*args): 172 | measure_layer(m, *args) 173 | return m.old_forward(*args) 174 | return lambda_forward 175 | child.old_forward = 
child.forward 176 | child.forward = new_forward(child) 177 | else: 178 | modify_forward(child) 179 | 180 | def restore_forward(model): 181 | for child in model.children(): 182 | # leaf node 183 | if is_leaf(child) and hasattr(child, 'old_forward'): 184 | child.forward = child.old_forward 185 | child.old_forward = None 186 | else: 187 | restore_forward(child) 188 | 189 | modify_forward(model) 190 | out = model.forward(x) 191 | restore_forward(model) 192 | 193 | return out, count_ops, count_params 194 | -------------------------------------------------------------------------------- /adet/data/dataset_mapper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import os.path as osp 4 | 5 | import numpy as np 6 | import torch 7 | from fvcore.common.file_io import PathManager 8 | from PIL import Image 9 | from pycocotools import mask as maskUtils 10 | 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.data.dataset_mapper import DatasetMapper 14 | from detectron2.data.detection_utils import SizeMismatchError 15 | from detectron2.structures import BoxMode 16 | 17 | from .augmentation import RandomCropWithInstance 18 | from .detection_utils import (annotations_to_instances, build_augmentation, 19 | transform_instance_annotations) 20 | 21 | """ 22 | This file contains the default mapping that's applied to "dataset dicts". 23 | """ 24 | 25 | __all__ = ["DatasetMapperWithBasis"] 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def segmToRLE(segm, img_size): 31 | h, w = img_size 32 | if type(segm) == list: 33 | # polygon -- a single object might consist of multiple parts 34 | # we merge all parts into one mask rle code 35 | rles = maskUtils.frPyObjects(segm, h, w) 36 | rle = maskUtils.merge(rles) 37 | elif type(segm["counts"]) == list: 38 | # uncompressed RLE 39 | rle = maskUtils.frPyObjects(segm, h, w) 40 | else: 41 | # rle 42 | rle = segm 43 | return rle 44 | 45 | 46 | def segmToMask(segm, img_size): 47 | rle = segmToRLE(segm, img_size) 48 | m = maskUtils.decode(rle) 49 | return m 50 | 51 | 52 | class DatasetMapperWithBasis(DatasetMapper): 53 | """ 54 | This caller enables the default Detectron2 mapper to read an additional basis semantic label 55 | """ 56 | 57 | def __init__(self, cfg, is_train=True): 58 | super().__init__(cfg, is_train) 59 | 60 | # Rebuild augmentations 61 | logger.info( 62 | "Rebuilding the augmentations. The previous augmentations will be overridden." 63 | ) 64 | self.augmentation = build_augmentation(cfg, is_train) 65 | 66 | if cfg.INPUT.CROP.ENABLED and is_train: 67 | self.augmentation.insert( 68 | 0, 69 | RandomCropWithInstance( 70 | cfg.INPUT.CROP.TYPE, 71 | cfg.INPUT.CROP.SIZE, 72 | cfg.INPUT.CROP.CROP_INSTANCE, 73 | ), 74 | ) 75 | logging.getLogger(__name__).info( 76 | "Cropping used in training: " + str(self.augmentation[0]) 77 | ) 78 | 79 | # fmt: off 80 | self.basis_loss_on = cfg.MODEL.BASIS_MODULE.LOSS_ON 81 | self.ann_set = cfg.MODEL.BASIS_MODULE.ANN_SET 82 | # fmt: on 83 | 84 | def __call__(self, dataset_dict): 85 | """ 86 | Args: 87 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
88 | 89 | Returns: 90 | dict: a format that builtin models in detectron2 accept 91 | """ 92 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 93 | # USER: Write your own image loading if it's not from a file 94 | try: 95 | image = utils.read_image( 96 | dataset_dict["file_name"], format=self.image_format 97 | ) 98 | except Exception as e: 99 | print(dataset_dict["file_name"]) 100 | print(e) 101 | raise e 102 | try: 103 | utils.check_image_size(dataset_dict, image) 104 | except SizeMismatchError as e: 105 | expected_wh = (dataset_dict["width"], dataset_dict["height"]) 106 | image_wh = (image.shape[1], image.shape[0]) 107 | if (image_wh[1], image_wh[0]) == expected_wh: 108 | print("transposing image {}".format(dataset_dict["file_name"])) 109 | image = image.transpose(1, 0, 2) 110 | else: 111 | raise e 112 | 113 | # USER: Remove if you don't do semantic/panoptic segmentation. 114 | if "sem_seg_file_name" in dataset_dict: 115 | sem_seg_gt = utils.read_image( 116 | dataset_dict.pop("sem_seg_file_name"), "L" 117 | ).squeeze(2) 118 | else: 119 | sem_seg_gt = None 120 | 121 | boxes = np.asarray( 122 | [ 123 | BoxMode.convert( 124 | instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS 125 | ) 126 | for instance in dataset_dict["annotations"] 127 | ] 128 | ) 129 | aug_input = T.StandardAugInput(image, boxes=boxes, sem_seg=sem_seg_gt) 130 | transforms = aug_input.apply_augmentations(self.augmentation) 131 | image, sem_seg_gt = aug_input.image, aug_input.sem_seg 132 | 133 | image_shape = image.shape[:2] # h, w 134 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 135 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 136 | # Therefore it's important to use torch.Tensor. 137 | dataset_dict["image"] = torch.as_tensor( 138 | np.ascontiguousarray(image.transpose(2, 0, 1)) 139 | ) 140 | if sem_seg_gt is not None: 141 | dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long")) 142 | 143 | # USER: Remove if you don't use pre-computed proposals. 144 | # Most users would not need this feature. 145 | if self.proposal_topk: 146 | utils.transform_proposals( 147 | dataset_dict, 148 | image_shape, 149 | transforms, 150 | proposal_topk=self.proposal_topk, 151 | min_box_size=self.proposal_min_box_size, 152 | ) 153 | 154 | if not self.is_train: 155 | dataset_dict.pop("annotations", None) 156 | dataset_dict.pop("sem_seg_file_name", None) 157 | dataset_dict.pop("pano_seg_file_name", None) 158 | return dataset_dict 159 | 160 | if "annotations" in dataset_dict: 161 | # USER: Modify this if you want to keep them for some reason. 162 | for anno in dataset_dict["annotations"]: 163 | if not self.use_instance_mask: 164 | anno.pop("segmentation", None) 165 | if not self.use_keypoint: 166 | anno.pop("keypoints", None) 167 | 168 | # USER: Implement additional transformations if you have other types of data 169 | annos = [ 170 | transform_instance_annotations( 171 | obj, 172 | transforms, 173 | image_shape, 174 | keypoint_hflip_indices=self.keypoint_hflip_indices, 175 | ) 176 | for obj in dataset_dict.pop("annotations") 177 | if obj.get("iscrowd", 0) == 0 178 | ] 179 | instances = annotations_to_instances( 180 | annos, image_shape, mask_format=self.instance_mask_format 181 | ) 182 | 183 | # After transforms such as cropping are applied, the bounding box may no longer 184 | # tightly bound the object. As an example, imagine a triangle object 185 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). 
The tight 186 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to the intersection of the original bounding box and the cropping box. 187 | if self.recompute_boxes: 188 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 189 | dataset_dict["instances"] = utils.filter_empty_instances(instances) 190 | 191 | if self.basis_loss_on and self.is_train: 192 | # load basis supervisions 193 | if self.ann_set == "coco": 194 | basis_sem_path = ( 195 | dataset_dict["file_name"] 196 | .replace("train2017", "thing_train2017") 197 | .replace("image/train", "thing_train") 198 | ) 199 | else: 200 | basis_sem_path = ( 201 | dataset_dict["file_name"] 202 | .replace("coco", "lvis") 203 | .replace("train2017", "thing_train") 204 | ) 205 | # change extension to npz 206 | basis_sem_path = osp.splitext(basis_sem_path)[0] + ".npz" 207 | basis_sem_gt = np.load(basis_sem_path)["mask"] 208 | basis_sem_gt = transforms.apply_segmentation(basis_sem_gt) 209 | basis_sem_gt = torch.as_tensor(basis_sem_gt.astype("long")) 210 | dataset_dict["basis_sem"] = basis_sem_gt 211 | return dataset_dict 212 | --------------------------------------------------------------------------------