├── adet ├── utils │ ├── __init__.py │ ├── comm.py │ ├── visualizer.py │ └── measures.py ├── structures │ ├── __init__.py │ └── beziers.py ├── __init__.py ├── modeling │ ├── osformer │ │ ├── __init__.py │ │ ├── instance_fusion.py │ │ ├── trans_utils.py │ │ ├── feed_forward.py │ │ ├── position_encoding.py │ │ ├── trans_encoder.py │ │ ├── loss.py │ │ └── trans_decoder.py │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── fpn.py │ │ ├── lpf.py │ │ ├── resnet_interval.py │ │ └── mobilenet.py │ └── ops │ │ ├── make.sh │ │ ├── modules │ │ ├── __init__.py │ │ └── ms_deform_attn.py │ │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ │ ├── src │ │ ├── vision.cpp │ │ ├── cuda │ │ │ ├── ms_deform_attn_cuda.h │ │ │ └── ms_deform_attn_cuda.cu │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.h │ │ │ └── ms_deform_attn_cpu.cpp │ │ └── ms_deform_attn.h │ │ ├── setup.py │ │ └── test.py ├── config │ ├── __init__.py │ ├── config.py │ └── defaults.py ├── checkpoint │ ├── __init__.py │ └── adet_checkpoint.py ├── evaluation │ └── __init__.py ├── layers │ ├── csrc │ │ ├── cuda_version.cu │ │ ├── ml_nms │ │ │ ├── ml_nms.h │ │ │ └── ml_nms.cu │ │ ├── vision.cpp │ │ ├── DefROIAlign │ │ │ └── DefROIAlign.h │ │ └── BezierAlign │ │ │ └── BezierAlign.h │ ├── __init__.py │ ├── ml_nms.py │ ├── conv_with_kaiming_uniform.py │ ├── iou_loss.py │ ├── gcn.py │ ├── naive_group_norm.py │ ├── bezier_align.py │ ├── def_roi_align.py │ └── deform_conv.py └── data │ ├── __init__.py │ ├── datasets │ └── cis.py │ ├── builtin.py │ ├── detection_utils.py │ ├── augmentation.py │ └── dataset_mapper.py ├── docs ├── 488.gif ├── 4126.gif ├── OSFormer.png ├── logo_osformer.png ├── COD10K-CAM-3-Flying-65-Owl-4620.gif ├── COD10K-CAM-3-Flying-53-Bird-3024.gif ├── .gitattributes └── faq.md ├── .gitignore ├── configs ├── CIS_R101.yaml ├── CIS_R50.yaml ├── CIS_RT.yaml ├── CIS_PVTv2B2Li.yaml ├── CIS_SWINT.yaml └── Base-CIS.yaml ├── requirements.txt ├── tools ├── img2gif.py ├── compare_ap.py ├── csv2txt.py ├── eval_single.py ├── combine_vis.py ├── plot_utils.py ├── visualize_data.py ├── visualize_feat.py └── train_net.py ├── demo ├── vis_pred_json.py └── demo.py ├── setup.py └── README.md /adet/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /adet/structures/__init__.py: -------------------------------------------------------------------------------- 1 | from .beziers import Beziers -------------------------------------------------------------------------------- /adet/__init__.py: -------------------------------------------------------------------------------- 1 | from adet import modeling 2 | 3 | __version__ = "0.1.1" 4 | -------------------------------------------------------------------------------- /adet/modeling/osformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .osformer import OSFormer 2 | 3 | -------------------------------------------------------------------------------- /docs/488.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PJLallen/OSFormer/HEAD/docs/488.gif -------------------------------------------------------------------------------- /docs/4126.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PJLallen/OSFormer/HEAD/docs/4126.gif 
-------------------------------------------------------------------------------- /docs/OSFormer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PJLallen/OSFormer/HEAD/docs/OSFormer.png -------------------------------------------------------------------------------- /docs/logo_osformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PJLallen/OSFormer/HEAD/docs/logo_osformer.png -------------------------------------------------------------------------------- /adet/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import get_cfg 2 | 3 | __all__ = [ 4 | "get_cfg", 5 | ] 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea 3 | build 4 | output 5 | *.ipynb 6 | *.egg-info 7 | .ipynb_checkpoints 8 | *.so 9 | -------------------------------------------------------------------------------- /adet/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | from .adet_checkpoint import AdetCheckpointer 2 | 3 | __all__ = ["AdetCheckpointer"] 4 | -------------------------------------------------------------------------------- /docs/COD10K-CAM-3-Flying-65-Owl-4620.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PJLallen/OSFormer/HEAD/docs/COD10K-CAM-3-Flying-65-Owl-4620.gif -------------------------------------------------------------------------------- /docs/COD10K-CAM-3-Flying-53-Bird-3024.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PJLallen/OSFormer/HEAD/docs/COD10K-CAM-3-Flying-53-Bird-3024.gif -------------------------------------------------------------------------------- /adet/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .text_evaluation import TextEvaluator 2 | from .text_eval_script import text_eval_main 3 | from . 
import rrc_evaluation_funcs -------------------------------------------------------------------------------- /configs/CIS_R101.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CIS.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | -------------------------------------------------------------------------------- /configs/CIS_R50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CIS.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | RESNETS: 5 | DEPTH: 50 6 | -------------------------------------------------------------------------------- /adet/layers/csrc/cuda_version.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace adet { 4 | int get_cudart_version() { 5 | return CUDART_VERSION; 6 | } 7 | } // namespace adet 8 | -------------------------------------------------------------------------------- /configs/CIS_RT.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "./CIS_R50.yaml" 2 | INPUT: 3 | MIN_SIZE_TEST: 550 4 | MIN_SIZE_TRAIN: (550,) 5 | MODEL: 6 | OSFormer: 7 | ENC_LAYERS: 3 8 | DEC_LAYERS: 3 9 | -------------------------------------------------------------------------------- /adet/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import builtin # ensure the builtin datasets are registered 2 | from .dataset_mapper import DatasetMapperWithBasis 3 | 4 | 5 | __all__ = ["DatasetMapperWithBasis"] 6 | -------------------------------------------------------------------------------- /adet/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_fcos_resnet_fpn_backbone 2 | from .osformer import OSFormer 3 | 4 | _EXCLUDE = {"torch", "ShapeSpec"} 5 | __all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")] 6 | -------------------------------------------------------------------------------- /configs/CIS_PVTv2B2Li.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CIS.yaml" 2 | MODEL: 3 | WEIGHTS: "" # download weights from https://github.com/whai362/PVT 4 | BACKBONE: 5 | NAME: "build_pvt_v2_b2_li" 6 | PVTV2: 7 | OUT_FEATURES: [ "res2", "res3", "res4", "res5" ] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools==59.5.0 2 | termcolor>=1.1 3 | Pillow>=6.0 4 | yacs>=0.1.6 5 | tabulate 6 | cloudpickle 7 | matplotlib 8 | tqdm>4.29.0 9 | tensorboard 10 | python-Levenshtein 11 | Polygon3 12 | shapely 13 | kornia==0.6.8 14 | opencv-python 15 | timm 16 | -------------------------------------------------------------------------------- /adet/config/config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode 2 | 3 | 4 | def get_cfg() -> CfgNode: 5 | """ 6 | Get a copy of the default config. 7 | 8 | Returns: 9 | a detectron2 CfgNode instance. 
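Typical usage (mirroring tools/eval_single.py): cfg = get_cfg(); cfg.merge_from_file(config_file); cfg.freeze().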
10 | """ 11 | from .defaults import _C 12 | 13 | return _C.clone() 14 | -------------------------------------------------------------------------------- /adet/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .fpn import build_fcos_resnet_fpn_backbone 2 | from .dla import build_fcos_dla_fpn_backbone 3 | from .resnet_lpf import build_resnet_lpf_backbone 4 | from .bifpn import build_fcos_resnet_bifpn_backbone 5 | from .pvt_v2 import build_pvt_v2_b2_li, build_pvt_v2_fpn_b2_li 6 | from .swin import D2SwinTransformer 7 | -------------------------------------------------------------------------------- /adet/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .deform_conv import DFConv2d 2 | from .ml_nms import ml_nms 3 | from .iou_loss import IOULoss 4 | from .conv_with_kaiming_uniform import conv_with_kaiming_uniform 5 | from .bezier_align import BezierAlign 6 | from .def_roi_align import DefROIAlign 7 | from .naive_group_norm import NaiveGroupNorm 8 | from .gcn import GCN 9 | 10 | __all__ = [k for k in globals().keys() if not k.startswith("_")] -------------------------------------------------------------------------------- /adet/modeling/ops/make.sh: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | 8 | python setup.py build install 9 | -------------------------------------------------------------------------------- /adet/modeling/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------\ 7 | 8 | from .ms_deform_attn import MSDeformAttn 9 | -------------------------------------------------------------------------------- /adet/modeling/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | 8 | from .ms_deform_attn_func import MSDeformAttnFunction 9 | 10 | -------------------------------------------------------------------------------- /configs/CIS_SWINT.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CIS.yaml" 2 | MODEL: 3 | WEIGHTS: "" # download weights from https://github.com/microsoft/Swin-Transformer 4 | BACKBONE: 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 96 8 | DEPTHS: [ 2, 2, 6, 2 ] 9 | NUM_HEADS: [ 3, 6, 12, 24 ] 10 | WINDOW_SIZE: 7 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PIXEL_MEAN: [ 123.675, 116.280, 103.530 ] 15 | PIXEL_STD: [ 58.395, 57.120, 57.375 ] 16 | INPUT: 17 | MIN_SIZE_TRAIN: (800,) 18 | MIN_SIZE_TEST: 800 -------------------------------------------------------------------------------- /docs/.gitattributes: -------------------------------------------------------------------------------- 1 | faq.md filter=lfs diff=lfs merge=lfs -text 2 | logo_osformer.png filter=lfs diff=lfs merge=lfs -text 3 | OSFormer.png filter=lfs diff=lfs merge=lfs -text 4 | 4126.gif filter=lfs diff=lfs merge=lfs -text 5 | COD10K-CAM-3-Flying-53-Bird-3024.gif filter=lfs diff=lfs merge=lfs -text 6 | ECCV'22[[:space:]]poster-1317.pdf filter=lfs diff=lfs merge=lfs -text 7 | 488.gif filter=lfs diff=lfs merge=lfs -text 8 | COD10K-CAM-3-Flying-65-Owl-4620.gif filter=lfs diff=lfs merge=lfs -text 9 | ECCV'22[[:space:]]video-1317.mp4 filter=lfs diff=lfs merge=lfs -text 10 | -------------------------------------------------------------------------------- /adet/modeling/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | Taken from https://github.com/fundamentalvision/Deformable-DETR 4 | ************************************************************************************************** 5 | * Deformable DETR 6 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
7 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /adet/layers/csrc/ml_nms/ml_nms.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace adet { 5 | 6 | 7 | #ifdef WITH_CUDA 8 | at::Tensor ml_nms_cuda( 9 | const at::Tensor dets, 10 | const float threshold); 11 | #endif 12 | 13 | at::Tensor ml_nms(const at::Tensor& dets, 14 | const at::Tensor& scores, 15 | const at::Tensor& labels, 16 | const float threshold) { 17 | 18 | if (dets.type().is_cuda()) { 19 | #ifdef WITH_CUDA 20 | // TODO raise error if not compiled with CUDA 21 | if (dets.numel() == 0) 22 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 23 | auto b = at::cat({dets, scores.unsqueeze(1), labels.unsqueeze(1)}, 1); 24 | return ml_nms_cuda(b, threshold); 25 | #else 26 | AT_ERROR("Not compiled with GPU support"); 27 | #endif 28 | } 29 | AT_ERROR("CPU version not implemented"); 30 | } 31 | 32 | } // namespace adet 33 | -------------------------------------------------------------------------------- /adet/layers/ml_nms.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import batched_nms 2 | 3 | 4 | def ml_nms(boxlist, nms_thresh, max_proposals=-1, 5 | score_field="scores", label_field="labels"): 6 | """ 7 | Performs non-maximum suppression on a boxlist, with scores specified 8 | in a boxlist field via score_field. 9 | 10 | Args: 11 | boxlist (detectron2.structures.Boxes): 12 | nms_thresh (float): 13 | max_proposals (int): if > 0, then only the top max_proposals are kept 14 | after non-maximum suppression 15 | score_field (str): 16 | """ 17 | if nms_thresh <= 0: 18 | return boxlist 19 | boxes = boxlist.pred_boxes.tensor 20 | scores = boxlist.scores 21 | labels = boxlist.pred_classes 22 | keep = batched_nms(boxes, scores, labels, nms_thresh) 23 | if max_proposals > 0: 24 | keep = keep[: max_proposals] 25 | boxlist = boxlist[keep] 26 | return boxlist 27 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Dataset settings 4 | 5 | Following [SINet](https://github.com/DengPingFan/SINet) and other previous COD works, the original COD10K contains both camouflaged and non-camouflaged images. We only use camouflaged images with instance-level labels for training (3,040) and testing (2,026). 6 | We have uploaded the 3,040 training and 2,026 testing images of COD10K to Baidu/[Google](https://drive.google.com/file/d/1YGa3v-MiXy-3MMJDkidLXPt0KQwygt-Z/view?usp=sharing)/[Quark](https://pan.quark.cn/s/07ba3258b777).
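If you want to verify the counts in the table below on your own copy, a minimal check with pycocotools works; the JSON paths here are the defaults hard-coded in `adet/data/datasets/cis.py`, so adjust them to wherever you unpacked the dataset:

```python
# Count images and instances in the CIS annotation files
# (paths follow the defaults in adet/data/datasets/cis.py).
from pycocotools.coco import COCO

for split, json_file in [
    ("COD10K-Train", "COD10K-v3/annotations/train_instance.json"),
    ("COD10K-Test", "COD10K-v3/annotations/test2026.json"),
]:
    coco = COCO(json_file)
    print(split, len(coco.getImgIds()), "images,", len(coco.getAnnIds()), "instances")
```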
7 | 8 | | Dataset | CAM (instance-level) | NonCAM | Total | 9 | | ---- | ---- | ---- | ---- | 10 | | COD10K-Train | 3040 | 2960 | 6000 | 11 | | COD10K-Test | 2026 | 1974 | 4000 | 12 | 13 | ## Paper link 14 | 15 | Arxiv: https://arxiv.org/abs/2207.02255 16 | 17 | ## Initial weights for PVT and Swin 18 | 19 | To fit detectron2 framework, we add prefix to the key of pth. See https://github.com/PJLallen/OSFormer/issues/4 for details. 20 | -------------------------------------------------------------------------------- /tools/img2gif.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from PIL import Image 4 | 5 | STEP = 8 6 | BEZEL = 5 # pixel numbers 7 | DURATION = 200 8 | 9 | def img2gif(img_og, img_pred, out_name): 10 | im1 = cv2.imread(img_og) 11 | im2 = cv2.imread(img_pred) 12 | 13 | h, w, _ = im1.shape 14 | step_size = w // STEP 15 | res_list = [] 16 | 17 | for i in range(STEP): 18 | tmp = np.ones_like(im1) * 255 19 | tmp[:, :(i + 1) * step_size - BEZEL] = im1[:, :(i + 1) * step_size - BEZEL] 20 | tmp[:, (i + 1) * step_size:] = im2[:, (i + 1) * step_size:] 21 | res_list.append(Image.fromarray(tmp[...,::-1].astype('uint8')).convert('RGB')) 22 | 23 | img = res_list[0] # extract first image from iterator 24 | img.save(fp=out_name, format='GIF', append_images=res_list, 25 | save_all=True, duration=200, loop=0) 26 | 27 | 28 | if __name__ == '__main__': 29 | img2gif( 30 | '../test_00000003.jpg', 31 | '../test_00000003.png', 32 | '../output.gif') 33 | -------------------------------------------------------------------------------- /adet/modeling/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | Taken from https://github.com/fundamentalvision/Deformable-DETR 4 | ************************************************************************************************** 5 | * Deformable DETR 6 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /adet/modeling/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | Taken from https://github.com/fundamentalvision/Deformable-DETR 4 | ************************************************************************************************** 5 | * Deformable DETR 6 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
7 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /configs/Base-CIS.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "OSFormer" 3 | MASK_ON: True 4 | BACKBONE: 5 | NAME: "build_resnet_backbone" 6 | RESNETS: 7 | OUT_FEATURES: [ "res2", "res3", "res4", "res5" ] 8 | BASIS_MODULE: 9 | NUM_CLASSES: 1 10 | OSFormer: 11 | NUM_CLASSES: 1 12 | INSTANCE_IN_FEATURES: [ "res3", "res4", "res5" ] 13 | MASK_IN_FEATURES: [ "res2", "trans3", "trans4", "trans5" ] 14 | FEAT_INSTANCE_STRIDES: [ 8, 16, 32 ] 15 | FEAT_SCALE_RANGES: ((1, 192), (96, 384), (192, 2048)) 16 | NUM_GRIDS: [ 36, 24, 16 ] 17 | INS_FUSION: 'camin' 18 | DCIN_NORM: False 19 | NOFPN: True 20 | SEM_LOSS: True 21 | C2F_MASK: True 22 | ENC_LAYERS: 6 23 | DEC_LAYERS: 3 24 | LOSS: 25 | SEM_WEIGHT: 1.0 26 | SEM_TYPE: 'dice' 27 | SEM_SEG_HEAD: 28 | NUM_CLASSES: 1 29 | DATASETS: 30 | TRAIN: ("my_data_train_coco_cod_style",) 31 | TEST: ("my_data_test_coco_cod_style", "my_data_test_coco_nc4k_style") 32 | SOLVER: 33 | BASE_LR: 0.00025 34 | WEIGHT_DECAY: 0.0001 35 | STEPS: (60000, 80000) 36 | MAX_ITER: 90000 37 | IMS_PER_BATCH: 2 38 | WARMUP_FACTOR: 0.01 39 | WARMUP_ITERS: 1000 40 | INPUT: 41 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 42 | MIN_SIZE_TEST: 800 43 | MASK_FORMAT: "bitmask" 44 | VERSION: 2 45 | 46 | 47 | -------------------------------------------------------------------------------- /adet/modeling/osformer/instance_fusion.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class DCIN(nn.Module): 5 | def __init__(self, num_kernels, norm): 6 | super(DCIN, self).__init__() 7 | 8 | self.affine_scale = nn.Linear(num_kernels, num_kernels, bias=True) 9 | self.affine_bias = nn.Linear(num_kernels, 1, bias=True) 10 | self.norm = norm 11 | 12 | def forward(self, mask_features, kernel_features): 13 | """ 14 | mask_features: shape of (1, c, w, h) 15 | kernel_features: shape of (n, c) 16 | 17 | return: shape of (1, n, w, h) 18 | """ 19 | kernel_w = self.affine_scale(kernel_features) # (n, c) 20 | kernel_b = self.affine_bias(kernel_features) # (n, 1) 21 | bs, c, w, h = mask_features.shape 22 | x = mask_features.view((bs, c, -1)) # (bs, c, k) 23 | if self.norm: 24 | x_mean = x.mean(2, keepdim=True) # (bs, c, 1) 25 | x_centered = x - x_mean # (bs, c, k) 26 | # add 1e-10 to avoid NaN 27 | x_std_rev = ((x_centered * x_centered).mean(2, keepdim=True) + 1e-10).rsqrt() # (bs, c, 1) 28 | x_norm = x_centered * x_std_rev # (bs, c, k) 29 | else: 30 | x_norm = x 31 | 32 | return (kernel_w.matmul(x_norm) + kernel_b).view((bs, -1, w, h)) 33 | 
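A quick, self-contained shape check for DCIN (not part of the repo; the sizes are arbitrary, and note that num_kernels must equal the channel dimension c of the mask features), confirming the (1, n, w, h) output described in the docstring above:

import torch
from adet.modeling.osformer.instance_fusion import DCIN

c, n, w, h = 256, 5, 32, 32                # channels, instances, spatial size
dcin = DCIN(num_kernels=c, norm=True)
mask_feats = torch.randn(1, c, w, h)       # (1, c, w, h) unified mask features
kernel_feats = torch.randn(n, c)           # (n, c) dynamic instance kernels
out = dcin(mask_feats, kernel_feats)       # one mask logit map per kernel
print(out.shape)                           # torch.Size([1, 5, 32, 32])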
-------------------------------------------------------------------------------- /adet/modeling/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | Taken from https://github.com/fundamentalvision/Deformable-DETR 4 | ************************************************************************************************** 5 | * Deformable DETR 6 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /adet/modeling/osformer/trans_utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | from torch import nn 4 | 5 | 6 | def _get_clones(module, n): 7 | return nn.ModuleList([copy.deepcopy(module) for _ in range(n)]) 8 | 9 | 10 | def with_pos_embed(tensor, pos): 11 | return tensor if pos is None else tensor + pos 12 | 13 | 14 | def get_reference_points(spatial_shapes, batch_size, device): 15 | reference_points_list = [] # [(2, 7832, 2), (2, 1980, 2), (2, 506, 2), (2, 132, 2)] 16 | for lvl, (H_, W_) in enumerate(spatial_shapes): 17 | # (H_, W_), (H_, W_) 18 | ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), 19 | torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)) 20 | ref_y = ref_y.reshape(-1).expand((batch_size, H_ * W_)) / H_ # (2, H_ * W_) 21 | ref_x = ref_x.reshape(-1).expand((batch_size, H_ * W_)) / W_ # (2, H_ * W_) 22 | ref = torch.stack((ref_x, ref_y), -1) 23 | reference_points_list.append(ref) # (2, H_ * W_, 2) 24 | reference_points = torch.cat(reference_points_list, 1) # (2, 10540, 2) 25 | # reference_points[:, :, None] (2, 10450, 1, 2) valid_ratios[:, None] (2, 1, 4, 2) 26 | _, xx, yy = reference_points.shape 27 | reference_points = reference_points[:, :, None].expand((batch_size, xx, lvl + 1, yy)) # (2, 10540, 4, 2) 28 | return reference_points 29 | -------------------------------------------------------------------------------- /tools/compare_ap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from collections import defaultdict, OrderedDict 5 | 6 | 7 | basedir = 'SOTA' 8 | base_method = 'osformer' 9 | datasets = ['cod', 'nc4k'] 10 | file_template = 'my_data_test_coco_{}_style_ap.json' 11 | 12 | 13 | for dataset in datasets: 14 | 15 | with open(os.path.join(basedir, base_method, 
file_template.format(dataset)), 'r') as f: 16 | ours = json.load(f) 17 | print(base_method, len(ours.keys())) 18 | 19 | delta_dict = defaultdict(list) 20 | 21 | for method in os.listdir(basedir): 22 | if method == base_method or not os.path.isdir(os.path.join(basedir, method)): 23 | continue 24 | 25 | with open(os.path.join(basedir, method, file_template.format(dataset)), 'r') as f: 26 | other = json.load(f) 27 | print(method, len(other.keys())) 28 | 29 | for k, v in ours.items(): 30 | our_ap = float(ours[k]['AP']) 31 | other_ap = float(other[k]['AP']) if other.get(k) else 0 32 | if np.isnan(other_ap): 33 | other_ap = 0 34 | delta_dict[k].append(our_ap - other_ap) 35 | 36 | od = [] 37 | for k, v in delta_dict.items(): 38 | print(k, v) 39 | od.append((k, np.mean(v))) 40 | 41 | od.sort(key=lambda x: x[1], reverse=True) 42 | res = OrderedDict() 43 | for elem in od: 44 | res[elem[0]] = elem[1] 45 | 46 | with open(os.path.join(basedir, 'desc_res_{}.json').format(dataset), 'w') as f: 47 | json.dump(res, f, indent=4) 48 | 49 | -------------------------------------------------------------------------------- /tools/csv2txt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from collections import OrderedDict 4 | 5 | 6 | def csv2tex(csv_path, out_path=None): 7 | if out_path is None: 8 | out_path = csv_path.replace('csv', '.txt') 9 | res = OrderedDict() 10 | res_og = OrderedDict() 11 | with open(csv_path, 'r') as f: 12 | flag = False 13 | for idx, line in enumerate(f): 14 | data_list = line.replace('\n', '').split(',') 15 | for ydx, elem in enumerate(data_list): 16 | if data_list[ydx] != '': 17 | try: 18 | if 'Max' in data_list[ydx]: 19 | flt = float(data_list[ydx].replace('Max', '')) 20 | data_list[ydx] = '\\textbf{' + '{:>4.1f}'.format(flt) + '}' 21 | else: 22 | flt = float(data_list[ydx]) 23 | data_list[ydx] = '{:>4.1f}'.format(flt) 24 | except Exception as e: 25 | pass 26 | if data_list[0] == '': 27 | if flag is False: 28 | flag = True 29 | continue 30 | break 31 | flag = False 32 | res[data_list[0]] = ' & '.join(data_list[1:]) 33 | res_og[data_list[0]] = data_list[1:] 34 | 35 | with open(out_path, 'w') as f: 36 | for k, v in res.items(): 37 | f.write('{:<15} & '.format(k) + v + ' \\\\ \n') 38 | 39 | 40 | if __name__ == '__main__': 41 | csv2tex('OSFormer-NoVal-Attribute.csv') 42 | 43 | -------------------------------------------------------------------------------- /adet/utils/comm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.distributed as dist 4 | 5 | from detectron2.utils.comm import get_world_size 6 | 7 | 8 | def reduce_sum(tensor): 9 | world_size = get_world_size() 10 | if world_size < 2: 11 | return tensor 12 | tensor = tensor.clone() 13 | dist.all_reduce(tensor, op=dist.ReduceOp.SUM) 14 | return tensor 15 | 16 | 17 | def aligned_bilinear(tensor, factor): 18 | assert tensor.dim() == 4 19 | assert factor >= 1 20 | assert int(factor) == factor 21 | 22 | if factor == 1: 23 | return tensor 24 | 25 | h, w = tensor.size()[2:] 26 | tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate") 27 | oh = factor * h + 1 28 | ow = factor * w + 1 29 | tensor = F.interpolate( 30 | tensor, size=(oh, ow), 31 | mode='bilinear', 32 | align_corners=True 33 | ) 34 | tensor = F.pad( 35 | tensor, pad=(factor // 2, 0, factor // 2, 0), 36 | mode="replicate" 37 | ) 38 | 39 | return tensor[:, :, :oh - 1, :ow - 1] 40 | 41 | 42 | def 
compute_locations(h, w, stride, device): 43 | shifts_x = torch.arange( 44 | 0, w * stride, step=stride, 45 | dtype=torch.float32, device=device 46 | ) 47 | shifts_y = torch.arange( 48 | 0, h * stride, step=stride, 49 | dtype=torch.float32, device=device 50 | ) 51 | shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) 52 | shift_x = shift_x.reshape(-1) 53 | shift_y = shift_y.reshape(-1) 54 | locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2 55 | return locations 56 | -------------------------------------------------------------------------------- /adet/checkpoint/adet_checkpoint.py: -------------------------------------------------------------------------------- 1 | import pickle, os 2 | from fvcore.common.file_io import PathManager 3 | from detectron2.checkpoint import DetectionCheckpointer 4 | 5 | 6 | class AdetCheckpointer(DetectionCheckpointer): 7 | """ 8 | Same as :class:`DetectronCheckpointer`, but is able to convert models 9 | in AdelaiDet, such as LPF backbone. 10 | """ 11 | def _load_file(self, filename): 12 | if filename.endswith(".pkl"): 13 | with PathManager.open(filename, "rb") as f: 14 | data = pickle.load(f, encoding="latin1") 15 | if "model" in data and "__author__" in data: 16 | # file is in Detectron2 model zoo format 17 | self.logger.info("Reading a file from '{}'".format(data["__author__"])) 18 | return data 19 | else: 20 | # assume file is from Caffe2 / Detectron1 model zoo 21 | if "blobs" in data: 22 | # Detection models have "blobs", but ImageNet models don't 23 | data = data["blobs"] 24 | data = {k: v for k, v in data.items() if not k.endswith("_momentum")} 25 | if "weight_order" in data: 26 | del data["weight_order"] 27 | return {"model": data, "__author__": "Caffe2", "matching_heuristics": True} 28 | 29 | loaded = super()._load_file(filename) # load native pth checkpoint 30 | if "model" not in loaded: 31 | loaded = {"model": loaded} 32 | 33 | basename = os.path.basename(filename).lower() 34 | if "lpf" in basename or "dla" in basename: 35 | loaded["matching_heuristics"] = True 36 | return loaded 37 | -------------------------------------------------------------------------------- /adet/data/datasets/cis.py: -------------------------------------------------------------------------------- 1 | import os 2 | from detectron2.data.datasets.coco import load_coco_json 3 | from detectron2.data import MetadataCatalog, DatasetCatalog 4 | 5 | DATASET_ROOT = 'COD10K-v3' 6 | ANN_ROOT = os.path.join(DATASET_ROOT, 'annotations') 7 | TRAIN_PATH = os.path.join(DATASET_ROOT, 'Train/Image') 8 | TEST_PATH = os.path.join(DATASET_ROOT, 'Test/Image') 9 | TRAIN_JSON = os.path.join(ANN_ROOT, 'train_instance.json') 10 | TEST_JSON = os.path.join(ANN_ROOT, 'test2026.json') 11 | 12 | NC4K_ROOT = 'NC4K' 13 | NC4K_PATH = os.path.join(NC4K_ROOT, 'Imgs') 14 | NC4K_JSON = os.path.join(NC4K_ROOT, 'nc4k_test.json') 15 | 16 | CLASS_NAMES = ["foreground"] 17 | 18 | PREDEFINED_SPLITS_DATASET = { 19 | "my_data_train_coco_cod_style": (TRAIN_PATH, TRAIN_JSON), 20 | "my_data_test_coco_cod_style": (TEST_PATH, TEST_JSON), 21 | "my_data_test_coco_nc4k_style": (NC4K_PATH, NC4K_JSON), 22 | } 23 | 24 | 25 | def register_dataset(): 26 | """ 27 | purpose: register all splits of dataset with PREDEFINED_SPLITS_DATASET 28 | """ 29 | for key, (image_root, json_file) in PREDEFINED_SPLITS_DATASET.items(): 30 | register_dataset_instances(name=key, 31 | json_file=json_file, 32 | image_root=image_root) 33 | 34 | 35 | def register_dataset_instances(name, json_file, image_root): 36 | """ 37 | purpose: register 
dataset to DatasetCatalog, 38 | register metadata to MetadataCatalog and set attribute 39 | """ 40 | DatasetCatalog.register(name, lambda: load_coco_json(json_file, image_root, name)) 41 | MetadataCatalog.get(name).set(json_file=json_file, 42 | image_root=image_root, 43 | evaluator_type="coco") -------------------------------------------------------------------------------- /adet/layers/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "ml_nms/ml_nms.h" 3 | #include "DefROIAlign/DefROIAlign.h" 4 | #include "BezierAlign/BezierAlign.h" 5 | 6 | namespace adet { 7 | 8 | #ifdef WITH_CUDA 9 | extern int get_cudart_version(); 10 | #endif 11 | 12 | std::string get_cuda_version() { 13 | #ifdef WITH_CUDA 14 | std::ostringstream oss; 15 | 16 | // copied from 17 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 18 | auto printCudaStyleVersion = [&](int v) { 19 | oss << (v / 1000) << "." << (v / 10 % 100); 20 | if (v % 10 != 0) { 21 | oss << "." << (v % 10); 22 | } 23 | }; 24 | printCudaStyleVersion(get_cudart_version()); 25 | return oss.str(); 26 | #else 27 | return std::string("not available"); 28 | #endif 29 | } 30 | 31 | // similar to 32 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp 33 | std::string get_compiler_version() { 34 | std::ostringstream ss; 35 | #if defined(__GNUC__) 36 | #ifndef __clang__ 37 | { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } 38 | #endif 39 | #endif 40 | 41 | #if defined(__clang_major__) 42 | { 43 | ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." 44 | << __clang_patchlevel__; 45 | } 46 | #endif 47 | 48 | #if defined(_MSC_VER) 49 | { ss << "MSVC " << _MSC_FULL_VER; } 50 | #endif 51 | return ss.str(); 52 | } 53 | 54 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 55 | m.def("ml_nms", &ml_nms, "Multi-Label NMS"); 56 | m.def("def_roi_align_forward", &DefROIAlign_forward, "def_roi_align_forward"); 57 | m.def("def_roi_align_backward", &DefROIAlign_backward, "def_roi_align_backward"); 58 | m.def("bezier_align_forward", &BezierAlign_forward, "bezier_align_forward"); 59 | m.def("bezier_align_backward", &BezierAlign_backward, "bezier_align_backward"); 60 | } 61 | 62 | } // namespace adet 63 | -------------------------------------------------------------------------------- /adet/structures/beziers.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import torch 3 | 4 | 5 | class Beziers: 6 | """ 7 | This structure stores a list of bezier curves as a Nx16 torch.Tensor. 8 | It will support some common methods about bezier shapes 9 | (`area`, `clip`, `nonempty`, etc), 10 | and also behaves like a Tensor 11 | (support indexing, `to(device)`, `.device`, and iteration over all beziers) 12 | 13 | Attributes: 14 | tensor (torch.Tensor): float matrix of Nx16. Each row stores the 8 control points (x1, y1, ..., x8, y8) of a bezier curve. 15 | """ 16 | 17 | def __init__(self, tensor: torch.Tensor): 18 | """ 19 | Args: 20 | tensor (Tensor[float]): a Nx16 matrix. Each row is the 8 control points (x1, y1, ..., x8, y8) of a bezier curve.
21 | """ 22 | device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu") 23 | tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) 24 | if tensor.numel() == 0: 25 | # Use reshape, so we don't end up creating a new tensor that does not depend on 26 | # the inputs (and consequently confuses jit) 27 | tensor = tensor.reshape((0, 16)).to(dtype=torch.float32, device=device) 28 | assert tensor.dim() == 2 and tensor.size(-1) == 16, tensor.size() 29 | 30 | self.tensor = tensor 31 | 32 | def to(self, device: str) -> "Beziers": 33 | return Beziers(self.tensor.to(device)) 34 | 35 | def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Beziers": 36 | """ 37 | Returns: 38 | Beziers: Create a new :class:`Beziers` by indexing. 39 | """ 40 | if isinstance(item, int): 41 | return Beziers(self.tensor[item].view(1, -1)) 42 | b = self.tensor[item] 43 | assert b.dim() == 2, "Indexing on Boxes with {} failed to return a matrix!".format(item) 44 | return Beziers(b) -------------------------------------------------------------------------------- /adet/layers/conv_with_kaiming_uniform.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from detectron2.layers import Conv2d 4 | from .deform_conv import DFConv2d 5 | from detectron2.layers.batch_norm import get_norm 6 | 7 | 8 | def conv_with_kaiming_uniform( 9 | norm=None, activation=None, 10 | use_deformable=False, use_sep=False): 11 | def make_conv( 12 | in_channels, out_channels, kernel_size, stride=1, dilation=1 13 | ): 14 | if use_deformable: 15 | conv_func = DFConv2d 16 | else: 17 | conv_func = Conv2d 18 | if use_sep: 19 | assert in_channels == out_channels 20 | groups = in_channels 21 | else: 22 | groups = 1 23 | conv = conv_func( 24 | in_channels, 25 | out_channels, 26 | kernel_size=kernel_size, 27 | stride=stride, 28 | padding=dilation * (kernel_size - 1) // 2, 29 | dilation=dilation, 30 | groups=groups, 31 | bias=(norm is None) 32 | ) 33 | if not use_deformable: 34 | # Caffe2 implementation uses XavierFill, which in fact 35 | # corresponds to kaiming_uniform_ in PyTorch 36 | nn.init.kaiming_uniform_(conv.weight, a=1) 37 | if norm is None: 38 | nn.init.constant_(conv.bias, 0) 39 | module = [conv,] 40 | if norm is not None and len(norm) > 0: 41 | if norm == "GN": 42 | norm_module = nn.GroupNorm(32, out_channels) 43 | else: 44 | norm_module = get_norm(norm, out_channels) 45 | module.append(norm_module) 46 | if activation is not None: 47 | module.append(nn.ReLU(inplace=True)) 48 | if len(module) > 1: 49 | return nn.Sequential(*module) 50 | return conv 51 | 52 | return make_conv 53 | -------------------------------------------------------------------------------- /tools/eval_single.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import tqdm 4 | import json 5 | from collections import OrderedDict 6 | from detectron2.engine.defaults import DefaultPredictor 7 | from detectron2.evaluation.coco_evaluation import COCOEvaluator 8 | 9 | from adet.config import get_cfg 10 | from adet.data.datasets.cis import register_dataset 11 | from tools.train_net import Trainer 12 | 13 | 14 | def setup_cfg(config_file, model_weights, confidence_threshold): 15 | cfg = get_cfg() 16 | cfg.merge_from_file(config_file) 17 | # Set score_threshold for builtin models 18 | cfg.MODEL.WEIGHTS = model_weights 19 | cfg.MODEL.RETINANET.SCORE_THRESH_TEST = confidence_threshold 20 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 
confidence_threshold 21 | cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = confidence_threshold 22 | cfg.freeze() 23 | 24 | return cfg 25 | 26 | 27 | def main(config_file, model_weights, dataset_name, output_dir=None, confidence_threshold=0.3): 28 | cfg = setup_cfg(config_file, model_weights, confidence_threshold) 29 | 30 | if output_dir is None: 31 | output_dir = os.path.dirname(model_weights) 32 | 33 | if not os.path.exists(output_dir): 34 | os.makedirs(output_dir) 35 | 36 | predictor = DefaultPredictor(cfg) 37 | model = predictor.model 38 | data_loader = Trainer.build_test_loader(cfg, dataset_name) 39 | coco_eval = COCOEvaluator(dataset_name, output_dir=output_dir, tasks=('segm',)) 40 | 41 | eval_res = OrderedDict() 42 | for elem in tqdm.tqdm(data_loader): 43 | predictions = model(elem) 44 | coco_eval.reset() 45 | coco_eval.process(elem, predictions) 46 | eval_cur = coco_eval.evaluate() 47 | filename = os.path.basename(elem[0]['file_name']) 48 | eval_res[filename] = eval_cur['segm'] 49 | 50 | with open(os.path.join(output_dir, '{}_ap.json'.format(dataset_name)), 'w') as f: 51 | json.dump(eval_res, f, indent=4) 52 | -------------------------------------------------------------------------------- /adet/modeling/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | Taken from https://github.com/fundamentalvision/Deformable-DETR 4 | ************************************************************************************************** 5 | * Deformable DETR 6 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /adet/modeling/osformer/feed_forward.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 
| class FeedForwardNetwork(nn.Module): 6 | def __init__(self, d_model): 7 | super().__init__() 8 | self.d_model = d_model 9 | self.ffn = nn.Sequential( 10 | nn.Conv2d(d_model, d_model, 3, padding=1), 11 | nn.GroupNorm(8, d_model), 12 | nn.GELU(), 13 | nn.Conv2d(d_model, d_model, 3, padding=1) 14 | ) 15 | 16 | def forward(self, src, spatial_shapes, *args): 17 | split_list = [(w * h) for (w, h) in spatial_shapes] 18 | feat_levels = [] 19 | for memory, (w, h) in zip(src.split(split_list, 1), spatial_shapes): 20 | memory = memory.view(-1, w, h, self.d_model).permute(0, 3, 1, 2) 21 | memory = self.ffn(memory) 22 | feat_levels.append(memory.flatten(2).transpose(1, 2)) 23 | return torch.cat(feat_levels, 1) 24 | 25 | 26 | class VanillaFeedForwardNetwork(nn.Module): 27 | def __init__(self, d_model): 28 | super().__init__() 29 | self.ffn = nn.Sequential( 30 | nn.Conv1d(d_model, d_model, 3, padding=1, bias=False), 31 | nn.GroupNorm(8, d_model), 32 | nn.GELU(), 33 | nn.Conv1d(d_model, d_model, 3, padding=1, bias=False) 34 | ) 35 | 36 | def forward(self, src, *args): 37 | return self.ffn(src.permute(0, 2, 1)).permute(0, 2, 1) 38 | 39 | 40 | class StdFeedForwardNetwork(nn.Module): 41 | def __init__(self, d_model): 42 | super().__init__() 43 | self.ffn = nn.Sequential( 44 | nn.Linear(d_model, d_model), 45 | nn.ReLU(), 46 | nn.LayerNorm(d_model) 47 | ) 48 | self.norm = nn.LayerNorm(d_model) 49 | 50 | def forward(self, src, *args): 51 | return self.norm(src + self.ffn(src)) 52 | 53 | 54 | def get_ffn(d_model, ffn_type): 55 | if ffn_type == 'std': 56 | return StdFeedForwardNetwork(d_model) 57 | if ffn_type == 'vanilla': 58 | return VanillaFeedForwardNetwork(d_model) 59 | return FeedForwardNetwork(d_model) 60 | -------------------------------------------------------------------------------- /adet/data/builtin.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from detectron2.data.datasets.register_coco import register_coco_instances 4 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 5 | 6 | from .datasets.text import register_text_instances 7 | 8 | # register plane reconstruction 9 | 10 | _PREDEFINED_SPLITS_PIC = { 11 | "pic_person_train": ("pic/image/train", "pic/annotations/train_person.json"), 12 | "pic_person_val": ("pic/image/val", "pic/annotations/val_person.json"), 13 | } 14 | 15 | metadata_pic = { 16 | "thing_classes": ["person"] 17 | } 18 | 19 | _PREDEFINED_SPLITS_TEXT = { 20 | "totaltext_train": ("totaltext/train_images", "totaltext/train.json"), 21 | "totaltext_val": ("totaltext/test_images", "totaltext/test.json"), 22 | "ctw1500_word_train": ("CTW1500/ctwtrain_text_image", "CTW1500/annotations/train_ctw1500_maxlen100_v2.json"), 23 | "ctw1500_word_test": ("CTW1500/ctwtest_text_image","CTW1500/annotations/test_ctw1500_maxlen100.json"), 24 | "syntext1_train": ("syntext1/images", "syntext1/annotations/train.json"), 25 | "syntext2_train": ("syntext2/images", "syntext2/annotations/train.json"), 26 | "mltbezier_word_train": ("mlt2017/images","mlt2017/annotations/train.json"), 27 | } 28 | 29 | metadata_text = { 30 | "thing_classes": ["text"] 31 | } 32 | 33 | 34 | def register_all_coco(root="datasets"): 35 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_PIC.items(): 36 | # Assume pre-defined datasets live in `./datasets`. 
37 | register_coco_instances( 38 | key, 39 | metadata_pic, 40 | os.path.join(root, json_file) if "://" not in json_file else json_file, 41 | os.path.join(root, image_root), 42 | ) 43 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_TEXT.items(): 44 | # Assume pre-defined datasets live in `./datasets`. 45 | register_text_instances( 46 | key, 47 | metadata_text, 48 | os.path.join(root, json_file) if "://" not in json_file else json_file, 49 | os.path.join(root, image_root), 50 | ) 51 | 52 | 53 | register_all_coco() -------------------------------------------------------------------------------- /tools/combine_vis.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import cv2 4 | import numpy as np 5 | 6 | 7 | BASEDIR = 'SOTA' 8 | DATASETS = { 9 | 'cod': {'image': 'COD10K-v3/Test/Image/', 10 | 'gt': 'COD10K-v3/Test/GT_Instance/'}, 11 | 'nc4k': {'image': 'NC4K/Imgs/', 12 | 'gt': 'NC4K/Instance/'}} 13 | ORDER_JSON = 'SOTA/desc_res_{}.json' 14 | OUTPUT_DIR = 'Combined' 15 | if not os.path.exists(OUTPUT_DIR): 16 | os.makedirs(OUTPUT_DIR) 17 | 18 | 19 | # 4 rows 3 columns 20 | def combine_a_image(filename, dataset, score, order, nums_per_row=3, nums_per_col=4): 21 | print(filename) 22 | gt = cv2.imread(os.path.join(DATASETS[dataset]['gt'], filename.replace('.jpg', '.png'))) 23 | cv2.putText(gt, score, (60, 60), cv2.FONT_HERSHEY_PLAIN, 5.0, (0, 0, 255), 4) 24 | shape = gt.shape 25 | im = cv2.imread(os.path.join(DATASETS[dataset]['image'], filename)) 26 | target_shape = [shape[0] * nums_per_col, shape[1] * nums_per_row, 3] 27 | result = np.zeros(target_shape) 28 | result[:shape[0], :shape[1], :] = gt 29 | 30 | i = 0 31 | for method in os.listdir(BASEDIR): 32 | if not os.path.isdir(os.path.join(BASEDIR, method)): 33 | continue 34 | 35 | print(method) 36 | i += 1 37 | row = i // nums_per_row 38 | col = i % nums_per_row 39 | 40 | vis_map = cv2.imread(os.path.join(BASEDIR, method, 'vis', filename)) 41 | if vis_map is None: 42 | vis_map = im.copy() 43 | if vis_map.shape != shape: 44 | vis_map = cv2.resize(vis_map, (shape[1], shape[0])) 45 | cv2.putText(vis_map, method, (60, 60), cv2.FONT_HERSHEY_PLAIN, 5.0, (0, 0, 255), 4) 46 | result[row * shape[0]: (row + 1) * shape[0], 47 | col * shape[1]: (col + 1) * shape[1], :] = vis_map 48 | cv2.imwrite(os.path.join(OUTPUT_DIR, order + filename), result) 49 | print('Save {} successfully!'.format(os.path.join(OUTPUT_DIR, order + filename))) 50 | 51 | 52 | for dataset_ in ['nc4k']: # DATASETS.keys(): 53 | with open(ORDER_JSON.format(dataset_), 'r') as f: 54 | order_dict = json.load(f) 55 | 56 | for idx, (filename_, score_delta) in enumerate(order_dict.items()): 57 | score_fmt = '{:.2f}'.format(score_delta * 100) 58 | combine_a_image(filename_, dataset_, score_fmt, '{:04}_'.format(idx)) 59 | -------------------------------------------------------------------------------- /adet/modeling/osformer/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/facebookresearch/detr 2 | 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | import torch 8 | from torch import nn 9 | 10 | 11 | class PositionEmbeddingSine(nn.Module): 12 | """ 13 | This is a more standard version of the position embedding, very similar to the one 14 | used by the Attention is all you need paper, generalized to work on images. 
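Concretely, each normalized coordinate p along an axis is mapped to sin(p / temperature ** (2i / num_pos_feats)) and cos(p / temperature ** (2i / num_pos_feats)) over channel pairs i, and the y- and x-axis encodings are concatenated along the channel dimension.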
15 | """ 16 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 17 | super().__init__() 18 | self.num_pos_feats = num_pos_feats 19 | self.temperature = temperature 20 | self.normalize = normalize 21 | if scale is not None and normalize is False: 22 | raise ValueError("normalize should be True if scale is passed") 23 | if scale is None: 24 | scale = 2 * math.pi 25 | self.scale = scale 26 | 27 | def forward(self, tensor_list): 28 | x = tensor_list 29 | bs, _, w, h = x.shape 30 | not_mask = torch.zeros((bs, w, h), dtype=torch.bool, device=x.device) 31 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 32 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 33 | if self.normalize: 34 | eps = 1e-6 35 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 36 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 37 | 38 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 39 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) #dim_t (128) 40 | 41 | # x_embed[:, :, :, None] (bs, w, h, 1) 42 | pos_x = x_embed[:, :, :, None] / dim_t 43 | # pos_x.shape (ba, w, h, dim_t) 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 46 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 47 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 48 | return pos 49 | 50 | 51 | def build_position_encoding(hidden_dim): 52 | N_steps = hidden_dim // 2 53 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 54 | 55 | return position_embedding 56 | -------------------------------------------------------------------------------- /adet/layers/iou_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class IOULoss(nn.Module): 6 | """ 7 | Intersetion Over Union (IoU) loss which supports three 8 | different IoU computations: 9 | 10 | * IoU 11 | * Linear IoU 12 | * gIoU 13 | """ 14 | def __init__(self, loc_loss_type='iou'): 15 | super(IOULoss, self).__init__() 16 | self.loc_loss_type = loc_loss_type 17 | 18 | def forward(self, pred, target, weight=None): 19 | """ 20 | Args: 21 | pred: Nx4 predicted bounding boxes 22 | target: Nx4 target bounding boxes 23 | weight: N loss weight for each instance 24 | """ 25 | pred_left = pred[:, 0] 26 | pred_top = pred[:, 1] 27 | pred_right = pred[:, 2] 28 | pred_bottom = pred[:, 3] 29 | 30 | target_left = target[:, 0] 31 | target_top = target[:, 1] 32 | target_right = target[:, 2] 33 | target_bottom = target[:, 3] 34 | 35 | target_aera = (target_left + target_right) * \ 36 | (target_top + target_bottom) 37 | pred_aera = (pred_left + pred_right) * \ 38 | (pred_top + pred_bottom) 39 | 40 | w_intersect = torch.min(pred_left, target_left) + \ 41 | torch.min(pred_right, target_right) 42 | h_intersect = torch.min(pred_bottom, target_bottom) + \ 43 | torch.min(pred_top, target_top) 44 | 45 | g_w_intersect = torch.max(pred_left, target_left) + \ 46 | torch.max(pred_right, target_right) 47 | g_h_intersect = torch.max(pred_bottom, target_bottom) + \ 48 | torch.max(pred_top, target_top) 49 | ac_uion = g_w_intersect * g_h_intersect 50 | 51 | area_intersect = w_intersect * h_intersect 52 | area_union = target_aera + pred_aera - area_intersect 53 | 54 | ious = (area_intersect + 1.0) / (area_union + 1.0) 55 | gious = ious - (ac_uion - area_union) / 
ac_uion 56 | if self.loc_loss_type == 'iou': 57 | losses = -torch.log(ious) 58 | elif self.loc_loss_type == 'linear_iou': 59 | losses = 1 - ious 60 | elif self.loc_loss_type == 'giou': 61 | losses = 1 - gious 62 | else: 63 | raise NotImplementedError 64 | 65 | if weight is not None: 66 | return (losses * weight).sum() 67 | else: 68 | return losses.sum() 69 | -------------------------------------------------------------------------------- /adet/modeling/ops/setup.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | 8 | import os 9 | import glob 10 | 11 | import torch 12 | 13 | from torch.utils.cpp_extension import CUDA_HOME 14 | from torch.utils.cpp_extension import CppExtension 15 | from torch.utils.cpp_extension import CUDAExtension 16 | 17 | from setuptools import find_packages 18 | from setuptools import setup 19 | 20 | requirements = ["torch", "torchvision"] 21 | 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('Cuda is not availabel') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /adet/layers/gcn.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Conv2D(nn.Module): 7 | def __init__(self, in_channels, out_channels, kernel_size, padding='same', 8 | stride=1, dilation=1, groups=1): 9 | super(Conv2D, self).__init__() 10 | 11 | assert type(kernel_size) in [int, tuple], 
"Allowed kernel type [int or tuple], not {}".format(type(kernel_size)) 12 | assert padding == 'same', "Allowed padding type {}, not {}".format('same', padding) 13 | 14 | self.kernel_size = kernel_size 15 | if isinstance(kernel_size, tuple): 16 | self.h_kernel = kernel_size[0] 17 | self.w_kernel = kernel_size[1] 18 | else: 19 | self.h_kernel = kernel_size 20 | self.w_kernel = kernel_size 21 | 22 | self.padding = padding 23 | self.stride = stride 24 | self.dilation = dilation 25 | self.groups = groups 26 | self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 27 | stride=self.stride, dilation=self.dilation, groups=self.groups) 28 | 29 | def forward(self, x): 30 | 31 | if self.padding == 'same': 32 | 33 | height, width = x.shape[2:] 34 | 35 | h_pad_need = max(0, (height - 1) * self.stride + self.h_kernel - height) 36 | w_pad_need = max(0, (width - 1) * self.stride + self.w_kernel - width) 37 | 38 | pad_left = w_pad_need // 2 39 | pad_right = w_pad_need - pad_left 40 | pad_top = h_pad_need // 2 41 | pad_bottom = h_pad_need - pad_top 42 | 43 | padding = (pad_left, pad_right, pad_top, pad_bottom) 44 | 45 | x = F.pad(x, padding, 'constant', 0) 46 | 47 | x = self.conv(x) 48 | 49 | return x 50 | 51 | 52 | class GCN(nn.Module): 53 | """ 54 | Large Kernel Matters -- https://arxiv.org/abs/1703.02719 55 | """ 56 | def __init__(self, in_channels, out_channels, k=3): 57 | super(GCN, self).__init__() 58 | 59 | self.conv_l1 = Conv2D(in_channels=in_channels, out_channels=out_channels, kernel_size=(k, 1), padding='same') 60 | self.conv_l2 = Conv2D(in_channels=out_channels, out_channels=out_channels, kernel_size=(1, k), padding='same') 61 | 62 | self.conv_r1 = Conv2D(in_channels=in_channels, out_channels=out_channels, kernel_size=(1, k), padding='same') 63 | self.conv_r2 = Conv2D(in_channels=out_channels, out_channels=out_channels, kernel_size=(k, 1), padding='same') 64 | 65 | def forward(self, x): 66 | x1 = self.conv_l1(x) 67 | x1 = self.conv_l2(x1) 68 | 69 | x2 = self.conv_r1(x) 70 | x2 = self.conv_r2(x2) 71 | 72 | out = x1 + x2 73 | 74 | return out 75 | -------------------------------------------------------------------------------- /adet/modeling/backbone/fpn.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch.nn.functional as F 3 | import fvcore.nn.weight_init as weight_init 4 | 5 | from detectron2.modeling.backbone import FPN, build_resnet_backbone 6 | from detectron2.layers import ShapeSpec 7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 8 | 9 | from .resnet_lpf import build_resnet_lpf_backbone 10 | from .resnet_interval import build_resnet_interval_backbone 11 | from .mobilenet import build_mnv2_backbone 12 | 13 | 14 | class LastLevelP6P7(nn.Module): 15 | """ 16 | This module is used in RetinaNet and FCOS to generate extra layers, P6 and P7 from 17 | C5 or P5 feature. 
18 | """ 19 | 20 | def __init__(self, in_channels, out_channels, in_features="res5"): 21 | super().__init__() 22 | self.num_levels = 2 23 | self.in_feature = in_features 24 | self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) 25 | self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) 26 | for module in [self.p6, self.p7]: 27 | weight_init.c2_xavier_fill(module) 28 | 29 | def forward(self, x): 30 | p6 = self.p6(x) 31 | p7 = self.p7(F.relu(p6)) 32 | return [p6, p7] 33 | 34 | 35 | class LastLevelP6(nn.Module): 36 | """ 37 | This module is used in FCOS to generate extra layers 38 | """ 39 | 40 | def __init__(self, in_channels, out_channels, in_features="res5"): 41 | super().__init__() 42 | self.num_levels = 1 43 | self.in_feature = in_features 44 | self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) 45 | for module in [self.p6]: 46 | weight_init.c2_xavier_fill(module) 47 | 48 | def forward(self, x): 49 | p6 = self.p6(x) 50 | return [p6] 51 | 52 | 53 | @BACKBONE_REGISTRY.register() 54 | def build_fcos_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): 55 | """ 56 | Args: 57 | cfg: a detectron2 CfgNode 58 | 59 | Returns: 60 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 61 | """ 62 | if cfg.MODEL.BACKBONE.ANTI_ALIAS: 63 | bottom_up = build_resnet_lpf_backbone(cfg, input_shape) 64 | elif cfg.MODEL.RESNETS.DEFORM_INTERVAL > 1: 65 | bottom_up = build_resnet_interval_backbone(cfg, input_shape) 66 | elif cfg.MODEL.MOBILENET: 67 | bottom_up = build_mnv2_backbone(cfg, input_shape) 68 | else: 69 | bottom_up = build_resnet_backbone(cfg, input_shape) 70 | in_features = cfg.MODEL.FPN.IN_FEATURES 71 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 72 | top_levels = cfg.MODEL.FCOS.TOP_LEVELS 73 | in_channels_top = out_channels 74 | if top_levels == 2: 75 | top_block = LastLevelP6P7(in_channels_top, out_channels, "p5") 76 | if top_levels == 1: 77 | top_block = LastLevelP6(in_channels_top, out_channels, "p5") 78 | elif top_levels == 0: 79 | top_block = None 80 | backbone = FPN( 81 | bottom_up=bottom_up, 82 | in_features=in_features, 83 | out_channels=out_channels, 84 | norm=cfg.MODEL.FPN.NORM, 85 | top_block=top_block, 86 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 87 | ) 88 | return backbone 89 | -------------------------------------------------------------------------------- /adet/layers/csrc/DefROIAlign/DefROIAlign.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace adet { 5 | 6 | #ifdef WITH_CUDA 7 | at::Tensor DefROIAlign_forward_cuda( 8 | const at::Tensor& input, 9 | const at::Tensor& rois, 10 | const at::Tensor& offsets, // def added 11 | const float spatial_scale, 12 | const int pooled_height, 13 | const int pooled_width, 14 | const int sampling_ratio, 15 | const float trans_std, // def added 16 | bool aligned); 17 | 18 | at::Tensor DefROIAlign_backward_cuda( 19 | const at::Tensor& input, // def added 20 | const at::Tensor& grad, 21 | const at::Tensor& rois, 22 | const at::Tensor& offsets, // def added 23 | const at::Tensor& grad_offsets, // def added 24 | const float spatial_scale, 25 | const int pooled_height, 26 | const int pooled_width, 27 | const int batch_size, 28 | const int channels, 29 | const int height, 30 | const int width, 31 | const int sampling_ratio, 32 | const float trans_std, // def added 33 | bool aligned); 34 | #endif 35 | 36 | // Interface for Python 37 | inline at::Tensor DefROIAlign_forward( 38 | const at::Tensor& input, 39 | const at::Tensor& rois, 40 | const 
at::Tensor& offsets, // def added 41 | const float spatial_scale, 42 | const int pooled_height, 43 | const int pooled_width, 44 | const int sampling_ratio, 45 | const float trans_std, // def added 46 | bool aligned) { 47 | if (input.type().is_cuda()) { 48 | #ifdef WITH_CUDA 49 | return DefROIAlign_forward_cuda( 50 | input, 51 | rois, 52 | offsets, 53 | spatial_scale, 54 | pooled_height, 55 | pooled_width, 56 | sampling_ratio, 57 | trans_std, 58 | aligned); 59 | #else 60 | AT_ERROR("Not compiled with GPU support"); 61 | #endif 62 | } 63 | AT_ERROR("CPU version not supported"); 64 | } 65 | 66 | inline at::Tensor DefROIAlign_backward( 67 | const at::Tensor& input, // def added 68 | const at::Tensor& grad, 69 | const at::Tensor& rois, 70 | const at::Tensor& offsets, // def added 71 | const at::Tensor& grad_offsets, // def added 72 | const float spatial_scale, 73 | const int pooled_height, 74 | const int pooled_width, 75 | const int batch_size, 76 | const int channels, 77 | const int height, 78 | const int width, 79 | const int sampling_ratio, 80 | const float trans_std, // def added 81 | bool aligned) { 82 | if (grad.type().is_cuda()) { 83 | #ifdef WITH_CUDA 84 | return DefROIAlign_backward_cuda( 85 | input, // def added 86 | grad, 87 | rois, 88 | offsets, // def added 89 | grad_offsets, // def added 90 | spatial_scale, 91 | pooled_height, 92 | pooled_width, 93 | batch_size, 94 | channels, 95 | height, 96 | width, 97 | sampling_ratio, 98 | trans_std, // def added 99 | aligned); 100 | #else 101 | AT_ERROR("Not compiled with GPU support"); 102 | #endif 103 | } 104 | AT_ERROR("CPU version not supported"); 105 | } 106 | 107 | } // namespace adet 108 | -------------------------------------------------------------------------------- /adet/layers/naive_group_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Module, Parameter 3 | from torch.nn import init 4 | 5 | 6 | class NaiveGroupNorm(Module): 7 | r"""NaiveGroupNorm implements Group Normalization with the high-level matrix operations in PyTorch. 8 | It is a temporary solution to export GN by ONNX before the official GN can be exported by ONNX. 9 | The usage of NaiveGroupNorm is exactly the same as the official :class:`torch.nn.GroupNorm`. 10 | Args: 11 | num_groups (int): number of groups to separate the channels into 12 | num_channels (int): number of channels expected in input 13 | eps: a value added to the denominator for numerical stability. Default: 1e-5 14 | affine: a boolean value that when set to ``True``, this module 15 | has learnable per-channel affine parameters initialized to ones (for weights) 16 | and zeros (for biases). Default: ``True``. 17 | 18 | Shape: 19 | - Input: :math:`(N, C, *)` where :math:`C=\text{num\_channels}` 20 | - Output: :math:`(N, C, *)` (same shape as input) 21 | 22 | Examples:: 23 | 24 | >>> input = torch.randn(20, 6, 10, 10) 25 | >>> # Separate 6 channels into 3 groups 26 | >>> m = NaiveGroupNorm(3, 6) 27 | >>> # Separate 6 channels into 6 groups (equivalent with InstanceNorm) 28 | >>> m = NaiveGroupNorm(6, 6) 29 | >>> # Put all 6 channels into a single group (equivalent with LayerNorm) 30 | >>> m = NaiveGroupNorm(1, 6) 31 | >>> # Activating the module 32 | >>> output = m(input) 33 | 34 | .. 
_`Group Normalization`: https://arxiv.org/abs/1803.08494 35 | """ 36 | __constants__ = ['num_groups', 'num_channels', 'eps', 'affine', 'weight', 37 | 'bias'] 38 | 39 | def __init__(self, num_groups, num_channels, eps=1e-5, affine=True): 40 | super(NaiveGroupNorm, self).__init__() 41 | self.num_groups = num_groups 42 | self.num_channels = num_channels 43 | self.eps = eps 44 | self.affine = affine 45 | if self.affine: 46 | self.weight = Parameter(torch.Tensor(num_channels)) 47 | self.bias = Parameter(torch.Tensor(num_channels)) 48 | else: 49 | self.register_parameter('weight', None) 50 | self.register_parameter('bias', None) 51 | self.reset_parameters() 52 | 53 | def reset_parameters(self): 54 | if self.affine: 55 | init.ones_(self.weight) 56 | init.zeros_(self.bias) 57 | 58 | def forward(self, input): 59 | N, C, H, W = input.size() 60 | assert C % self.num_groups == 0 61 | input = input.reshape(N, self.num_groups, -1) 62 | mean = input.mean(dim=-1, keepdim=True) 63 | var = (input ** 2).mean(dim=-1, keepdim=True) - mean ** 2 64 | std = torch.sqrt(var + self.eps) 65 | 66 | input = (input - mean) / std 67 | input = input.reshape(N, C, H, W) 68 | if self.affine: 69 | input = input * self.weight.reshape(1, C, 1, 1) + self.bias.reshape(1, C, 1, 1) 70 | return input 71 | 72 | def extra_repr(self): 73 | return '{num_groups}, {num_channels}, eps={eps}, ' \ 74 | 'affine={affine}'.format(**self.__dict__) 75 | -------------------------------------------------------------------------------- /adet/modeling/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
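# Illustrative aside, not part of the original file: the NaiveGroupNorm layer listed above is
# documented as a drop-in replacement for torch.nn.GroupNorm (both initialise weight=1 and
# bias=0), so a quick numerical comparison is a natural sanity check.
import torch
from adet.layers.naive_group_norm import NaiveGroupNorm

x = torch.randn(2, 32, 8, 8)
naive = NaiveGroupNorm(num_groups=8, num_channels=32)
ref = torch.nn.GroupNorm(num_groups=8, num_channels=32)
print(torch.allclose(naive(x), ref(x), atol=1e-4))  # True, up to floating-point error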
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | 8 | from __future__ import absolute_import 9 | from __future__ import print_function 10 | from __future__ import division 11 | 12 | import torch 13 | import torch.nn.functional as F 14 | from torch.autograd import Function 15 | from torch.autograd.function import once_differentiable 16 | 17 | import MultiScaleDeformableAttention as MSDA 18 | 19 | 20 | class MSDeformAttnFunction(Function): 21 | @staticmethod 22 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 23 | ctx.im2col_step = im2col_step 24 | output = MSDA.ms_deform_attn_forward( 25 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 26 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 27 | return output 28 | 29 | @staticmethod 30 | @once_differentiable 31 | def backward(ctx, grad_output): 32 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 33 | grad_value, grad_sampling_loc, grad_attn_weight = \ 34 | MSDA.ms_deform_attn_backward( 35 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 36 | 37 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 38 | 39 | 40 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 41 | # for debug and test only, 42 | # need to use cuda version instead 43 | N_, S_, M_, D_ = value.shape 44 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 45 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 46 | sampling_grids = 2 * sampling_locations - 1 47 | sampling_value_list = [] 48 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 49 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 50 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 51 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 52 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 53 | # N_*M_, D_, Lq_, P_ 54 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 55 | mode='bilinear', padding_mode='zeros', align_corners=False) 56 | sampling_value_list.append(sampling_value_l_) 57 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 58 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 59 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 60 | return output.transpose(1, 2).contiguous() 61 | -------------------------------------------------------------------------------- /adet/utils/visualizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from detectron2.utils.visualizer import Visualizer 4 | 5 | 6 | class TextVisualizer(Visualizer): 7 | def draw_instance_predictions(self, predictions): 8 | beziers = predictions.beziers.numpy() 9 | scores = predictions.scores.tolist() 10 | recs = predictions.recs 11 | 12 | self.overlay_instances(beziers, recs, scores) 13 | 14 | return self.output 15 | 16 | def _bezier_to_poly(self, bezier): 17 
| # bezier to polygon 18 | u = np.linspace(0, 1, 20) 19 | bezier = bezier.reshape(2, 4, 2).transpose(0, 2, 1).reshape(4, 4) 20 | points = np.outer((1 - u) ** 3, bezier[:, 0]) \ 21 | + np.outer(3 * u * ((1 - u) ** 2), bezier[:, 1]) \ 22 | + np.outer(3 * (u ** 2) * (1 - u), bezier[:, 2]) \ 23 | + np.outer(u ** 3, bezier[:, 3]) 24 | points = np.concatenate((points[:, :2], points[:, 2:]), axis=0) 25 | 26 | return points 27 | 28 | def _decode_recognition(self, rec): 29 | CTLABELS = [' ','!','"','#','$','%','&','\'','(',')','*','+',',','-','.','/','0','1','2','3','4','5','6','7','8','9',':',';','<','=','>','?','@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_','`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','{','|','}','~'] 30 | 31 | s = '' 32 | for c in rec: 33 | c = int(c) 34 | if c < 95: 35 | s += CTLABELS[c] 36 | elif c == 95: 37 | s += u'口' 38 | return s 39 | 40 | def _ctc_decode_recognition(self, rec): 41 | CTLABELS = [' ','!','"','#','$','%','&','\'','(',')','*','+',',','-','.','/','0','1','2','3','4','5','6','7','8','9',':',';','<','=','>','?','@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_','`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','{','|','}','~'] 42 | 43 | # ctc decoding 44 | last_char = False 45 | s = '' 46 | for c in rec: 47 | c = int(c) 48 | if c < 95: 49 | if last_char != c: 50 | s += CTLABELS[c] 51 | last_char = c 52 | elif c == 95: 53 | s += u'口' 54 | else: 55 | last_char = False 56 | return s 57 | 58 | def overlay_instances(self, beziers, recs, scores, alpha=0.5): 59 | color = (0.1, 0.2, 0.5) 60 | 61 | for bezier, rec, score in zip(beziers, recs, scores): 62 | polygon = self._bezier_to_poly(bezier) 63 | self.draw_polygon(polygon, color, alpha=alpha) 64 | 65 | # draw text in the top left corner 66 | # text = self._decode_recognition(rec) 67 | # text = "{:.3f}: {}".format(score, text) 68 | # lighter_color = self._change_color_brightness(color, brightness_factor=0.7) 69 | # text_pos = polygon[0] 70 | # horiz_align = "left" 71 | # font_size = self._default_font_size 72 | # 73 | # self.draw_text( 74 | # text, 75 | # text_pos, 76 | # color=lighter_color, 77 | # horizontal_alignment=horiz_align, 78 | # font_size=font_size, 79 | # ) -------------------------------------------------------------------------------- /adet/data/detection_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from detectron2.data import transforms as T 7 | from detectron2.data.detection_utils import \ 8 | annotations_to_instances as d2_anno_to_inst 9 | from detectron2.data.detection_utils import \ 10 | transform_instance_annotations as d2_transform_inst_anno 11 | 12 | 13 | def transform_instance_annotations( 14 | annotation, transforms, image_size, *, keypoint_hflip_indices=None 15 | ): 16 | 17 | annotation = d2_transform_inst_anno( 18 | annotation, 19 | transforms, 20 | image_size, 21 | keypoint_hflip_indices=keypoint_hflip_indices, 22 | ) 23 | 24 | if "beziers" in annotation: 25 | beziers = transform_beziers_annotations(annotation["beziers"], transforms) 26 | annotation["beziers"] = beziers 27 | return annotation 28 | 29 | 30 | def transform_beziers_annotations(beziers, transforms): 31 | """ 32 | Transform keypoint 
annotations of an image. 33 | 34 | Args: 35 | beziers (list[float]): Nx16 float in Detectron2 Dataset format. 36 | transforms (TransformList): 37 | """ 38 | # (N*2,) -> (N, 2) 39 | beziers = np.asarray(beziers, dtype="float64").reshape(-1, 2) 40 | beziers = transforms.apply_coords(beziers).reshape(-1) 41 | 42 | # This assumes that HorizFlipTransform is the only one that does flip 43 | do_hflip = ( 44 | sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1 45 | ) 46 | if do_hflip: 47 | raise ValueError("Flipping text data is not supported (also disencouraged).") 48 | 49 | return beziers 50 | 51 | 52 | def annotations_to_instances(annos, image_size, mask_format="polygon"): 53 | instance = d2_anno_to_inst(annos, image_size, mask_format) 54 | 55 | if not annos: 56 | return instance 57 | 58 | # add attributes 59 | if "beziers" in annos[0]: 60 | beziers = [obj.get("beziers", []) for obj in annos] 61 | instance.beziers = torch.as_tensor(beziers, dtype=torch.float32) 62 | 63 | if "rec" in annos[0]: 64 | text = [obj.get("rec", []) for obj in annos] 65 | instance.text = torch.as_tensor(text, dtype=torch.int32) 66 | 67 | return instance 68 | 69 | 70 | def build_augmentation(cfg, is_train): 71 | """ 72 | With option to don't use hflip 73 | 74 | Returns: 75 | list[Augmentation] 76 | """ 77 | if is_train: 78 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 79 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 80 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 81 | else: 82 | min_size = cfg.INPUT.MIN_SIZE_TEST 83 | max_size = cfg.INPUT.MAX_SIZE_TEST 84 | sample_style = "choice" 85 | if sample_style == "range": 86 | assert ( 87 | len(min_size) == 2 88 | ), "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size)) 89 | 90 | logger = logging.getLogger(__name__) 91 | 92 | augmentation = [] 93 | augmentation.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 94 | if is_train: 95 | if cfg.INPUT.HFLIP_TRAIN: 96 | augmentation.append(T.RandomFlip()) 97 | logger.info("Augmentations used in training: " + str(augmentation)) 98 | return augmentation 99 | 100 | 101 | build_transform_gen = build_augmentation 102 | """ 103 | Alias for backward-compatibility. 
104 | """ 105 | -------------------------------------------------------------------------------- /adet/layers/csrc/BezierAlign/BezierAlign.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace adet { 5 | 6 | at::Tensor BezierAlign_forward_cpu( 7 | const at::Tensor& input, 8 | const at::Tensor& rois, 9 | const float spatial_scale, 10 | const int pooled_height, 11 | const int pooled_width, 12 | const int sampling_ratio, 13 | bool aligned); 14 | 15 | at::Tensor BezierAlign_backward_cpu( 16 | const at::Tensor& grad, 17 | const at::Tensor& rois, 18 | const float spatial_scale, 19 | const int pooled_height, 20 | const int pooled_width, 21 | const int batch_size, 22 | const int channels, 23 | const int height, 24 | const int width, 25 | const int sampling_ratio, 26 | bool aligned); 27 | 28 | #ifdef WITH_CUDA 29 | at::Tensor BezierAlign_forward_cuda( 30 | const at::Tensor& input, 31 | const at::Tensor& rois, 32 | const float spatial_scale, 33 | const int pooled_height, 34 | const int pooled_width, 35 | const int sampling_ratio, 36 | bool aligned); 37 | 38 | at::Tensor BezierAlign_backward_cuda( 39 | const at::Tensor& grad, 40 | const at::Tensor& rois, 41 | const float spatial_scale, 42 | const int pooled_height, 43 | const int pooled_width, 44 | const int batch_size, 45 | const int channels, 46 | const int height, 47 | const int width, 48 | const int sampling_ratio, 49 | bool aligned); 50 | #endif 51 | 52 | // Interface for Python 53 | inline at::Tensor BezierAlign_forward( 54 | const at::Tensor& input, 55 | const at::Tensor& rois, 56 | const float spatial_scale, 57 | const int pooled_height, 58 | const int pooled_width, 59 | const int sampling_ratio, 60 | bool aligned) { 61 | if (input.type().is_cuda()) { 62 | #ifdef WITH_CUDA 63 | return BezierAlign_forward_cuda( 64 | input, 65 | rois, 66 | spatial_scale, 67 | pooled_height, 68 | pooled_width, 69 | sampling_ratio, 70 | aligned); 71 | #else 72 | AT_ERROR("Not compiled with GPU support"); 73 | #endif 74 | } 75 | return BezierAlign_forward_cpu( 76 | input, 77 | rois, 78 | spatial_scale, 79 | pooled_height, 80 | pooled_width, 81 | sampling_ratio, 82 | aligned); 83 | } 84 | 85 | inline at::Tensor BezierAlign_backward( 86 | const at::Tensor& grad, 87 | const at::Tensor& rois, 88 | const float spatial_scale, 89 | const int pooled_height, 90 | const int pooled_width, 91 | const int batch_size, 92 | const int channels, 93 | const int height, 94 | const int width, 95 | const int sampling_ratio, 96 | bool aligned) { 97 | if (grad.type().is_cuda()) { 98 | #ifdef WITH_CUDA 99 | return BezierAlign_backward_cuda( 100 | grad, 101 | rois, 102 | spatial_scale, 103 | pooled_height, 104 | pooled_width, 105 | batch_size, 106 | channels, 107 | height, 108 | width, 109 | sampling_ratio, 110 | aligned); 111 | #else 112 | AT_ERROR("Not compiled with GPU support"); 113 | #endif 114 | } 115 | return BezierAlign_backward_cpu( 116 | grad, 117 | rois, 118 | spatial_scale, 119 | pooled_height, 120 | pooled_width, 121 | batch_size, 122 | channels, 123 | height, 124 | width, 125 | sampling_ratio, 126 | aligned); 127 | } 128 | 129 | } // namespace detectron2 130 | -------------------------------------------------------------------------------- /tools/plot_utils.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/detr 2 | import os 3 | import pandas as pd 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 
6 | from pathlib import Path, PurePath 7 | 8 | 9 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), 10 | ewm_col=0, log_name='metrics.json'): 11 | """ 12 | Function to plot specific fields from training log(s). Plots both training and test results. 13 | 14 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file 15 | - fields = which results to plot from each log file - plots both training and test for each field. 16 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots 17 | - log_name = optional, name of log file if different than default 'log.txt'. 18 | 19 | :: Outputs - matplotlib plots of results in fields, color coded for each log file. 20 | - solid lines are training results, dashed lines are test results. 21 | 22 | """ 23 | func_name = "plot_utils.py::plot_logs" 24 | 25 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, 26 | # convert single Path to list to avoid 'not iterable' error 27 | 28 | if not isinstance(logs, list): 29 | if isinstance(logs, PurePath): 30 | logs = [logs] 31 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") 32 | else: 33 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ 34 | Expect list[Path] or single Path obj, received {type(logs)}") 35 | 36 | # Quality checks - verify valid dir(s), that every item in list is Path object, and that log_name exists in each dir 37 | for i, dir in enumerate(logs): 38 | if not isinstance(dir, PurePath): 39 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") 40 | if not dir.exists(): 41 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") 42 | # verify log_name exists 43 | fn = Path(dir / log_name) 44 | if not fn.exists(): 45 | print(f"-> missing {log_name}. 
Have you gotten to Epoch 1 in training?") 46 | print(f"--> full path of missing log file: {fn}") 47 | return 48 | 49 | # load log file(s) and plot 50 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] 51 | 52 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) 53 | 54 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): 55 | for j, field in enumerate(fields): 56 | df.rolling(60).mean()[::60].interpolate().ewm(com=ewm_col).mean().plot( 57 | y=field, 58 | ax=axs[j], 59 | color=color, 60 | style='-') 61 | for ax, field in zip(axs, fields): 62 | ax.legend([Path(p).name for p in logs]) 63 | ax.set_title(field) 64 | 65 | return fig, axs 66 | 67 | 68 | def save_plot(log_path_list, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), save_name='result.png'): 69 | if isinstance(log_path_list, list): 70 | log = [Path(log_path) for log_path in log_path_list] 71 | else: 72 | log = Path(log_path_list) 73 | fig, _ = plot_logs(log, fields) 74 | fig.savefig(os.path.join(log_path_list[0], save_name)) 75 | 76 | 77 | if __name__ == '__main__': 78 | save_plot(['OSFormer', 79 | 'OSFormer-ZEROLIKE', 80 | 'OSFormer-NNEMBEDDING'], 81 | fields=('total_loss', 'loss_ins')) 82 | -------------------------------------------------------------------------------- /adet/layers/bezier_align.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.autograd import Function 3 | from torch.autograd.function import once_differentiable 4 | from torch.nn.modules.utils import _pair 5 | 6 | from adet import _C 7 | 8 | 9 | class _BezierAlign(Function): 10 | @staticmethod 11 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio, aligned): 12 | ctx.save_for_backward(roi) 13 | ctx.output_size = _pair(output_size) 14 | ctx.spatial_scale = spatial_scale 15 | ctx.sampling_ratio = sampling_ratio 16 | ctx.input_shape = input.size() 17 | ctx.aligned = aligned 18 | output = _C.bezier_align_forward( 19 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned 20 | ) 21 | return output 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | rois, = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | sampling_ratio = ctx.sampling_ratio 30 | bs, ch, h, w = ctx.input_shape 31 | grad_input = _C.bezier_align_backward( 32 | grad_output, 33 | rois, 34 | spatial_scale, 35 | output_size[0], 36 | output_size[1], 37 | bs, 38 | ch, 39 | h, 40 | w, 41 | sampling_ratio, 42 | ctx.aligned, 43 | ) 44 | return grad_input, None, None, None, None, None 45 | 46 | 47 | bezier_align = _BezierAlign.apply 48 | 49 | 50 | class BezierAlign(nn.Module): 51 | def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True): 52 | """ 53 | Args: 54 | output_size (tuple): h, w 55 | spatial_scale (float): scale the input boxes by this number 56 | sampling_ratio (int): number of inputs samples to take for each output 57 | sample. 0 to take samples densely. 58 | aligned (bool): if False, use the legacy implementation in 59 | Detectron. If True, align the results more perfectly. 60 | 61 | Note: 62 | The meaning of aligned=True: 63 | 64 | With `aligned=True`, 65 | we first appropriately scale the ROI and then shift it by -0.5 66 | prior to calling bezier_align. This produces the correct neighbors; see 67 | adet/tests/test_bezier_align.py for verification. 
68 | 69 | The difference does not make a difference to the model's performance if 70 | ROIAlign is used together with conv layers. 71 | """ 72 | super(BezierAlign, self).__init__() 73 | self.output_size = output_size 74 | self.spatial_scale = spatial_scale 75 | self.sampling_ratio = sampling_ratio 76 | self.aligned = aligned 77 | 78 | def forward(self, input, rois): 79 | """ 80 | Args: 81 | input: NCHW images 82 | rois: Bx17 boxes. First column is the index into N. The other 16 columns are [xy]x8. 83 | """ 84 | assert rois.dim() == 2 and rois.size(1) == 17 85 | return bezier_align( 86 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned 87 | ) 88 | 89 | def __repr__(self): 90 | tmpstr = self.__class__.__name__ + "(" 91 | tmpstr += "output_size=" + str(self.output_size) 92 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 93 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 94 | tmpstr += ", aligned=" + str(self.aligned) 95 | tmpstr += ")" 96 | return tmpstr 97 | -------------------------------------------------------------------------------- /demo/vis_pred_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import json 4 | from collections import defaultdict, OrderedDict 5 | 6 | import torch 7 | import pycocotools.mask as mask_util 8 | from pycocotools.coco import COCO 9 | from detectron2.structures import Instances 10 | from detectron2.utils.visualizer import Visualizer, ColorMode 11 | from detectron2.evaluation.coco_evaluation import COCOEvaluator 12 | from adet.data.datasets.cis import register_dataset 13 | 14 | 15 | def pre_process_json(json_file, score_threshold=0.3): 16 | with open(json_file, 'r') as f: 17 | data = json.load(f) 18 | 19 | data_filtered = defaultdict(list) 20 | for d in data: 21 | if d['score'] > score_threshold: 22 | data_filtered[d['image_id']].append(d) 23 | 24 | return data_filtered 25 | 26 | 27 | def data2instance(data): 28 | instances = {} 29 | for k in data.keys(): 30 | if len(data[k]) == 0: 31 | instances[k] = None; continue  # skip images with no kept detections 32 | results = Instances(data[k][0]['segmentation']['size']) 33 | scores = [] 34 | segms = [] 35 | for elem in data[k]: 36 | scores.append(elem['score']) 37 | segms.append(mask_util.decode(elem['segmentation'])) 38 | results.scores = torch.as_tensor(scores) 39 | results.pred_masks = torch.as_tensor(segms) 40 | instances[k] = results 41 | 42 | return instances 43 | 44 | 45 | def vis_single_image(img_info, ins, img_dir, output_dir, instance_mode=ColorMode.IMAGE): 46 | img_filename = img_info['file_name'] 47 | if ins is None: 48 | ins = Instances((img_info['height'], img_info['width'])) 49 | 50 | im = cv2.imread(os.path.join(img_dir, img_filename))[:, :, ::-1] 51 | visualizer = Visualizer(im, instance_mode=instance_mode) 52 | vis_output = visualizer.draw_instance_predictions(predictions=ins) 53 | out_filename = os.path.join(output_dir, img_filename) 54 | print('Save {} successfully.'.format(out_filename)) 55 | vis_output.save(out_filename) 56 | 57 | 58 | def eval_single_image(coco_eval, prediction): 59 | coco_eval.reset() 60 | coco_eval._predictions.append(prediction) 61 | return coco_eval.evaluate() 62 | 63 | 64 | def vis(res_json, dataset_name, output_dir=None, score_threshold=0.3): 65 | if output_dir is None: 66 | output_dir = os.path.dirname(res_json) 67 | 68 | vis_dir = os.path.join(output_dir, 'vis') 69 | if not os.path.exists(vis_dir): 70 | os.makedirs(vis_dir) 71 | 72 | anno_json, img_dir = datasets[dataset_name] 73 | coco =
COCO(anno_json) 74 | coco_eval = COCOEvaluator(dataset_name, output_dir=output_dir, tasks=('segm',)) 75 | 76 | data = pre_process_json(res_json, score_threshold) 77 | instances = data2instance(data) 78 | print('Get instances successfully.') 79 | 80 | eval_res = OrderedDict() 81 | for img_id, ins in instances.items(): 82 | vis_single_image(coco.imgs[img_id], ins, img_dir, vis_dir) 83 | prediction = {"image_id": img_id, "instances": data[img_id]} 84 | eval_cur = eval_single_image(coco_eval, prediction) 85 | eval_res[coco.imgs[img_id]['file_name']] = eval_cur['segm'] 86 | 87 | with open(os.path.join(output_dir, '{}_ap.json'.format(dataset_name)), 'w') as f: 88 | json.dump(eval_res, f, indent=4) 89 | 90 | 91 | if __name__ == '__main__': 92 | datasets = { 93 | "my_data_test_coco_cod_style": [ 94 | 'COD10K-v3/annotations/test2026.json', 95 | 'COD10K-v3/Test/Image/' 96 | ], 97 | "my_data_test_coco_nc4k_style": [ 98 | 'NC4K/nc4k_test.json', 99 | 'NC4K/Imgs/' 100 | ] 101 | } 102 | 103 | register_dataset() 104 | -------------------------------------------------------------------------------- /adet/layers/def_roi_align.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Function 4 | from torch.autograd.function import once_differentiable 5 | from torch.nn.modules.utils import _pair 6 | 7 | from adet import _C 8 | 9 | 10 | class _DefROIAlign(Function): 11 | @staticmethod 12 | def forward(ctx, input, roi, offsets, output_size, spatial_scale, sampling_ratio, trans_std, aligned): 13 | ctx.save_for_backward(input, roi, offsets) 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.sampling_ratio = sampling_ratio 17 | ctx.trans_std = trans_std 18 | ctx.input_shape = input.size() 19 | ctx.aligned = aligned 20 | output = _C.def_roi_align_forward( 21 | input, roi, offsets, spatial_scale, output_size[0], output_size[1], 22 | sampling_ratio, trans_std, aligned 23 | ) 24 | return output 25 | 26 | @staticmethod 27 | @once_differentiable 28 | def backward(ctx, grad_output): 29 | data, rois, offsets = ctx.saved_tensors 30 | output_size = ctx.output_size 31 | spatial_scale = ctx.spatial_scale 32 | sampling_ratio = ctx.sampling_ratio 33 | trans_std = ctx.trans_std 34 | bs, ch, h, w = ctx.input_shape 35 | grad_offsets = torch.zeros_like(offsets) 36 | 37 | grad_input = _C.def_roi_align_backward( 38 | data, 39 | grad_output, 40 | rois, 41 | offsets, 42 | grad_offsets, 43 | spatial_scale, 44 | output_size[0], 45 | output_size[1], 46 | bs, 47 | ch, 48 | h, 49 | w, 50 | sampling_ratio, 51 | trans_std, 52 | ctx.aligned, 53 | ) 54 | return grad_input, None, grad_offsets, None, None, None, None, None 55 | 56 | 57 | def_roi_align = _DefROIAlign.apply 58 | 59 | 60 | class DefROIAlign(nn.Module): 61 | def __init__(self, output_size, spatial_scale, 62 | sampling_ratio, trans_std, aligned=True): 63 | """ 64 | Args: 65 | output_size (tuple): h, w 66 | spatial_scale (float): scale the input boxes by this number 67 | sampling_ratio (int): number of inputs samples to take for each output 68 | sample. 0 to take samples densely. 69 | trans_std (float): offset scale according to the normalized roi size 70 | aligned (bool): if False, use the legacy implementation in 71 | Detectron. If True, align the results more perfectly. 
72 | """ 73 | super(DefROIAlign, self).__init__() 74 | self.output_size = output_size 75 | self.spatial_scale = spatial_scale 76 | self.sampling_ratio = sampling_ratio 77 | self.trans_std = trans_std 78 | self.aligned = aligned 79 | 80 | def forward(self, input, rois, offsets): 81 | """ 82 | Args: 83 | input: NCHW images 84 | rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy. 85 | """ 86 | assert rois.dim() == 2 and rois.size(1) == 5 87 | return def_roi_align( 88 | input, rois, offsets, self.output_size, 89 | self.spatial_scale, self.sampling_ratio, 90 | self.trans_std, self.aligned 91 | ) 92 | 93 | def __repr__(self): 94 | tmpstr = self.__class__.__name__ + "(" 95 | tmpstr += "output_size=" + str(self.output_size) 96 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 97 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 98 | tmpstr += ", trans_std=" + str(self.trans_std) 99 | tmpstr += ", aligned=" + str(self.aligned) 100 | tmpstr += ")" 101 | return tmpstr 102 | -------------------------------------------------------------------------------- /adet/data/augmentation.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from fvcore.transforms import transform as T 5 | 6 | from detectron2.data.transforms import RandomCrop, StandardAugInput 7 | from detectron2.structures import BoxMode 8 | 9 | 10 | def gen_crop_transform_with_instance(crop_size, image_size, instances, crop_box=True): 11 | """ 12 | Generate a CropTransform so that the cropping region contains 13 | the center of the given instance. 14 | 15 | Args: 16 | crop_size (tuple): h, w in pixels 17 | image_size (tuple): h, w 18 | instance (dict): an annotation dict of one instance, in Detectron2's 19 | dataset format. 20 | """ 21 | bbox = random.choice(instances) 22 | crop_size = np.asarray(crop_size, dtype=np.int32) 23 | center_yx = (bbox[1] + bbox[3]) * 0.5, (bbox[0] + bbox[2]) * 0.5 24 | assert ( 25 | image_size[0] >= center_yx[0] and image_size[1] >= center_yx[1] 26 | ), "The annotation bounding box is outside of the image!" 27 | assert ( 28 | image_size[0] >= crop_size[0] and image_size[1] >= crop_size[1] 29 | ), "Crop size is larger than image size!" 
30 | 31 | min_yx = np.maximum(np.floor(center_yx).astype(np.int32) - crop_size, 0) 32 | max_yx = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_size, 0) 33 | max_yx = np.minimum(max_yx, np.ceil(center_yx).astype(np.int32)) 34 | 35 | y0 = np.random.randint(min_yx[0], max_yx[0] + 1) 36 | x0 = np.random.randint(min_yx[1], max_yx[1] + 1) 37 | 38 | # if some instance is cropped extend the box 39 | if not crop_box: 40 | num_modifications = 0 41 | modified = True 42 | 43 | # convert crop_size to float 44 | crop_size = crop_size.astype(np.float32) 45 | while modified: 46 | modified, x0, y0, crop_size = adjust_crop(x0, y0, crop_size, instances) 47 | num_modifications += 1 48 | if num_modifications > 100: 49 | raise ValueError( 50 | "Cannot finished cropping adjustment within 100 tries (#instances {}).".format( 51 | len(instances) 52 | ) 53 | ) 54 | return T.CropTransform(0, 0, image_size[1], image_size[0]) 55 | 56 | return T.CropTransform(*map(int, (x0, y0, crop_size[1], crop_size[0]))) 57 | 58 | 59 | def adjust_crop(x0, y0, crop_size, instances, eps=1e-3): 60 | modified = False 61 | 62 | x1 = x0 + crop_size[1] 63 | y1 = y0 + crop_size[0] 64 | 65 | for bbox in instances: 66 | 67 | if bbox[0] < x0 - eps and bbox[2] > x0 + eps: 68 | crop_size[1] += x0 - bbox[0] 69 | x0 = bbox[0] 70 | modified = True 71 | 72 | if bbox[0] < x1 - eps and bbox[2] > x1 + eps: 73 | crop_size[1] += bbox[2] - x1 74 | x1 = bbox[2] 75 | modified = True 76 | 77 | if bbox[1] < y0 - eps and bbox[3] > y0 + eps: 78 | crop_size[0] += y0 - bbox[1] 79 | y0 = bbox[1] 80 | modified = True 81 | 82 | if bbox[1] < y1 - eps and bbox[3] > y1 + eps: 83 | crop_size[0] += bbox[3] - y1 84 | y1 = bbox[3] 85 | modified = True 86 | 87 | return modified, x0, y0, crop_size 88 | 89 | 90 | class RandomCropWithInstance(RandomCrop): 91 | """ Instance-aware cropping. 
92 | """ 93 | 94 | def __init__(self, crop_type, crop_size, crop_instance=True): 95 | """ 96 | Args: 97 | crop_instance (bool): if False, extend cropping boxes to avoid cropping instances 98 | """ 99 | super().__init__(crop_type, crop_size) 100 | self.crop_instance = crop_instance 101 | self.input_args = ("image", "boxes") 102 | 103 | def get_transform(self, img, boxes): 104 | image_size = img.shape[:2] 105 | crop_size = self.get_crop_size(image_size) 106 | return gen_crop_transform_with_instance( 107 | crop_size, image_size, boxes, crop_box=self.crop_instance 108 | ) 109 | -------------------------------------------------------------------------------- /tools/visualize_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import numpy as np 4 | import os 5 | from itertools import chain 6 | import cv2 7 | import tqdm 8 | from PIL import Image 9 | 10 | from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_train_loader 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data.build import filter_images_with_few_keypoints 13 | from detectron2.utils.logger import setup_logger 14 | from detectron2.utils.visualizer import Visualizer 15 | 16 | from adet.config import get_cfg 17 | from adet.data.dataset_mapper import DatasetMapperWithBasis 18 | 19 | 20 | def setup(args): 21 | cfg = get_cfg() 22 | if args.config_file: 23 | cfg.merge_from_file(args.config_file) 24 | cfg.merge_from_list(args.opts) 25 | cfg.freeze() 26 | return cfg 27 | 28 | 29 | def parse_args(in_args=None): 30 | parser = argparse.ArgumentParser(description="Visualize ground-truth data") 31 | parser.add_argument( 32 | "--source", 33 | choices=["annotation", "dataloader"], 34 | required=True, 35 | help="visualize the annotations or the data loader (with pre-processing)", 36 | ) 37 | parser.add_argument("--config-file", metavar="FILE", help="path to config file") 38 | parser.add_argument("--output-dir", default="./", help="path to output directory") 39 | parser.add_argument("--show", action="store_true", help="show output in a window") 40 | parser.add_argument( 41 | "--opts", 42 | help="Modify config options using the command-line", 43 | default=[], 44 | nargs=argparse.REMAINDER, 45 | ) 46 | return parser.parse_args(in_args) 47 | 48 | 49 | if __name__ == "__main__": 50 | args = parse_args() 51 | logger = setup_logger() 52 | logger.info("Arguments: " + str(args)) 53 | cfg = setup(args) 54 | 55 | dirname = args.output_dir 56 | os.makedirs(dirname, exist_ok=True) 57 | metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]) 58 | 59 | def output(vis, fname): 60 | if args.show: 61 | print(fname) 62 | cv2.imshow("window", vis.get_image()[:, :, ::-1]) 63 | cv2.waitKey() 64 | else: 65 | filepath = os.path.join(dirname, fname) 66 | print("Saving to {} ...".format(filepath)) 67 | vis.save(filepath) 68 | 69 | scale = 2.0 if args.show else 1.0 70 | if args.source == "dataloader": 71 | mapper = DatasetMapperWithBasis(cfg, True) 72 | train_data_loader = build_detection_train_loader(cfg, mapper) 73 | for batch in train_data_loader: 74 | for per_image in batch: 75 | # Pytorch tensor is in (C, H, W) format 76 | img = per_image["image"].permute(1, 2, 0) 77 | if cfg.INPUT.FORMAT == "BGR": 78 | img = img[:, :, [2, 1, 0]] 79 | else: 80 | img = np.asarray(Image.fromarray(img, mode=cfg.INPUT.FORMAT).convert("RGB")) 81 | 82 | visualizer = Visualizer(img, metadata=metadata, scale=scale) 83 | target_fields = 
per_image["instances"].get_fields() 84 | labels = [metadata.thing_classes[i] for i in target_fields["gt_classes"]] 85 | vis = visualizer.overlay_instances( 86 | labels=labels, 87 | boxes=target_fields.get("gt_boxes", None), 88 | masks=target_fields.get("gt_masks", None), 89 | keypoints=target_fields.get("gt_keypoints", None), 90 | ) 91 | output(vis, str(per_image["image_id"]) + ".jpg") 92 | else: 93 | dicts = list(chain.from_iterable([DatasetCatalog.get(k) for k in cfg.DATASETS.TRAIN])) 94 | if cfg.MODEL.KEYPOINT_ON: 95 | dicts = filter_images_with_few_keypoints(dicts, 1) 96 | for dic in tqdm.tqdm(dicts): 97 | img = utils.read_image(dic["file_name"], "RGB") 98 | visualizer = Visualizer(img, metadata=metadata, scale=scale) 99 | vis = visualizer.draw_dataset_dict(dic) 100 | output(vis, os.path.basename(dic["file_name"])) -------------------------------------------------------------------------------- /adet/modeling/ops/test.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | 8 | from __future__ import absolute_import 9 | from __future__ import print_function 10 | from __future__ import division 11 | 12 | import time 13 | import torch 14 | import torch.nn as nn 15 | from torch.autograd import gradcheck 16 | 17 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 18 | 19 | 20 | N, M, D = 1, 2, 2 21 | Lq, L, P = 2, 2, 2 22 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 23 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 24 | S = sum([(H*W).item() for H, W in shapes]) 25 | 26 | 27 | torch.manual_seed(3) 28 | 29 | 30 | @torch.no_grad() 31 | def check_forward_equal_with_pytorch_double(): 32 | value = torch.rand(N, S, M, D).cuda() * 0.01 33 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 34 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 35 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 36 | im2col_step = 2 37 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 38 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 39 | fwdok = torch.allclose(output_cuda, output_pytorch) 40 | max_abs_err = (output_cuda - output_pytorch).abs().max() 41 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 42 | 43 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 44 | 45 | 46 | @torch.no_grad() 47 | def check_forward_equal_with_pytorch_float(): 48 | value = torch.rand(N, S, M, D).cuda() * 0.01 49 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 50 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 51 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 52 | im2col_step = 2 53 | output_pytorch = 
ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 54 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 55 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 56 | max_abs_err = (output_cuda - output_pytorch).abs().max() 57 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 58 | 59 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 60 | 61 | 62 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 63 | 64 | value = torch.rand(N, S, M, channels).cuda() * 0.01 65 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 66 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 67 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 68 | im2col_step = 2 69 | func = MSDeformAttnFunction.apply 70 | 71 | value.requires_grad = grad_value 72 | sampling_locations.requires_grad = grad_sampling_loc 73 | attention_weights.requires_grad = grad_attn_weight 74 | 75 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 76 | 77 | print(f'* {gradok} check_gradient_numerical(D={channels})') 78 | 79 | 80 | if __name__ == '__main__': 81 | check_forward_equal_with_pytorch_double() 82 | check_forward_equal_with_pytorch_float() 83 | 84 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 85 | check_gradient_numerical(channels, True, True, True) 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /adet/layers/deform_conv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from detectron2.layers import Conv2d 5 | 6 | 7 | class _NewEmptyTensorOp(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, x, new_shape): 10 | ctx.shape = x.shape 11 | return x.new_empty(new_shape) 12 | 13 | @staticmethod 14 | def backward(ctx, grad): 15 | shape = ctx.shape 16 | return _NewEmptyTensorOp.apply(grad, shape), None 17 | 18 | 19 | class DFConv2d(nn.Module): 20 | """ 21 | Deformable convolutional layer with configurable 22 | deformable groups, dilations and groups. 
23 | 24 | Code is from: 25 | https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/layers/misc.py 26 | 27 | 28 | """ 29 | def __init__( 30 | self, 31 | in_channels, 32 | out_channels, 33 | with_modulated_dcn=True, 34 | kernel_size=3, 35 | stride=1, 36 | groups=1, 37 | dilation=1, 38 | deformable_groups=1, 39 | bias=False, 40 | padding=None 41 | ): 42 | super(DFConv2d, self).__init__() 43 | if isinstance(kernel_size, (list, tuple)): 44 | assert isinstance(stride, (list, tuple)) 45 | assert isinstance(dilation, (list, tuple)) 46 | assert len(kernel_size) == 2 47 | assert len(stride) == 2 48 | assert len(dilation) == 2 49 | padding = ( 50 | dilation[0] * (kernel_size[0] - 1) // 2, 51 | dilation[1] * (kernel_size[1] - 1) // 2 52 | ) 53 | offset_base_channels = kernel_size[0] * kernel_size[1] 54 | else: 55 | padding = dilation * (kernel_size - 1) // 2 56 | offset_base_channels = kernel_size * kernel_size 57 | if with_modulated_dcn: 58 | from detectron2.layers.deform_conv import ModulatedDeformConv 59 | offset_channels = offset_base_channels * 3 # default: 27 60 | conv_block = ModulatedDeformConv 61 | else: 62 | from detectron2.layers.deform_conv import DeformConv 63 | offset_channels = offset_base_channels * 2 # default: 18 64 | conv_block = DeformConv 65 | self.offset = Conv2d( 66 | in_channels, 67 | deformable_groups * offset_channels, 68 | kernel_size=kernel_size, 69 | stride=stride, 70 | padding=padding, 71 | groups=1, 72 | dilation=dilation 73 | ) 74 | for l in [self.offset, ]: 75 | nn.init.kaiming_uniform_(l.weight, a=1) 76 | torch.nn.init.constant_(l.bias, 0.) 77 | self.conv = conv_block( 78 | in_channels, 79 | out_channels, 80 | kernel_size=kernel_size, 81 | stride=stride, 82 | padding=padding, 83 | dilation=dilation, 84 | groups=groups, 85 | deformable_groups=deformable_groups, 86 | bias=bias 87 | ) 88 | self.with_modulated_dcn = with_modulated_dcn 89 | self.kernel_size = kernel_size 90 | self.stride = stride 91 | self.padding = padding 92 | self.dilation = dilation 93 | self.offset_split = offset_base_channels * deformable_groups * 2 94 | 95 | def forward(self, x, return_offset=False): 96 | if x.numel() > 0: 97 | if not self.with_modulated_dcn: 98 | offset_mask = self.offset(x) 99 | x = self.conv(x, offset_mask) 100 | else: 101 | offset_mask = self.offset(x) 102 | offset = offset_mask[:, :self.offset_split, :, :] 103 | mask = offset_mask[:, self.offset_split:, :, :].sigmoid() 104 | x = self.conv(x, offset, mask) 105 | if return_offset: 106 | return x, offset_mask 107 | return x 108 | # get output shape 109 | output_shape = [ 110 | (i + 2 * p - (di * (k - 1) + 1)) // d + 1 111 | for i, p, di, k, d in zip( 112 | x.shape[-2:], 113 | self.padding, 114 | self.dilation, 115 | self.kernel_size, 116 | self.stride 117 | ) 118 | ] 119 | output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape 120 | return _NewEmptyTensorOp.apply(x, output_shape) 121 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import glob 4 | import os 5 | from setuptools import find_packages, setup 6 | import torch 7 | from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension 8 | 9 | torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] 10 | assert torch_ver >= [1, 3], "Requires PyTorch >= 1.3" 11 | 12 | 13 | def get_adet_extensions(): 14 | this_dir = os.path.dirname(os.path.abspath(__file__)) 
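# Illustrative aside, not part of the original file: minimal use of the DFConv2d layer from
# adet/layers/deform_conv.py (listed above). detectron2's (Modulated)DeformConv kernels are
# GPU-only, so this sketch assumes a CUDA build of detectron2.
import torch
from adet.layers.deform_conv import DFConv2d

dcn = DFConv2d(64, 128, with_modulated_dcn=True, kernel_size=3, stride=1).cuda()
x = torch.randn(2, 64, 32, 32, device="cuda")
y = dcn(x)      # the offset (and mask) maps are predicted by the internal offset conv
print(y.shape)  # torch.Size([2, 128, 32, 32])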
15 | extensions_dir = os.path.join(this_dir, "adet", "layers", "csrc") 16 | 17 | main_source = os.path.join(extensions_dir, "vision.cpp") 18 | sources = glob.glob(os.path.join(extensions_dir, "**", "*.cpp")) 19 | source_cuda = glob.glob(os.path.join(extensions_dir, "**", "*.cu")) + glob.glob( 20 | os.path.join(extensions_dir, "*.cu") 21 | ) 22 | 23 | sources = [main_source] + sources 24 | 25 | extension = CppExtension 26 | 27 | extra_compile_args = {"cxx": []} 28 | define_macros = [] 29 | 30 | if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": 31 | extension = CUDAExtension 32 | sources += source_cuda 33 | define_macros += [("WITH_CUDA", None)] 34 | extra_compile_args["nvcc"] = [ 35 | "-DCUDA_HAS_FP16=1", 36 | "-D__CUDA_NO_HALF_OPERATORS__", 37 | "-D__CUDA_NO_HALF_CONVERSIONS__", 38 | "-D__CUDA_NO_HALF2_OPERATORS__", 39 | ] 40 | 41 | # It's better if pytorch can do this by default .. 42 | CC = os.environ.get("CC", None) 43 | if CC is not None: 44 | extra_compile_args["nvcc"].append("-ccbin={}".format(CC)) 45 | 46 | sources = [os.path.join(extensions_dir, s) for s in sources] 47 | include_dirs = [extensions_dir] 48 | ext_modules = [ 49 | extension( 50 | "adet._C", 51 | sources, 52 | include_dirs=include_dirs, 53 | define_macros=define_macros, 54 | extra_compile_args=extra_compile_args, 55 | ) 56 | ] 57 | 58 | return ext_modules 59 | 60 | 61 | def get_deformable_extensions(): 62 | this_dir = os.path.dirname(os.path.abspath(__file__)) 63 | extensions_dir = os.path.join(this_dir, "adet", "modeling", "ops", "src") 64 | 65 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 66 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 67 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 68 | 69 | sources = main_file + source_cpu 70 | extension = CppExtension 71 | extra_compile_args = {"cxx": []} 72 | define_macros = [] 73 | 74 | if torch.cuda.is_available() and CUDA_HOME is not None: 75 | extension = CUDAExtension 76 | sources += source_cuda 77 | define_macros += [("WITH_CUDA", None)] 78 | extra_compile_args["nvcc"] = [ 79 | "-DCUDA_HAS_FP16=1", 80 | "-D__CUDA_NO_HALF_OPERATORS__", 81 | "-D__CUDA_NO_HALF_CONVERSIONS__", 82 | "-D__CUDA_NO_HALF2_OPERATORS__", 83 | ] 84 | else: 85 | raise NotImplementedError('CUDA is not available') 86 | 87 | sources = [os.path.join(extensions_dir, s) for s in sources] 88 | include_dirs = [extensions_dir] 89 | ext_modules = [ 90 | extension( 91 | "MultiScaleDeformableAttention", 92 | sources, 93 | include_dirs=include_dirs, 94 | define_macros=define_macros, 95 | extra_compile_args=extra_compile_args, 96 | ) 97 | ] 98 | return ext_modules 99 | 100 | 101 | setup( 102 | name="AdelaiDet", 103 | version="0.2.0", 104 | author="Adelaide Intelligent Machines", 105 | url="https://github.com/stanstarks/AdelaiDet", 106 | description="AdelaiDet is AIM's research " 107 | "platform for instance-level detection tasks based on Detectron2.", 108 | packages=find_packages(exclude=("configs", "tests")), 109 | python_requires=">=3.6", 110 | install_requires=[ 111 | "setuptools==59.5.0", 112 | "termcolor>=1.1", 113 | "Pillow>=6.0", 114 | "yacs>=0.1.6", 115 | "tabulate", 116 | "cloudpickle", 117 | "matplotlib", 118 | "tqdm>4.29.0", 119 | "tensorboard", 120 | "python-Levenshtein", 121 | "Polygon3", 122 | "shapely", 123 | "kornia==0.6.8", 124 | "opencv-python", 125 | "timm" 126 | ], 127 | extras_require={"all": ["psutil"]}, 128 | ext_modules=get_adet_extensions() + get_deformable_extensions(), 129 |
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 130 | ) 131 | 132 | -------------------------------------------------------------------------------- /adet/modeling/backbone/lpf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.parallel 3 | import numpy as np 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class Downsample(nn.Module): 9 | def __init__(self, pad_type='reflect', filt_size=3, stride=2, channels=None, pad_off=0): 10 | super(Downsample, self).__init__() 11 | self.filt_size = filt_size 12 | self.pad_off = pad_off 13 | self.pad_sizes = [int(1.*(filt_size-1)/2), int(np.ceil(1.*(filt_size-1)/2)), int(1.*(filt_size-1)/2), int(np.ceil(1.*(filt_size-1)/2))] 14 | self.pad_sizes = [pad_size+pad_off for pad_size in self.pad_sizes] 15 | self.stride = stride 16 | self.off = int((self.stride-1)/2.) 17 | self.channels = channels 18 | 19 | # print('Filter size [%i]'%filt_size) 20 | if(self.filt_size==1): 21 | a = np.array([1.,]) 22 | elif(self.filt_size==2): 23 | a = np.array([1., 1.]) 24 | elif(self.filt_size==3): 25 | a = np.array([1., 2., 1.]) 26 | elif(self.filt_size==4): 27 | a = np.array([1., 3., 3., 1.]) 28 | elif(self.filt_size==5): 29 | a = np.array([1., 4., 6., 4., 1.]) 30 | elif(self.filt_size==6): 31 | a = np.array([1., 5., 10., 10., 5., 1.]) 32 | elif(self.filt_size==7): 33 | a = np.array([1., 6., 15., 20., 15., 6., 1.]) 34 | 35 | filt = torch.Tensor(a[:,None]*a[None,:]) 36 | filt = filt/torch.sum(filt) 37 | self.register_buffer('filt', filt[None,None,:,:].repeat((self.channels,1,1,1))) 38 | 39 | self.pad = get_pad_layer(pad_type)(self.pad_sizes) 40 | 41 | def forward(self, inp): 42 | if(self.filt_size==1): 43 | if(self.pad_off==0): 44 | return inp[:,:,::self.stride,::self.stride] 45 | else: 46 | return self.pad(inp)[:,:,::self.stride,::self.stride] 47 | else: 48 | return F.conv2d(self.pad(inp), self.filt, stride=self.stride, groups=inp.shape[1]) 49 | 50 | def get_pad_layer(pad_type): 51 | if(pad_type in ['refl','reflect']): 52 | PadLayer = nn.ReflectionPad2d 53 | elif(pad_type in ['repl','replicate']): 54 | PadLayer = nn.ReplicationPad2d 55 | elif(pad_type=='zero'): 56 | PadLayer = nn.ZeroPad2d 57 | else: 58 | print('Pad type [%s] not recognized'%pad_type) 59 | return PadLayer 60 | 61 | 62 | class Downsample1D(nn.Module): 63 | def __init__(self, pad_type='reflect', filt_size=3, stride=2, channels=None, pad_off=0): 64 | super(Downsample1D, self).__init__() 65 | self.filt_size = filt_size 66 | self.pad_off = pad_off 67 | self.pad_sizes = [int(1. * (filt_size - 1) / 2), int(np.ceil(1. * (filt_size - 1) / 2))] 68 | self.pad_sizes = [pad_size + pad_off for pad_size in self.pad_sizes] 69 | self.stride = stride 70 | self.off = int((self.stride - 1) / 2.) 
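        # The kernel built below is a length-`filt_size` row of Pascal's triangle,
        # normalized to sum to 1, i.e. a binomial low-pass (blur) filter applied
        # before the strided subsampling in forward().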
71 | self.channels = channels 72 | 73 | # print('Filter size [%i]' % filt_size) 74 | if(self.filt_size == 1): 75 | a = np.array([1., ]) 76 | elif(self.filt_size == 2): 77 | a = np.array([1., 1.]) 78 | elif(self.filt_size == 3): 79 | a = np.array([1., 2., 1.]) 80 | elif(self.filt_size == 4): 81 | a = np.array([1., 3., 3., 1.]) 82 | elif(self.filt_size == 5): 83 | a = np.array([1., 4., 6., 4., 1.]) 84 | elif(self.filt_size == 6): 85 | a = np.array([1., 5., 10., 10., 5., 1.]) 86 | elif(self.filt_size == 7): 87 | a = np.array([1., 6., 15., 20., 15., 6., 1.]) 88 | 89 | filt = torch.Tensor(a) 90 | filt = filt / torch.sum(filt) 91 | self.register_buffer('filt', filt[None, None, :].repeat((self.channels, 1, 1))) 92 | 93 | self.pad = get_pad_layer_1d(pad_type)(self.pad_sizes) 94 | 95 | def forward(self, inp): 96 | if(self.filt_size == 1): 97 | if(self.pad_off == 0): 98 | return inp[:, :, ::self.stride] 99 | else: 100 | return self.pad(inp)[:, :, ::self.stride] 101 | else: 102 | return F.conv1d(self.pad(inp), self.filt, stride=self.stride, groups=inp.shape[1]) 103 | 104 | 105 | def get_pad_layer_1d(pad_type): 106 | if(pad_type in ['refl', 'reflect']): 107 | PadLayer = nn.ReflectionPad1d 108 | elif(pad_type in ['repl', 'replicate']): 109 | PadLayer = nn.ReplicationPad1d 110 | elif(pad_type == 'zero'): 111 | PadLayer = nn.ZeroPad1d 112 | else: 113 | print('Pad type [%s] not recognized' % pad_type) 114 | return PadLayer 115 | -------------------------------------------------------------------------------- /tools/visualize_feat.py: -------------------------------------------------------------------------------- 1 | from detectron2.utils.logger import setup_logger 2 | setup_logger() 3 | 4 | # import some common libraries 5 | import os, cv2 6 | import matplotlib.pyplot as plt 7 | 8 | # import some common detectron2 utilities 9 | from detectron2.engine import DefaultPredictor 10 | 11 | from adet.config import get_cfg 12 | 13 | YML_PATH = '' 14 | WEIGHTS = '' 15 | OUTPATH = '' 16 | if not os.path.exists(OUTPATH): 17 | os.makedirs(OUTPATH) 18 | 19 | 20 | def setup(yml_path, weights): 21 | cfg = get_cfg() 22 | cfg.merge_from_file(yml_path) 23 | cfg.MODEL.WEIGHTS = weights 24 | cfg.MODEL.OSFormer.UPDATE_THR = 0.5 25 | cfg.freeze() 26 | predictor = DefaultPredictor(cfg) 27 | return cfg, predictor 28 | 29 | 30 | def vis_features(feat): 31 | feat = feat.squeeze(0) 32 | return feat.square().sum(0) 33 | 34 | 35 | def visualize(im_path, cfg, predictor, out_path): 36 | im_name = os.path.basename(im_path).split('.')[0] 37 | 38 | model = predictor.model 39 | 40 | im = cv2.imread(im_path) 41 | im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) 42 | 43 | conv_features = [] 44 | trans_features = [] 45 | camin_features = [] 46 | mask_features = [] 47 | 48 | hooks = [ 49 | # backbone feature 50 | model.backbone.res2.register_forward_hook( 51 | lambda self, input, output: conv_features.append(output) 52 | ), 53 | model.backbone.res3.register_forward_hook( 54 | lambda self, input, output: conv_features.append(output) 55 | ), 56 | model.backbone.res4.register_forward_hook( 57 | lambda self, input, output: conv_features.append(output) 58 | ), 59 | model.backbone.res5.register_forward_hook( 60 | lambda self, input, output: conv_features.append(output) 61 | ), 62 | 63 | # trans feature 64 | model.cate_head.trans_encoder.encoder.layers[5].register_forward_hook( 65 | lambda self, input, output: trans_features.append(output) 66 | ), 67 | 68 | # mask head feature 69 | model.mask_head.register_forward_hook( 70 | lambda self, input, output: 
mask_features.append(output) 71 | ), 72 | 73 | # camin feature 74 | model.dcin.register_forward_hook( 75 | lambda self, input, output: camin_features.append(output) 76 | ) 77 | ] 78 | 79 | outputs = predictor(im) 80 | 81 | for hook in hooks: 82 | hook.remove() 83 | 84 | # save res feats, res2-res5 85 | spatial_shapes = [] 86 | spatial_sizes = [] 87 | for idx, elem in enumerate(conv_features): 88 | cur_feat = vis_features(elem).cpu().numpy() 89 | spatial_shapes.append(tuple(cur_feat.shape)) 90 | spatial_sizes.append(cur_feat.shape[0] * cur_feat.shape[1]) 91 | plt.axis('off') 92 | plt.imshow(cur_feat) 93 | plt.savefig(os.path.join(out_path, 'vis_res{}_{}.pdf'.format(idx + 2, im_name)), bbox_inches='tight', pad_inches=0.0) 94 | print(os.path.join(out_path, 'vis_res{}_{}.pdf'.format(idx + 2, im_name))) 95 | 96 | # save trans feats, trans3-trans5 97 | for idx, elem, (x, y) in zip(range(len(spatial_shapes) - 1), trans_features[0].split(spatial_sizes[1:], 1), spatial_shapes[1:]): 98 | feat = vis_features(elem.permute(0, 2, 1).view(1, -1, x, y)).cpu().numpy() 99 | plt.axis('off') 100 | plt.imshow(feat) 101 | plt.savefig(os.path.join(out_path, 'vis_trans{}_{}.pdf'.format(idx + 3, im_name)), bbox_inches='tight', pad_inches=0.0) 102 | print(os.path.join(out_path, 'vis_trans{}_{}.pdf'.format(idx + 3, im_name))) 103 | 104 | # save camin output features 105 | camin_feats = camin_features[0].squeeze(0).cpu().numpy() 106 | for i in range(camin_feats.shape[0]): 107 | feat = camin_feats[i] 108 | plt.cla() # ref https://stackoverflow.com/questions/8213522/when-to-use-cla-clf-or-close-for-clearing-a-plot-in-matplotlib 109 | plt.axis('off') 110 | plt.imshow(feat) 111 | plt.savefig(os.path.join(out_path, 'vis_dcin{}_{}.pdf'.format(i, im_name)), bbox_inches='tight', pad_inches=0.0) 112 | print(os.path.join(out_path, 'vis_dcin{}_{}.pdf'.format(i, im_name))) 113 | 114 | # save mask features 115 | mask_feats = vis_features(mask_features[0][0]).cpu().numpy() 116 | plt.imshow(mask_feats) 117 | plt.savefig(os.path.join(out_path, 'vis_maskhead_{}.pdf'.format(im_name)), bbox_inches='tight', pad_inches=0.0) 118 | print(os.path.join(out_path, 'vis_maskhead_{}.pdf'.format(im_name))) 119 | 120 | # save rea edges 121 | for i in range(len(mask_features[0][1])): 122 | feat = mask_features[0][1][i].squeeze().cpu().numpy() 123 | plt.axis('off') 124 | plt.imshow(feat) 125 | plt.savefig(os.path.join(out_path, 'vis_rea_edge{}_{}.pdf'.format(i, im_name)), bbox_inches='tight', pad_inches=0.0) 126 | print(os.path.join(out_path, 'vis_rea_edge{}_{}.pdf'.format(i, im_name))) 127 | -------------------------------------------------------------------------------- /adet/modeling/backbone/resnet_interval.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import FrozenBatchNorm2d 2 | from detectron2.modeling.backbone import BACKBONE_REGISTRY 3 | from detectron2.modeling.backbone.resnet import ( 4 | BasicStem, 5 | DeformBottleneckBlock, 6 | BottleneckBlock, 7 | ResNet, 8 | ) 9 | 10 | 11 | def make_stage_intervals(block_class, num_blocks, first_stride, **kwargs): 12 | """ 13 | Create a resnet stage by creating many blocks. 14 | Args: 15 | block_class (class): a subclass of ResNetBlockBase 16 | num_blocks (int): 17 | first_stride (int): the stride of the first block. The other blocks will have stride=1. 18 | A `stride` argument will be passed to the block constructor. 19 | kwargs: other arguments passed to the block constructor. 
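        deform_interval (int, optional): if provided via kwargs, every
            `deform_interval`-th block (indices 0, deform_interval, 2*deform_interval, ...)
            is built with `block_class`, and the remaining blocks fall back to a
            plain BottleneckBlock.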
20 | 21 | Returns: 22 | list[nn.Module]: a list of block module. 23 | """ 24 | blocks = [] 25 | conv_kwargs = {key: kwargs[key] for key in kwargs if "deform" not in key} 26 | deform_kwargs = {key: kwargs[key] for key in kwargs if key != "deform_interval"} 27 | deform_interval = kwargs.get("deform_interval", None) 28 | for i in range(num_blocks): 29 | if deform_interval and i % deform_interval == 0: 30 | blocks.append(block_class(stride=first_stride if i == 0 else 1, **deform_kwargs)) 31 | else: 32 | blocks.append(BottleneckBlock(stride=first_stride if i == 0 else 1, **conv_kwargs)) 33 | conv_kwargs["in_channels"] = conv_kwargs["out_channels"] 34 | deform_kwargs["in_channels"] = deform_kwargs["out_channels"] 35 | return blocks 36 | 37 | 38 | @BACKBONE_REGISTRY.register() 39 | def build_resnet_interval_backbone(cfg, input_shape): 40 | """ 41 | Create a ResNet instance from config. 42 | 43 | Returns: 44 | ResNet: a :class:`ResNet` instance. 45 | """ 46 | # need registration of new blocks/stems? 47 | norm = cfg.MODEL.RESNETS.NORM 48 | stem = BasicStem( 49 | in_channels=input_shape.channels, 50 | out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, 51 | norm=norm, 52 | ) 53 | freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT 54 | 55 | if freeze_at >= 1: 56 | for p in stem.parameters(): 57 | p.requires_grad = False 58 | stem = FrozenBatchNorm2d.convert_frozen_batchnorm(stem) 59 | 60 | # fmt: off 61 | out_features = cfg.MODEL.RESNETS.OUT_FEATURES 62 | depth = cfg.MODEL.RESNETS.DEPTH 63 | num_groups = cfg.MODEL.RESNETS.NUM_GROUPS 64 | width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP 65 | bottleneck_channels = num_groups * width_per_group 66 | in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS 67 | out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS 68 | stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 69 | res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION 70 | deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE 71 | deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED 72 | deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS 73 | deform_interval = cfg.MODEL.RESNETS.DEFORM_INTERVAL 74 | # fmt: on 75 | assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation) 76 | 77 | num_blocks_per_stage = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}[depth] 78 | 79 | stages = [] 80 | 81 | # Avoid creating variables without gradients 82 | # It consumes extra memory and may cause allreduce to fail 83 | out_stage_idx = [{"res2": 2, "res3": 3, "res4": 4, "res5": 5}[f] for f in out_features] 84 | max_stage_idx = max(out_stage_idx) 85 | for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)): 86 | dilation = res5_dilation if stage_idx == 5 else 1 87 | first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2 88 | stage_kargs = { 89 | "num_blocks": num_blocks_per_stage[idx], 90 | "first_stride": first_stride, 91 | "in_channels": in_channels, 92 | "bottleneck_channels": bottleneck_channels, 93 | "out_channels": out_channels, 94 | "num_groups": num_groups, 95 | "norm": norm, 96 | "stride_in_1x1": stride_in_1x1, 97 | "dilation": dilation, 98 | } 99 | if deform_on_per_stage[idx]: 100 | stage_kargs["block_class"] = DeformBottleneckBlock 101 | stage_kargs["deform_modulated"] = deform_modulated 102 | stage_kargs["deform_num_groups"] = deform_num_groups 103 | stage_kargs["deform_interval"] = deform_interval 104 | else: 105 | stage_kargs["block_class"] = BottleneckBlock 106 | blocks = make_stage_intervals(**stage_kargs) 107 | in_channels = out_channels 108 | out_channels 
*= 2 109 | bottleneck_channels *= 2 110 | 111 | if freeze_at >= stage_idx: 112 | for block in blocks: 113 | block.freeze() 114 | stages.append(blocks) 115 | return ResNet(stem, stages, out_features=out_features) 116 | -------------------------------------------------------------------------------- /adet/modeling/osformer/trans_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn.init import xavier_uniform_, constant_, normal_ 4 | 5 | from adet.modeling.ops.modules.ms_deform_attn import MSDeformAttn 6 | from .trans_utils import _get_clones, get_reference_points, with_pos_embed 7 | from .feed_forward import get_ffn 8 | 9 | 10 | class CISTransformerEncoder(nn.Module): 11 | def __init__(self, d_model=256, nhead=8, 12 | num_encoder_layers=6, dim_feedforward=1024, dropout=0.1, 13 | ffn_type="default", num_feature_levels=4, enc_n_points=4): 14 | super().__init__() 15 | 16 | self.d_model = d_model 17 | self.nhead = nhead 18 | 19 | encoder_layer = TransformerEncoderLayer(d_model, dim_feedforward, 20 | dropout, ffn_type, 21 | num_feature_levels, nhead, enc_n_points) 22 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers) 23 | self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) 24 | self.reference_points = nn.Linear(d_model, 2) 25 | 26 | self._reset_parameters() 27 | 28 | def _reset_parameters(self): 29 | for p in self.parameters(): 30 | if p.dim() > 1: 31 | nn.init.xavier_uniform_(p) 32 | for m in self.modules(): 33 | if isinstance(m, MSDeformAttn): 34 | m._reset_parameters() 35 | xavier_uniform_(self.reference_points.weight.data, gain=1.0) 36 | constant_(self.reference_points.bias.data, 0.) 37 | normal_(self.level_embed) 38 | 39 | def forward(self, srcs, pos_embeds): 40 | # prepare input for encoder 41 | src_flatten = [] 42 | lvl_pos_embed_flatten = [] 43 | spatial_shapes = [] 44 | for lvl, (src, pos_embed) in enumerate(zip(srcs, pos_embeds)): 45 | bs, c, h, w = src.shape 46 | spatial_shape = (h, w) 47 | spatial_shapes.append(spatial_shape) 48 | src = src.flatten(2).transpose(1, 2) 49 | pos_embed = pos_embed.flatten(2).transpose(1, 2) 50 | lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) 51 | lvl_pos_embed_flatten.append(lvl_pos_embed) 52 | src_flatten.append(src) 53 | src_flatten = torch.cat(src_flatten, 1) 54 | lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) 55 | spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) 56 | level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) 57 | 58 | # encoder 59 | memory = self.encoder(src_flatten, spatial_shapes, level_start_index, lvl_pos_embed_flatten) 60 | 61 | return memory, level_start_index 62 | 63 | 64 | class TransformerEncoderLayer(nn.Module): 65 | def __init__(self, 66 | d_model=256, d_ffn=1024, 67 | dropout=0.1, ffn_type="default", 68 | n_levels=4, n_heads=8, n_points=4): 69 | super().__init__() 70 | 71 | # self attention 72 | self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) 73 | self.dropout1 = nn.Dropout(dropout) 74 | self.norm1 = nn.LayerNorm(d_model) 75 | 76 | # ffn 77 | self.ffn = get_ffn(d_model, ffn_type) 78 | 79 | def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None): 80 | # self attention 81 | src2 = self.self_attn(with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask) 82 | src = 
src + self.dropout1(src2) 83 | src = self.norm1(src) # (bs, w*h, dim) 84 | 85 | # ffn 86 | src = self.ffn(src, spatial_shapes, level_start_index) 87 | 88 | return src 89 | 90 | 91 | class TransformerEncoder(nn.Module): 92 | def __init__(self, encoder_layer, num_layers): 93 | super().__init__() 94 | self.layers = _get_clones(encoder_layer, num_layers) 95 | self.num_layers = num_layers 96 | 97 | def forward(self, src, spatial_shapes, level_start_index, pos=None): 98 | output = src 99 | batch_size = src.shape[0] 100 | reference_points = get_reference_points(spatial_shapes, batch_size, device=src.device) 101 | for _, layer in enumerate(self.layers): 102 | output = layer(output, pos, reference_points, spatial_shapes, level_start_index) 103 | 104 | return output 105 | 106 | 107 | def build_transformer_encoder(cfg): 108 | return CISTransformerEncoder( 109 | d_model=cfg.MODEL.OSFormer.HIDDEN_DIM, 110 | nhead=cfg.MODEL.OSFormer.NHEAD, 111 | num_encoder_layers=cfg.MODEL.OSFormer.ENC_LAYERS, 112 | dim_feedforward=cfg.MODEL.OSFormer.DIM_FEEDFORWARD, 113 | dropout=0.1, 114 | ffn_type=cfg.MODEL.OSFormer.FFN, 115 | num_feature_levels=len(cfg.MODEL.OSFormer.FEAT_INSTANCE_STRIDES), 116 | enc_n_points=cfg.MODEL.OSFormer.ENC_POINTS) 117 | 118 | 119 | -------------------------------------------------------------------------------- /adet/modeling/osformer/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from fvcore.nn import sigmoid_focal_loss_jit 5 | 6 | 7 | def dice_loss(input, target): 8 | input = input.contiguous().view(input.size()[0], -1) 9 | target = target.contiguous().view(target.size()[0], -1).float() 10 | 11 | a = torch.sum(input * target, 1) 12 | b = torch.sum(input * input, 1) + 0.001 13 | c = torch.sum(target * target, 1) + 0.001 14 | d = (2 * a) / (b + c) 15 | return 1 - d 16 | 17 | def giou_loss(input, target): 18 | input = input.contiguous().view(input.size()[0], -1) 19 | target = target.contiguous().view(target.size()[0], -1).float() 20 | 21 | inters = torch.sum(input * target, 1) 22 | b = torch.sum(input * input, 1) + 0.001 23 | c = torch.sum(target * target, 1) + 0.001 24 | uni = b + c 25 | 26 | # ious 27 | ious = inters / uni 28 | loss = 1 - ious 29 | 30 | return loss 31 | 32 | 33 | def reduce_loss(loss, reduction): 34 | """Reduce loss as specified. 35 | Args: 36 | loss (Tensor): Elementwise loss tensor. 37 | reduction (str): Options are "none", "mean" and "sum". 38 | Return: 39 | Tensor: Reduced loss tensor. 40 | """ 41 | reduction_enum = F._Reduction.get_enum(reduction) 42 | # none: 0, elementwise_mean:1, sum: 2 43 | if reduction_enum == 0: 44 | return loss 45 | elif reduction_enum == 1: 46 | return loss.mean() 47 | elif reduction_enum == 2: 48 | return loss.sum() 49 | 50 | 51 | def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): 52 | """Apply element-wise weight and reduce loss. 53 | Args: 54 | loss (Tensor): Element-wise loss. 55 | weight (Tensor): Element-wise weights. 56 | reduction (str): Same as built-in losses of PyTorch. 57 | avg_factor (float): Avarage factor when computing the mean of losses. 58 | Returns: 59 | Tensor: Processed loss values. 
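        Note: `avg_factor` only takes effect together with reduction='mean'
        (the weighted loss is summed and divided by `avg_factor`); combining it
        with reduction='sum' raises a ValueError.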
60 | """ 61 | # if weight is specified, apply element-wise weight 62 | if weight is not None: 63 | loss = loss * weight 64 | 65 | # if avg_factor is not specified, just reduce the loss 66 | if avg_factor is None: 67 | loss = reduce_loss(loss, reduction) 68 | else: 69 | # if reduction is mean, then average the loss by avg_factor 70 | if reduction == 'mean': 71 | loss = loss.sum() / avg_factor 72 | # if reduction is 'none', then do nothing, otherwise raise an error 73 | elif reduction != 'none': 74 | raise ValueError('avg_factor can not be used with reduction="sum"') 75 | return loss 76 | 77 | 78 | def sigmoid_focal_loss(pred, 79 | target, 80 | weight=None, 81 | gamma=2.0, 82 | alpha=0.25, 83 | reduction='mean', 84 | avg_factor=None): 85 | # Function.apply does not accept keyword arguments, so the decorator 86 | # "weighted_loss" is not applicable 87 | loss = sigmoid_focal_loss_jit(pred, target, gamma=gamma, alpha=alpha) 88 | if weight is not None: 89 | if weight.shape != loss.shape: 90 | if weight.size(0) == loss.size(0): 91 | # For most cases, weight is of shape (num_priors, ), 92 | # which means it does not have the second axis num_class 93 | weight = weight.view(-1, 1) 94 | else: 95 | # Sometimes, weight per anchor per class is also needed. e.g. 96 | # in FSAF. But it may be flattened of shape 97 | # (num_priors x num_class, ), while loss is still of shape 98 | # (num_priors, num_class). 99 | assert weight.numel() == loss.numel() 100 | weight = weight.view(loss.size(0), -1) 101 | assert weight.ndim == loss.ndim 102 | loss = weight_reduce_loss(loss, weight, reduction, avg_factor) 103 | return loss 104 | 105 | 106 | class FocalLoss(nn.Module): 107 | 108 | def __init__(self, 109 | use_sigmoid=True, 110 | gamma=2.0, 111 | alpha=0.25, 112 | reduction='mean', 113 | loss_weight=1.0): 114 | super(FocalLoss, self).__init__() 115 | assert use_sigmoid is True, 'Only sigmoid focal loss supported now.' 
116 | self.use_sigmoid = use_sigmoid 117 | self.gamma = gamma 118 | self.alpha = alpha 119 | self.reduction = reduction 120 | self.loss_weight = loss_weight 121 | 122 | def forward(self, 123 | pred, 124 | target, 125 | weight=None, 126 | avg_factor=None, 127 | reduction_override=None): 128 | assert reduction_override in (None, 'none', 'mean', 'sum') 129 | reduction = ( 130 | reduction_override if reduction_override else self.reduction) 131 | if self.use_sigmoid: 132 | loss_cls = self.loss_weight * sigmoid_focal_loss( 133 | pred, 134 | target, 135 | weight, 136 | gamma=self.gamma, 137 | alpha=self.alpha, 138 | reduction=reduction, 139 | avg_factor=avg_factor) 140 | else: 141 | raise NotImplementedError 142 | return loss_cls 143 | -------------------------------------------------------------------------------- /adet/layers/csrc/ml_nms/ml_nms.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 10 | 11 | __device__ inline float devIoU(float const * const a, float const * const b) { 12 | if (a[5] != b[5]) { 13 | return 0.0; 14 | } 15 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 16 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 17 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 18 | float interS = width * height; 19 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 20 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 21 | return interS / (Sa + Sb - interS); 22 | } 23 | 24 | __global__ void ml_nms_kernel(const int n_boxes, const float nms_overlap_thresh, 25 | const float *dev_boxes, unsigned long long *dev_mask) { 26 | const int row_start = blockIdx.y; 27 | const int col_start = blockIdx.x; 28 | 29 | // if (row_start > col_start) return; 30 | 31 | const int row_size = 32 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 33 | const int col_size = 34 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 35 | 36 | __shared__ float block_boxes[threadsPerBlock * 6]; 37 | if (threadIdx.x < col_size) { 38 | block_boxes[threadIdx.x * 6 + 0] = 39 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0]; 40 | block_boxes[threadIdx.x * 6 + 1] = 41 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1]; 42 | block_boxes[threadIdx.x * 6 + 2] = 43 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2]; 44 | block_boxes[threadIdx.x * 6 + 3] = 45 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3]; 46 | block_boxes[threadIdx.x * 6 + 4] = 47 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4]; 48 | block_boxes[threadIdx.x * 6 + 5] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5]; 50 | } 51 | __syncthreads(); 52 | 53 | if (threadIdx.x < row_size) { 54 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 55 | const float *cur_box = dev_boxes + cur_box_idx * 6; 56 | int i = 0; 57 | unsigned long long t = 0; 58 | int start = 0; 59 | if (row_start == col_start) { 60 | start = threadIdx.x + 1; 61 | } 62 | for (i = start; i < col_size; i++) { 63 | if (devIoU(cur_box, block_boxes + i * 6) > nms_overlap_thresh) { 64 | t |= 1ULL << i; 65 | } 66 | } 67 | const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); 68 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 69 | } 70 | } 71 | 72 | namespace adet { 73 | 74 | // boxes is a N x 6 tensor 75 | at::Tensor 
ml_nms_cuda(const at::Tensor boxes, const float nms_overlap_thresh) { 76 | using scalar_t = float; 77 | AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); 78 | auto scores = boxes.select(1, 4); 79 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 80 | auto boxes_sorted = boxes.index_select(0, order_t); 81 | 82 | int boxes_num = boxes.size(0); 83 | 84 | const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); 85 | 86 | scalar_t* boxes_dev = boxes_sorted.data(); 87 | 88 | THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState 89 | 90 | unsigned long long* mask_dev = NULL; 91 | //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, 92 | // boxes_num * col_blocks * sizeof(unsigned long long))); 93 | 94 | mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); 95 | 96 | dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), 97 | THCCeilDiv(boxes_num, threadsPerBlock)); 98 | dim3 threads(threadsPerBlock); 99 | ml_nms_kernel<<>>(boxes_num, 100 | nms_overlap_thresh, 101 | boxes_dev, 102 | mask_dev); 103 | 104 | std::vector mask_host(boxes_num * col_blocks); 105 | THCudaCheck(cudaMemcpy(&mask_host[0], 106 | mask_dev, 107 | sizeof(unsigned long long) * boxes_num * col_blocks, 108 | cudaMemcpyDeviceToHost)); 109 | 110 | std::vector remv(col_blocks); 111 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 112 | 113 | at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); 114 | int64_t* keep_out = keep.data(); 115 | 116 | int num_to_keep = 0; 117 | for (int i = 0; i < boxes_num; i++) { 118 | int nblock = i / threadsPerBlock; 119 | int inblock = i % threadsPerBlock; 120 | 121 | if (!(remv[nblock] & (1ULL << inblock))) { 122 | keep_out[num_to_keep++] = i; 123 | unsigned long long *p = &mask_host[0] + i * col_blocks; 124 | for (int j = nblock; j < col_blocks; j++) { 125 | remv[j] |= p[j]; 126 | } 127 | } 128 | } 129 | 130 | THCudaFree(state, mask_dev); 131 | // TODO improve this part 132 | return std::get<0>(order_t.index({ 133 | keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( 134 | order_t.device(), keep.scalar_type()) 135 | }).sort(0, false)); 136 | } 137 | 138 | } // namespace adet -------------------------------------------------------------------------------- /adet/config/defaults.py: -------------------------------------------------------------------------------- 1 | from detectron2.config.defaults import _C 2 | from detectron2.config import CfgNode as CN 3 | 4 | # ---------------------------------------------------------------------------- # 5 | # Additional Configs 6 | # ---------------------------------------------------------------------------- # 7 | _C.MODEL.MOBILENET = False 8 | _C.MODEL.BACKBONE.ANTI_ALIAS = False 9 | _C.MODEL.RESNETS.DEFORM_INTERVAL = 1 10 | _C.INPUT.HFLIP_TRAIN = True 11 | _C.INPUT.CROP.CROP_INSTANCE = True 12 | 13 | # ---------------------------------------------------------------------------- # 14 | # Basis Module Options 15 | # ---------------------------------------------------------------------------- # 16 | _C.MODEL.BASIS_MODULE = CN() 17 | _C.MODEL.BASIS_MODULE.NAME = "ProtoNet" 18 | _C.MODEL.BASIS_MODULE.NUM_BASES = 4 19 | _C.MODEL.BASIS_MODULE.LOSS_ON = False 20 | _C.MODEL.BASIS_MODULE.ANN_SET = "coco" 21 | _C.MODEL.BASIS_MODULE.CONVS_DIM = 128 22 | _C.MODEL.BASIS_MODULE.IN_FEATURES = ["p3", "p4", "p5"] 23 | _C.MODEL.BASIS_MODULE.NORM = "SyncBN" 24 | _C.MODEL.BASIS_MODULE.NUM_CONVS = 
3 25 | _C.MODEL.BASIS_MODULE.COMMON_STRIDE = 8 26 | _C.MODEL.BASIS_MODULE.NUM_CLASSES = 80 27 | _C.MODEL.BASIS_MODULE.LOSS_WEIGHT = 0.3 28 | 29 | # ---------------------------------------------------------------------------- # 30 | # OSFormer Options 31 | # ---------------------------------------------------------------------------- # 32 | _C.MODEL.OSFormer = CN() 33 | 34 | # Instance hyper-parameters 35 | _C.MODEL.OSFormer.INSTANCE_IN_FEATURES = ["p2", "p3", "p4", "p5", "p6"] 36 | _C.MODEL.OSFormer.FEAT_INSTANCE_STRIDES = [8, 8, 16, 32, 32] 37 | _C.MODEL.OSFormer.FEAT_SCALE_RANGES = ((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)) 38 | _C.MODEL.OSFormer.SIGMA = 0.2 39 | # Channel size for the instance head. 40 | _C.MODEL.OSFormer.INSTANCE_IN_CHANNELS = 256 41 | _C.MODEL.OSFormer.INSTANCE_CHANNELS = 256 42 | # Convolutions to use in the instance head. 43 | _C.MODEL.OSFormer.NUM_INSTANCE_CONVS = 4 44 | _C.MODEL.OSFormer.USE_DCN_IN_INSTANCE = False 45 | _C.MODEL.OSFormer.TYPE_DCN = 'DCN' 46 | _C.MODEL.OSFormer.NUM_GRIDS = [40, 36, 24, 16, 12] 47 | # Number of foreground classes. 48 | _C.MODEL.OSFormer.NUM_CLASSES = 80 # COCO 49 | _C.MODEL.OSFormer.NUM_KERNELS = 256 50 | _C.MODEL.OSFormer.NORM = "GN" 51 | _C.MODEL.OSFormer.USE_COORD_CONV = True 52 | _C.MODEL.OSFormer.PRIOR_PROB = 0.01 53 | 54 | # Mask hyper-parameters. 55 | # Channel size for the mask tower. 56 | _C.MODEL.OSFormer.MASK_IN_FEATURES = ["p2", "p3", "p4", "p5"] 57 | _C.MODEL.OSFormer.MASK_IN_CHANNELS = 256 58 | _C.MODEL.OSFormer.MASK_CHANNELS = 128 59 | _C.MODEL.OSFormer.NUM_MASKS = 256 60 | 61 | # Test cfg. 62 | # _C.MODEL.OSFormer.CONFIDENCE_SCORE = 0.25 63 | _C.MODEL.OSFormer.NMS_PRE = 500 64 | _C.MODEL.OSFormer.SCORE_THR = 0.1 65 | _C.MODEL.OSFormer.UPDATE_THR = 0.05 66 | _C.MODEL.OSFormer.MASK_THR = 0.5 67 | _C.MODEL.OSFormer.MAX_PER_IMG = 100 68 | _C.MODEL.OSFormer.RESIZE_INPUT_FACTOR = 1 69 | # NMS type: matrix OR mask. 70 | _C.MODEL.OSFormer.NMS_TYPE = "matrix" 71 | # Matrix NMS kernel type: gaussian OR linear. 72 | _C.MODEL.OSFormer.NMS_KERNEL = "gaussian" 73 | _C.MODEL.OSFormer.NMS_SIGMA = 2 74 | 75 | # Loss cfg. 
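# FOCAL_* parameters feed the sigmoid focal loss and DICE_WEIGHT the dice loss defined in
# adet/modeling/osformer/loss.py; SEM_WEIGHT and INS_EDGE_WEIGHT scale the auxiliary semantic
# and instance-edge terms (see the SEM_LOSS / INS_EDGE switches in the structure options below).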
76 | _C.MODEL.OSFormer.LOSS = CN() 77 | _C.MODEL.OSFormer.LOSS.FOCAL_USE_SIGMOID = True 78 | _C.MODEL.OSFormer.LOSS.FOCAL_ALPHA = 0.25 79 | _C.MODEL.OSFormer.LOSS.FOCAL_GAMMA = 2.0 80 | _C.MODEL.OSFormer.LOSS.FOCAL_WEIGHT = 1.0 81 | _C.MODEL.OSFormer.LOSS.DICE_WEIGHT = 3.0 82 | _C.MODEL.OSFormer.LOSS.SEM_WEIGHT = 1.0 83 | _C.MODEL.OSFormer.LOSS.INS_EDGE_WEIGHT = 1.0 84 | _C.MODEL.OSFormer.LOSS.SEM_TYPE = 'dice' 85 | 86 | # Transformer cfg 87 | _C.MODEL.OSFormer.HIDDEN_DIM = 256 88 | _C.MODEL.OSFormer.NUMBER_FEATURE_LEVELS = 5 # P2 P3 P4 P5 P6 89 | _C.MODEL.OSFormer.NHEAD = 8 90 | _C.MODEL.OSFormer.ENC_LAYERS = 6 91 | _C.MODEL.OSFormer.DEC_LAYERS = 6 92 | _C.MODEL.OSFormer.DIM_FEEDFORWARD = 1024 93 | _C.MODEL.OSFormer.ENC_POINTS = 4 94 | 95 | # Structure cfg 96 | _C.MODEL.OSFormer.C2F_MASK = False 97 | _C.MODEL.OSFormer.NOFPN = False 98 | _C.MODEL.OSFormer.SEM_LOSS = False 99 | _C.MODEL.OSFormer.SINGLE_SEM = False 100 | _C.MODEL.OSFormer.INS_EDGE = False 101 | _C.MODEL.OSFormer.FFN = 'default' 102 | _C.MODEL.OSFormer.INS_FUSION = 'default' 103 | _C.MODEL.OSFormer.DCIN_NORM = True 104 | 105 | # Query selection cfg 106 | _C.MODEL.OSFormer.QS = CN() 107 | _C.MODEL.OSFormer.QS.ENABLE = True 108 | _C.MODEL.OSFormer.QS.INPUT = "ENC" # ENC/GRID 109 | _C.MODEL.OSFormer.QS.SHARE_HEAD = False 110 | _C.MODEL.OSFormer.QS.NUM_QUERIES = 300 111 | 112 | # ---------------------------------------------------------------------------- # 113 | # PVT Options 114 | # ---------------------------------------------------------------------------- # 115 | 116 | _C.MODEL.PVTV2 = CN() 117 | _C.MODEL.PVTV2.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 118 | 119 | # ---------------------------------------------------------------------------- # 120 | # SWIN Options 121 | # ---------------------------------------------------------------------------- # 122 | 123 | _C.MODEL.SWIN = CN() 124 | _C.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 125 | _C.MODEL.SWIN.PATCH_SIZE = 4 126 | _C.MODEL.SWIN.EMBED_DIM = 96 127 | _C.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 128 | _C.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 129 | _C.MODEL.SWIN.WINDOW_SIZE = 7 130 | _C.MODEL.SWIN.MLP_RATIO = 4.0 131 | _C.MODEL.SWIN.QKV_BIAS = True 132 | _C.MODEL.SWIN.QK_SCALE = None 133 | _C.MODEL.SWIN.DROP_RATE = 0.0 134 | _C.MODEL.SWIN.ATTN_DROP_RATE = 0.0 135 | _C.MODEL.SWIN.DROP_PATH_RATE = 0.3 136 | _C.MODEL.SWIN.APE = False 137 | _C.MODEL.SWIN.PATCH_NORM = True 138 | _C.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 139 | _C.MODEL.SWIN.USE_CHECKPOINT = False 140 | -------------------------------------------------------------------------------- /adet/modeling/backbone/mobilenet.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from torch import nn 4 | from torch.nn import BatchNorm2d 5 | #from detectron2.layers.batch_norm import NaiveSyncBatchNorm as BatchNorm2d 6 | from detectron2.layers import Conv2d 7 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 8 | from detectron2.modeling.backbone import Backbone 9 | 10 | 11 | def conv_bn(inp, oup, stride): 12 | return nn.Sequential( 13 | Conv2d(inp, oup, 3, stride, 1, bias=False), 14 | BatchNorm2d(oup), 15 | nn.ReLU6(inplace=True) 16 | ) 17 | 18 | 19 | def conv_1x1_bn(inp, oup): 20 | return nn.Sequential( 21 | Conv2d(inp, oup, 1, 1, 0, bias=False), 22 | BatchNorm2d(oup), 23 | nn.ReLU6(inplace=True) 24 | ) 25 | 26 | 27 | class InvertedResidual(nn.Module): 28 | def __init__(self, inp, oup, stride, expand_ratio): 29 | super(InvertedResidual, self).__init__() 
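        # MobileNetV2 inverted residual: optional 1x1 expansion -> 3x3 depthwise conv ->
        # 1x1 linear projection; the residual shortcut is used only when stride == 1
        # and the input/output channel counts match.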
30 | self.stride = stride 31 | assert stride in [1, 2] 32 | 33 | hidden_dim = int(round(inp * expand_ratio)) 34 | self.use_res_connect = self.stride == 1 and inp == oup 35 | 36 | if expand_ratio == 1: 37 | self.conv = nn.Sequential( 38 | # dw 39 | Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), 40 | BatchNorm2d(hidden_dim), 41 | nn.ReLU6(inplace=True), 42 | # pw-linear 43 | Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 44 | BatchNorm2d(oup), 45 | ) 46 | else: 47 | self.conv = nn.Sequential( 48 | # pw 49 | Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), 50 | BatchNorm2d(hidden_dim), 51 | nn.ReLU6(inplace=True), 52 | # dw 53 | Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), 54 | BatchNorm2d(hidden_dim), 55 | nn.ReLU6(inplace=True), 56 | # pw-linear 57 | Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 58 | BatchNorm2d(oup), 59 | ) 60 | 61 | def forward(self, x): 62 | if self.use_res_connect: 63 | return x + self.conv(x) 64 | else: 65 | return self.conv(x) 66 | 67 | 68 | class MobileNetV2(Backbone): 69 | """ 70 | Should freeze bn 71 | """ 72 | def __init__(self, cfg, n_class=1000, input_size=224, width_mult=1.): 73 | super(MobileNetV2, self).__init__() 74 | block = InvertedResidual 75 | input_channel = 32 76 | interverted_residual_setting = [ 77 | # t, c, n, s 78 | [1, 16, 1, 1], 79 | [6, 24, 2, 2], 80 | [6, 32, 3, 2], 81 | [6, 64, 4, 2], 82 | [6, 96, 3, 1], 83 | [6, 160, 3, 2], 84 | [6, 320, 1, 1], 85 | ] 86 | 87 | # building first layer 88 | assert input_size % 32 == 0 89 | input_channel = int(input_channel * width_mult) 90 | self.return_features_indices = [3, 6, 13, 17] 91 | self.return_features_num_channels = [] 92 | self.features = nn.ModuleList([conv_bn(3, input_channel, 2)]) 93 | # building inverted residual blocks 94 | for t, c, n, s in interverted_residual_setting: 95 | output_channel = int(c * width_mult) 96 | for i in range(n): 97 | if i == 0: 98 | self.features.append(block(input_channel, output_channel, s, expand_ratio=t)) 99 | else: 100 | self.features.append(block(input_channel, output_channel, 1, expand_ratio=t)) 101 | input_channel = output_channel 102 | if len(self.features) - 1 in self.return_features_indices: 103 | self.return_features_num_channels.append(output_channel) 104 | 105 | self._initialize_weights() 106 | self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_AT) 107 | 108 | def _freeze_backbone(self, freeze_at): 109 | for layer_index in range(freeze_at): 110 | for p in self.features[layer_index].parameters(): 111 | p.requires_grad = False 112 | 113 | def forward(self, x): 114 | res = [] 115 | for i, m in enumerate(self.features): 116 | x = m(x) 117 | if i in self.return_features_indices: 118 | res.append(x) 119 | return {'res{}'.format(i + 2): r for i, r in enumerate(res)} 120 | 121 | def _initialize_weights(self): 122 | for m in self.modules(): 123 | if isinstance(m, Conv2d): 124 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 125 | m.weight.data.normal_(0, (2. / n) ** 0.5) 126 | if m.bias is not None: 127 | m.bias.data.zero_() 128 | elif isinstance(m, BatchNorm2d): 129 | m.weight.data.fill_(1) 130 | m.bias.data.zero_() 131 | elif isinstance(m, nn.Linear): 132 | n = m.weight.size(1) 133 | m.weight.data.normal_(0, 0.01) 134 | m.bias.data.zero_() 135 | 136 | @BACKBONE_REGISTRY.register() 137 | def build_mnv2_backbone(cfg, input_shape): 138 | """ 139 | Create a ResNet instance from config. 140 | 141 | Returns: 142 | ResNet: a :class:`ResNet` instance. 
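    In practice the returned backbone is a MobileNetV2 whose feature maps are
    exposed under the ResNet-style names res2-res5 below.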
143 | """ 144 | out_features = cfg.MODEL.RESNETS.OUT_FEATURES 145 | 146 | out_feature_channels = {"res2": 24, "res3": 32, 147 | "res4": 96, "res5": 320} 148 | out_feature_strides = {"res2": 4, "res3": 8, "res4": 16, "res5": 32} 149 | model = MobileNetV2(cfg) 150 | model._out_features = out_features 151 | model._out_feature_channels = out_feature_channels 152 | model._out_feature_strides = out_feature_strides 153 | return model 154 | -------------------------------------------------------------------------------- /adet/modeling/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/fundamentalvision/Deformable-DETR 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | 8 | from __future__ import absolute_import 9 | from __future__ import print_function 10 | from __future__ import division 11 | 12 | import warnings 13 | import math 14 | 15 | import torch 16 | from torch import nn 17 | import torch.nn.functional as F 18 | from torch.nn.init import xavier_uniform_, constant_ 19 | 20 | from ..functions import MSDeformAttnFunction 21 | 22 | 23 | def _is_power_of_2(n): 24 | if (not isinstance(n, int)) or (n < 0): 25 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 26 | return (n & (n-1) == 0) and n != 0 27 | 28 | 29 | class MSDeformAttn(nn.Module): 30 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 31 | """ 32 | Multi-Scale Deformable Attention Module 33 | :param d_model hidden dimension 34 | :param n_levels number of feature levels 35 | :param n_heads number of attention heads 36 | :param n_points number of sampling points per attention head per feature level 37 | """ 38 | super().__init__() 39 | if d_model % n_heads != 0: 40 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 41 | _d_per_head = d_model // n_heads 42 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 43 | if not _is_power_of_2(_d_per_head): 44 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 45 | "which is more efficient in our CUDA implementation.") 46 | 47 | self.im2col_step = 64 48 | 49 | self.d_model = d_model 50 | self.n_levels = n_levels 51 | self.n_heads = n_heads 52 | self.n_points = n_points 53 | 54 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 55 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 56 | self.value_proj = nn.Linear(d_model, d_model) 57 | self.output_proj = nn.Linear(d_model, d_model) 58 | 59 | self._reset_parameters() 60 | 61 | def _reset_parameters(self): 62 | constant_(self.sampling_offsets.weight.data, 0.) 
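        # The bias initialization below places each head's initial sampling points on a
        # distinct direction around the reference point, with the k-th point pushed
        # outward by a factor of (k + 1).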
63 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 64 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 65 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 66 | for i in range(self.n_points): 67 | grid_init[:, :, i, :] *= i + 1 68 | with torch.no_grad(): 69 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 70 | constant_(self.attention_weights.weight.data, 0.) 71 | constant_(self.attention_weights.bias.data, 0.) 72 | xavier_uniform_(self.value_proj.weight.data) 73 | constant_(self.value_proj.bias.data, 0.) 74 | xavier_uniform_(self.output_proj.weight.data) 75 | constant_(self.output_proj.bias.data, 0.) 76 | 77 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 78 | """ 79 | :param query (N, Length_{query}, C) 80 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 81 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 82 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 83 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 84 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 85 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 86 | 87 | :return output (N, Length_{query}, C) 88 | """ 89 | N, Len_q, _ = query.shape 90 | N, Len_in, _ = input_flatten.shape 91 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 92 | 93 | value = self.value_proj(input_flatten) 94 | if input_padding_mask is not None: 95 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 96 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 97 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 98 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 99 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 100 | # N, Len_q, n_heads, n_levels, n_points, 2 101 | if reference_points.shape[-1] == 2: 102 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 103 | sampling_locations = reference_points[:, :, None, :, None, :] \ 104 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 105 | elif reference_points.shape[-1] == 4: 106 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 107 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 108 | else: 109 | raise ValueError( 110 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 111 | output = MSDeformAttnFunction.apply( 112 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 113 | output = self.output_proj(output) 114 | return output 115 | -------------------------------------------------------------------------------- /adet/modeling/osformer/trans_decoder.py: -------------------------------------------------------------------------------- 1 | 
import torch 2 | from torch import nn 3 | from torch.nn.init import xavier_uniform_, constant_, normal_ 4 | 5 | from adet.modeling.ops.modules.ms_deform_attn import MSDeformAttn 6 | from .trans_utils import _get_clones, get_reference_points, with_pos_embed 7 | from .feed_forward import get_ffn 8 | 9 | 10 | class CISTransformerDecoder(nn.Module): 11 | def __init__(self, d_model=256, nhead=8, 12 | num_encoder_layers=6, dim_feedforward=1024, dropout=0.1, 13 | ffn_type="default", num_feature_levels=4, enc_n_points=4): 14 | super().__init__() 15 | 16 | self.d_model = d_model 17 | self.nhead = nhead 18 | 19 | decoder_layer = TransformerDecoderLayer(d_model, dim_feedforward, 20 | dropout, ffn_type, 21 | num_feature_levels, nhead, enc_n_points) 22 | self.decoder = TransformerDecoder(decoder_layer, num_encoder_layers) 23 | self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) 24 | self.reference_points = nn.Linear(d_model, 2) 25 | 26 | self._reset_parameters() 27 | 28 | def _reset_parameters(self): 29 | for p in self.parameters(): 30 | if p.dim() > 1: 31 | nn.init.xavier_uniform_(p) 32 | for m in self.modules(): 33 | if isinstance(m, MSDeformAttn): 34 | m._reset_parameters() 35 | xavier_uniform_(self.reference_points.weight.data, gain=1.0) 36 | constant_(self.reference_points.bias.data, 0.) 37 | normal_(self.level_embed) 38 | 39 | def forward(self, srcs, pos_embeds, memorys=None, pos_memorys=None): 40 | 41 | # prepare input for decoder 42 | src_flatten = [] 43 | memory_flatten = [] 44 | lvl_pos_embed_flatten = [] 45 | lvl_pos_memory_flatten = [] 46 | spatial_shapes = [] 47 | spatial_shape_grids = [] 48 | for lvl, (src, pos_embed, memory, pos_memory) in enumerate(zip(srcs, pos_embeds, memorys, pos_memorys)): 49 | bs1, c1, h1, w1 = src.shape 50 | spatial_shape_src = (h1, w1) 51 | spatial_shape_grids.append(spatial_shape_src) 52 | bs, c, h, w = memory.shape 53 | spatial_shape = (h, w) 54 | spatial_shapes.append(spatial_shape) 55 | src = src.flatten(2).transpose(1, 2) 56 | memory = memory.flatten(2).transpose(1, 2) 57 | pos_embed = pos_embed.flatten(2).transpose(1, 2) 58 | pos_memory = pos_memory.flatten(2).transpose(1, 2) 59 | lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) 60 | lvl_pos_embed_flatten.append(lvl_pos_embed) 61 | lvl_pos_memory = pos_memory + self.level_embed[lvl].view(1, 1, -1) 62 | lvl_pos_memory_flatten.append(lvl_pos_memory) 63 | src_flatten.append(src) 64 | memory_flatten.append(memory) 65 | src_flatten = torch.cat(src_flatten, 1) 66 | memory_flatten = torch.cat(memory_flatten, 1) 67 | lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) 68 | lvl_pos_memory_flatten = torch.cat(lvl_pos_memory_flatten, 1) 69 | spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) 70 | level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) 71 | spatial_shape_grids = torch.as_tensor(spatial_shape_grids, dtype=torch.long, device=src_flatten.device) 72 | level_start_index_grid = torch.cat((spatial_shape_grids.new_zeros((1, )), spatial_shape_grids.prod(1).cumsum(0)[:-1])) 73 | 74 | # decoder 75 | memory = self.decoder(src_flatten, memory_flatten, spatial_shapes, spatial_shape_grids, level_start_index_grid, 76 | level_start_index, lvl_pos_embed_flatten, lvl_pos_memory_flatten) 77 | 78 | return memory, level_start_index 79 | 80 | 81 | class TransformerDecoderLayer(nn.Module): 82 | def __init__(self, 83 | d_model=256, d_ffn=1024, 84 | dropout=0.1, ffn_type="default", 85 | 
n_levels=4, n_heads=8, n_points=4): 86 | super().__init__() 87 | 88 | # self attention 89 | self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) 90 | self.dropout1 = nn.Dropout(dropout) 91 | self.norm1 = nn.LayerNorm(d_model) 92 | 93 | # ffn 94 | self.ffn = get_ffn(d_model, ffn_type) 95 | 96 | def forward(self, src, pos_embed, memorys, pos_memory, reference_points, spatial_shapes, 97 | level_start_index, spatial_shape_grids, level_start_index_grid): 98 | # self attention 99 | src2 = self.self_attn(with_pos_embed(src, pos_embed), reference_points, 100 | with_pos_embed(memorys, pos_memory), spatial_shapes, level_start_index) 101 | src = src + self.dropout1(src2) 102 | src = self.norm1(src) 103 | 104 | # ffn 105 | src = self.ffn(src, spatial_shape_grids, level_start_index_grid) 106 | 107 | return src 108 | 109 | 110 | class TransformerDecoder(nn.Module): 111 | def __init__(self, decoder_layer, num_layers): 112 | super().__init__() 113 | self.layers = _get_clones(decoder_layer, num_layers) 114 | self.num_layers = num_layers 115 | 116 | def forward(self, src, memorys, spatial_shapes, spatial_shape_grids, 117 | level_start_index_grid, level_start_index, pos_embed, pos_memory): 118 | output = src 119 | batch_size = src.shape[0] 120 | reference_points = get_reference_points(spatial_shape_grids, batch_size, device=src.device) 121 | for _, layer in enumerate(self.layers): 122 | output = layer(output, pos_embed, memorys, pos_memory, reference_points, spatial_shapes, 123 | level_start_index, spatial_shape_grids, level_start_index_grid) 124 | 125 | return output 126 | 127 | 128 | def build_transformer_decoder(cfg): 129 | return CISTransformerDecoder( 130 | d_model=cfg.MODEL.OSFormer.HIDDEN_DIM, 131 | nhead=cfg.MODEL.OSFormer.NHEAD, 132 | num_encoder_layers=cfg.MODEL.OSFormer.DEC_LAYERS, 133 | dim_feedforward=cfg.MODEL.OSFormer.DIM_FEEDFORWARD, 134 | dropout=0.1, 135 | ffn_type=cfg.MODEL.OSFormer.FFN, 136 | num_feature_levels=len(cfg.MODEL.OSFormer.FEAT_INSTANCE_STRIDES), 137 | enc_n_points=cfg.MODEL.OSFormer.ENC_POINTS) 138 | -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import multiprocessing as mp 4 | import os 5 | import time 6 | import cv2 7 | import tqdm 8 | 9 | from detectron2.data.detection_utils import read_image 10 | from detectron2.utils.logger import setup_logger 11 | from detectron2.utils.analysis import parameter_count 12 | 13 | from predictor import VisualizationDemo 14 | from adet.config import get_cfg 15 | 16 | # constants 17 | WINDOW_NAME = "COCO detections" 18 | 19 | 20 | def setup_cfg(args): 21 | # load config from file and command-line arguments 22 | cfg = get_cfg() 23 | cfg.merge_from_file(args.config_file) 24 | cfg.merge_from_list(args.opts) 25 | # Set score_threshold for builtin models 26 | cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold 27 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold 28 | cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold 29 | cfg.freeze() 30 | return cfg 31 | 32 | 33 | def get_parser(): 34 | parser = argparse.ArgumentParser(description="Detectron2 Demo") 35 | parser.add_argument( 36 | "--config-file", 37 | default="configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_inference_acc_test.yaml", 38 | metavar="FILE", 39 | help="path to config file", 40 | ) 41 | parser.add_argument("--webcam", action="store_true", 
help="Take inputs from webcam.") 42 | parser.add_argument("--video-input", help="Path to video file.") 43 | parser.add_argument("--input", nargs="+", help="A list of space separated input images") 44 | parser.add_argument( 45 | "--output", 46 | help="A file or directory to save output visualizations. " 47 | "If not given, will show output in an OpenCV window.", 48 | ) 49 | parser.add_argument( 50 | "--sod", 51 | action='store_true' 52 | ) 53 | 54 | parser.add_argument( 55 | "--confidence-threshold", 56 | type=float, 57 | default=0.3, 58 | help="Minimum score for instance predictions to be shown", 59 | ) 60 | parser.add_argument( 61 | "--opts", 62 | help="Modify config options using the command-line 'KEY VALUE' pairs", 63 | default=[], 64 | nargs=argparse.REMAINDER, 65 | ) 66 | return parser 67 | 68 | 69 | if __name__ == "__main__": 70 | mp.set_start_method("spawn", force=True) 71 | args = get_parser().parse_args() 72 | logger = setup_logger() 73 | logger.info("Arguments: " + str(args)) 74 | 75 | cfg = setup_cfg(args) 76 | 77 | demo = VisualizationDemo(cfg) 78 | print('total parameter:', parameter_count(demo.predictor.model)['']) 79 | 80 | if args.input: 81 | if os.path.isdir(args.input[0]): 82 | args.input = [os.path.join(args.input[0], fname) for fname in os.listdir(args.input[0])] 83 | elif len(args.input) == 1: 84 | args.input = glob.glob(os.path.expanduser(args.input[0])) 85 | assert args.input, "The input path(s) was not found" 86 | for path in tqdm.tqdm(args.input, disable=not args.output): 87 | # use PIL, to be consistent with evaluation 88 | img = read_image(path, format="BGR") 89 | start_time = time.time() 90 | if args.sod: 91 | sod = demo.run_on_image_sod(img) 92 | out_filename = os.path.join(args.output, os.path.basename(path)) 93 | cv2.imwrite(out_filename, sod) 94 | continue 95 | predictions, visualized_output = demo.run_on_image(img) 96 | logger.info( 97 | "{}: detected {} instances in {:.2f}s".format( 98 | path, len(predictions["instances"]), time.time() - start_time 99 | ) 100 | ) 101 | 102 | if args.output: 103 | if os.path.isdir(args.output): 104 | assert os.path.isdir(args.output), args.output 105 | out_filename = os.path.join(args.output, os.path.basename(path)) 106 | else: 107 | assert len(args.input) == 1, "Please specify a directory with args.output" 108 | out_filename = args.output 109 | visualized_output.save(out_filename) 110 | if cfg.MODEL.OSFormer.SEM_LOSS: 111 | cv2.imwrite( 112 | os.path.join(args.output, 'edge_' + os.path.basename(path)), 113 | predictions["pred_sems"].squeeze(0).cpu().numpy() * 255) 114 | else: 115 | cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) 116 | if cv2.waitKey(0) == 27: 117 | break # esc to quit 118 | elif args.webcam: 119 | assert args.input is None, "Cannot have both --input and --webcam!" 
120 | cam = cv2.VideoCapture(0) 121 | for vis in tqdm.tqdm(demo.run_on_video(cam)): 122 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 123 | cv2.imshow(WINDOW_NAME, vis) 124 | if cv2.waitKey(1) == 27: 125 | break # esc to quit 126 | cv2.destroyAllWindows() 127 | elif args.video_input: 128 | video = cv2.VideoCapture(args.video_input) 129 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) 130 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 131 | frames_per_second = video.get(cv2.CAP_PROP_FPS) 132 | num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) 133 | basename = os.path.basename(args.video_input) 134 | 135 | if args.output: 136 | if os.path.isdir(args.output): 137 | output_fname = os.path.join(args.output, basename) 138 | output_fname = os.path.splitext(output_fname)[0] + ".mkv" 139 | else: 140 | output_fname = args.output 141 | assert not os.path.isfile(output_fname), output_fname 142 | output_file = cv2.VideoWriter( 143 | filename=output_fname, 144 | # some installation of opencv may not support x264 (due to its license), 145 | # you can try other format (e.g. MPEG) 146 | fourcc=cv2.VideoWriter_fourcc(*"x264"), 147 | fps=float(frames_per_second), 148 | frameSize=(width, height), 149 | isColor=True, 150 | ) 151 | assert os.path.isfile(args.video_input) 152 | for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames): 153 | if args.output: 154 | output_file.write(vis_frame) 155 | else: 156 | cv2.namedWindow(basename, cv2.WINDOW_NORMAL) 157 | cv2.imshow(basename, vis_frame) 158 | if cv2.waitKey(1) == 27: 159 | break # esc to quit 160 | video.release() 161 | if args.output: 162 | output_file.release() 163 | else: 164 | cv2.destroyAllWindows() 165 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OSFormer: One-Stage Camouflaged Instance Segmentation with Transformers (ECCV 2022) 2 | 3 | ![OSFormer](docs/OSFormer.png) 4 | 5 | Official Implementation of "[OSFormer: One-Stage Camouflaged Instance Segmentation with Transformers](https://arxiv.org/abs/2207.02255)" 6 | 7 | [Jialun Pei*](https://scholar.google.com/citations?user=1lPivLsAAAAJ&hl=en), [Tianyang Cheng*](https://github.com/Patrickctyyx), [Deng-Ping Fan](https://dengpingfan.github.io/), [He Tang](https://scholar.google.com/citations?hl=en&user=70XLFUsAAAAJ), Chuanbo Chen, and [Luc Van Gool](https://ee.ethz.ch/the-department/faculty/professors/person-detail.OTAyMzM=.TGlzdC80MTEsMTA1ODA0MjU5.html) 8 | 9 | [[Paper]](https://arxiv.org/abs/2207.02255); [[Chinese Version]](https://dengpingfan.github.io/papers/[2022][ECCV]OSFormer_Chinese.pdf); [[Official Version]](https://link.springer.com/content/pdf/10.1007/978-3-031-19797-0_2.pdf); [[Project Page]](https://blog.patrickcty.cc/OSFormer-Homepage/) 10 | 11 | **Contact:** dengpfan@gmail.com, peijl@hust.edu.cn 12 | 13 | | *Sample 1* | *Sample 2* | *Sample 3* | *Sample 4* | 14 | | :------------------------------: | :-------------------------------: | :-------------------------------: | :-------------------------------: | 15 | | | | | | 16 | 17 | ## Environment preparation 18 | 19 | The code is tested on CUDA 11.1 and pytorch 1.9.0, change the versions below to your desired ones. 
20 | 21 | ```shell 22 | git clone https://github.com/PJLallen/OSFormer.git 23 | cd OSFormer 24 | conda create -n osformer python=3.8 -y 25 | conda activate osformer 26 | conda install pytorch==1.9.0 torchvision cudatoolkit=11.1 -c pytorch -c nvidia -y 27 | python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu111/torch1.9/index.html 28 | python setup.py build develop 29 | ``` 30 | 31 | ## Dataset preparation 32 | 33 | ### Download the datasets 34 | 35 | - **COD10K**: [Baidu](https://pan.baidu.com/s/1IPcPjdg1EJ-h9HPoU42nHA) (password:hust) / [Google](https://drive.google.com/file/d/1YGa3v-MiXy-3MMJDkidLXPt0KQwygt-Z/view?usp=sharing) / [Quark](https://pan.quark.cn/s/07ba3258b777); **JSON files:** [Baidu](https://pan.baidu.com/s/1kRawj-hzBDycCkZZfQjFhg) (password:hust) / [Google](https://drive.google.com/drive/folders/1Yvz63C8c7LOHFRgm06viUM9XupARRPif?usp=sharing) 36 | - **NC4K**: [Baidu](https://pan.baidu.com/s/1li4INx4klQ_j8ftODyw2Zg) (password:hust) / [Google](https://drive.google.com/file/d/1eK_oi-N4Rmo6IIxUNbYHBiNWuDDLGr_k/view?usp=sharing); **JSON files:** [Baidu](https://pan.baidu.com/s/1DBPFtAL2iEjefwiqXE_GWA) (password:hust) / [Google](https://drive.google.com/drive/folders/1LyK7tl2QVZBFiNaWI_n0ZVa0QiwF2B8e?usp=sharing) 37 | 38 | ### Register datasets 39 | 40 | 1. Generate COCO-format annotation files; you may refer to [the tutorial of mmdetection](https://github.com/open-mmlab/mmdetection/blob/master/docs/en/2_new_data_model.md) for some help 41 | 2. Change the dataset and annotation paths in `adet/data/datasets/cis.py`; please refer to [the docs of detectron2](https://detectron2.readthedocs.io/en/latest/) for more help 42 | 43 | ```python 44 | # adet/data/datasets/cis.py 45 | # change the paths 46 | DATASET_ROOT = 'COD10K-v3' 47 | ANN_ROOT = os.path.join(DATASET_ROOT, 'annotations') 48 | TRAIN_PATH = os.path.join(DATASET_ROOT, 'Train/Image') 49 | TEST_PATH = os.path.join(DATASET_ROOT, 'Test/Image') 50 | TRAIN_JSON = os.path.join(ANN_ROOT, 'train_instance.json') 51 | TEST_JSON = os.path.join(ANN_ROOT, 'test2026.json') 52 | 53 | NC4K_ROOT = 'NC4K' 54 | NC4K_PATH = os.path.join(NC4K_ROOT, 'Imgs') 55 | NC4K_JSON = os.path.join(NC4K_ROOT, 'nc4k_test.json') 56 | ``` 57 | 58 | ## Pre-trained models 59 | 60 | Model weights: [Baidu](https://pan.baidu.com/s/1Ao3Myqa6xiA9ymAkZgZOeQ) (password:l6vn) / [Google](https://drive.google.com/drive/folders/1pl9iM1NAfN5N6Voc03oPmlbKJ-YNldMF?usp=sharing) / [Quark](https://pan.quark.cn/s/6676592ff08b) 61 | 62 | | Model | Config | COD10K-test AP | NC4K-test AP | 63 | |:--------------|:------------------------------------------------ |:---------------|:-------------| 64 | | R50-550 | [configs/CIS_RT.yaml](configs/CIS_RT.yaml) | 36.0 | 41.4 | 65 | | R50 | [configs/CIS_R50.yaml](configs/CIS_R50.yaml) | 41.0 | 42.5 | 66 | | R101 | [configs/CIS_R101.yaml](configs/CIS_R101.yaml) | 42.0 | 44.4 | 67 | | PVTv2-B2-Li | [configs/CIS_PVTv2B2Li.yaml](configs/CIS_PVTv2B2Li.yaml) | 47.2 | 50.5 | 68 | | SWIN-T | [configs/CIS_SWINT.yaml](configs/CIS_SWINT.yaml) | 47.7 | 50.2 | 69 | 70 | ## Visualization results 71 | 72 | The visualization results are produced by our OSFormer with ResNet-50 trained on the COD10K training set.
73 | 74 | - Results on the COD10K test set: [Baidu](https://pan.baidu.com/s/16xH7coaGoOGiB5x1AXgy5w) (password:hust) / 75 | [Google](https://drive.google.com/open?id=16XMw6NaiCQdHG1By-1a7s8SmnyEqlmYD) 76 | - Results on the NC4K test set: [Baidu](https://pan.baidu.com/s/15Y-7fNcHRhu38Vjybx1HMg) (password:hust) / 77 | [Google](https://drive.google.com/file/d/1cRcwbD3Y3fMO3n7eTtA6VGZWKCWwJSU0/view?usp=sharing) 78 | 79 | ## Frequently asked questions 80 | 81 | [FAQ](https://github.com/PJLallen/OSFormer/blob/main/docs/faq.md) 82 | 83 | ## Usage 84 | 85 | ### Train 86 | 87 | ```shell 88 | python tools/train_net.py --config-file configs/CIS_R50.yaml --num-gpus 1 \ 89 | OUTPUT_DIR {PATH_TO_OUTPUT_DIR} 90 | ``` 91 | 92 | Please replace `{PATH_TO_OUTPUT_DIR}` with your own output directory 93 | 94 | ### Inference 95 | 96 | ```shell 97 | python tools/train_net.py --config-file configs/CIS_R50.yaml --eval-only \ 98 | MODEL.WEIGHTS {PATH_TO_PRE_TRAINED_WEIGHTS} 99 | ``` 100 | 101 | Please replace `{PATH_TO_PRE_TRAINED_WEIGHTS}` with the path to the pre-trained weights 102 | 103 | ### Eval 104 | 105 | ```shell 106 | python demo/demo.py --config-file configs/CIS_R50.yaml \ 107 | --input {PATH_TO_THE_IMG_DIR_OR_FILE} \ 108 | --output {PATH_TO_SAVE_DIR_OR_IMAGE_FILE} \ 109 | --opts MODEL.WEIGHTS {PATH_TO_PRE_TRAINED_WEIGHTS} 110 | ``` 111 | 112 | - `{PATH_TO_THE_IMG_DIR_OR_FILE}`: an image directory or one or more image paths 113 | - `{PATH_TO_SAVE_DIR_OR_IMAGE_FILE}`: the directory or file where the visualizations will be saved 114 | - `{PATH_TO_PRE_TRAINED_WEIGHTS}`: the path to the pre-trained weights (a concrete example invocation is sketched below, after the Acknowledgement section) 115 | 116 | 117 | ## Acknowledgement 118 | 119 | This work is based on: 120 | - [detectron2](https://github.com/facebookresearch/detectron2) 121 | - [AdelaiDet](https://github.com/aim-uofa/AdelaiDet) 122 | - [DETR](https://github.com/facebookresearch/detr) 123 | - [Deformable DETR](https://github.com/fundamentalvision/Deformable-DETR) 124 | 125 | We also got help from [mmdetection](https://github.com/open-mmlab/mmdetection). We thank them all for their great work!
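For concreteness, an end-to-end visualization run with the R50 model might look like the sketch below. The weight file `osformer_r50.pth` and the output directory `vis_out` are illustrative placeholders only (they are not shipped with this repo); the input directory follows the `COD10K-v3/Test/Image` layout assumed in `adet/data/datasets/cis.py`.

```shell
# Hypothetical paths for illustration only; adjust them to your own data and weights.
mkdir -p vis_out
python demo/demo.py --config-file configs/CIS_R50.yaml \
    --input COD10K-v3/Test/Image \
    --output vis_out \
    --confidence-threshold 0.3 \
    --opts MODEL.WEIGHTS osformer_r50.pth
```

When `--output` points to an existing directory, one visualization per input image is written there; without `--output`, results are shown in an OpenCV window instead.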
126 | 127 | ## Citation 128 | 129 | If this helps you, please cite this work: 130 | 131 | ``` 132 | @inproceedings{pei2022osformer, 133 | title={OSFormer: One-Stage Camouflaged Instance Segmentation with Transformers}, 134 | author={Pei, Jialun and Cheng, Tianyang and Fan, Deng-Ping and Tang, He and Chen, Chuanbo and Van Gool, Luc}, 135 | booktitle={European conference on computer vision}, 136 | year={2022}, 137 | organization={Springer} 138 | } 139 | ``` 140 | -------------------------------------------------------------------------------- /tools/train_net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from collections import OrderedDict 4 | 5 | import detectron2.utils.comm as comm 6 | from detectron2.data import MetadataCatalog, build_detection_train_loader 7 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch 8 | from detectron2.utils.events import EventStorage 9 | from detectron2.evaluation import ( 10 | COCOEvaluator, 11 | COCOPanopticEvaluator, 12 | DatasetEvaluators, 13 | LVISEvaluator, 14 | PascalVOCDetectionEvaluator, 15 | SemSegEvaluator, 16 | verify_results, 17 | ) 18 | from detectron2.modeling import GeneralizedRCNNWithTTA 19 | 20 | from detectron2.utils.logger import setup_logger 21 | from adet.data.dataset_mapper import DatasetMapperWithBasis 22 | from adet.config import get_cfg 23 | from adet.checkpoint import AdetCheckpointer 24 | from adet.evaluation import TextEvaluator 25 | from adet.data.datasets.cis import register_dataset 26 | 27 | 28 | class Trainer(DefaultTrainer): 29 | """ 30 | This is the same Trainer except that we rewrite the 31 | `build_train_loader` method. 32 | """ 33 | 34 | def train_loop(self, start_iter: int, max_iter: int): 35 | """ 36 | Args: 37 | start_iter, max_iter (int): See docs above 38 | """ 39 | logger = logging.getLogger("adet.trainer") 40 | logger.info("Starting training from iteration {}".format(start_iter)) 41 | 42 | self.iter = self.start_iter = start_iter 43 | self.max_iter = max_iter 44 | 45 | with EventStorage(start_iter) as self.storage: 46 | self.before_train() 47 | for self.iter in range(start_iter, max_iter): 48 | self.before_step() 49 | self.run_step() 50 | self.after_step() 51 | self.after_train() 52 | 53 | def train(self): 54 | """ 55 | Run training. 56 | 57 | Returns: 58 | OrderedDict of results, if evaluation is enabled. Otherwise None. 59 | """ 60 | self.train_loop(self.start_iter, self.max_iter) 61 | if hasattr(self, "_last_eval_results") and comm.is_main_process(): 62 | verify_results(self.cfg, self._last_eval_results) 63 | return self._last_eval_results 64 | 65 | @classmethod 66 | def build_train_loader(cls, cfg): 67 | """ 68 | Returns: 69 | iterable 70 | 71 | It calls :func:`detectron2.data.build_detection_train_loader` with a customized 72 | DatasetMapper, which adds categorical labels as a semantic mask. 73 | """ 74 | mapper = DatasetMapperWithBasis(cfg, True) 75 | return build_detection_train_loader(cfg, mapper=mapper) 76 | 77 | @classmethod 78 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 79 | """ 80 | Create evaluator(s) for a given dataset. 81 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 82 | For your own dataset, you can simply create an evaluator manually in your 83 | script and do not have to worry about the hacky if-else logic here. 
84 | """ 85 | if output_folder is None: 86 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 87 | evaluator_list = [] 88 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 89 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 90 | evaluator_list.append( 91 | SemSegEvaluator( 92 | dataset_name, 93 | distributed=True, 94 | num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 95 | ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 96 | output_dir=output_folder, 97 | ) 98 | ) 99 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 100 | evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder)) 101 | if evaluator_type == "coco_panoptic_seg": 102 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 103 | if evaluator_type == "pascal_voc": 104 | return PascalVOCDetectionEvaluator(dataset_name) 105 | if evaluator_type == "lvis": 106 | return LVISEvaluator(dataset_name, cfg, True, output_folder) 107 | if evaluator_type == "text": 108 | return TextEvaluator(dataset_name, cfg, True, output_folder) 109 | if len(evaluator_list) == 0: 110 | raise NotImplementedError( 111 | "no Evaluator for the dataset {} with the type {}".format( 112 | dataset_name, evaluator_type 113 | ) 114 | ) 115 | if len(evaluator_list) == 1: 116 | return evaluator_list[0] 117 | return DatasetEvaluators(evaluator_list) 118 | 119 | @classmethod 120 | def test_with_TTA(cls, cfg, model): 121 | logger = logging.getLogger("adet.trainer") 122 | # In the end of training, run an evaluation with TTA 123 | # Only support some R-CNN models. 124 | logger.info("Running inference with test-time augmentation ...") 125 | model = GeneralizedRCNNWithTTA(cfg, model) 126 | evaluators = [ 127 | cls.build_evaluator( 128 | cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") 129 | ) 130 | for name in cfg.DATASETS.TEST 131 | ] 132 | res = cls.test(cfg, model, evaluators) 133 | res = OrderedDict({k + "_TTA": v for k, v in res.items()}) 134 | return res 135 | 136 | 137 | def setup(args): 138 | """ 139 | Create configs and perform basic setups. 140 | """ 141 | cfg = get_cfg() 142 | cfg.merge_from_file(args.config_file) 143 | cfg.merge_from_list(args.opts) 144 | cfg.freeze() 145 | default_setup(cfg, args) 146 | 147 | rank = comm.get_rank() 148 | setup_logger(cfg.OUTPUT_DIR, distributed_rank=rank, name="adet") 149 | 150 | return cfg 151 | 152 | 153 | def main(args): 154 | 155 | cfg = setup(args) 156 | register_dataset() 157 | 158 | if args.eval_only: 159 | model = Trainer.build_model(cfg) 160 | AdetCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 161 | cfg.MODEL.WEIGHTS, resume=args.resume 162 | ) 163 | res = Trainer.test(cfg, model) # d2 defaults.py 164 | if comm.is_main_process(): 165 | verify_results(cfg, res) 166 | if cfg.TEST.AUG.ENABLED: 167 | res.update(Trainer.test_with_TTA(cfg, model)) 168 | return res 169 | 170 | """ 171 | If you'd like to do anything fancier than the standard training logic, 172 | consider writing your own training loop or subclassing the trainer. 
173 | """ 174 | trainer = Trainer(cfg) 175 | trainer.resume_or_load(resume=args.resume) 176 | if cfg.TEST.AUG.ENABLED: 177 | trainer.register_hooks( 178 | [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))] 179 | ) 180 | return trainer.train() 181 | 182 | 183 | if __name__ == "__main__": 184 | args = default_argument_parser().parse_args() 185 | print("Command Line Args:", args) 186 | launch( 187 | main, 188 | args.num_gpus, 189 | num_machines=args.num_machines, 190 | machine_rank=args.machine_rank, 191 | dist_url=args.dist_url, 192 | args=(args,), 193 | ) 194 | -------------------------------------------------------------------------------- /adet/modeling/ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | Taken from https://github.com/fundamentalvision/Deformable-DETR 4 | ************************************************************************************************** 5 | * Deformable DETR 6 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 7 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | #include "cuda/ms_deform_im2col_cuda.cuh" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | 20 | at::Tensor ms_deform_attn_cuda_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step) 27 | { 28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 33 | 34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 39 | 40 | const int batch = value.size(0); 41 | const int spatial_size = value.size(1); 42 | const int num_heads = value.size(2); 43 | const int channels = value.size(3); 44 | 45 | const int num_levels = spatial_shapes.size(0); 46 | 47 | const int num_query = sampling_loc.size(1); 48 | const int num_point = sampling_loc.size(4); 49 | 50 | const int im2col_step_ = std::min(batch, im2col_step); 51 | 52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 53 | 54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 55 | 56 | const int batch_n = im2col_step_; 57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 58 | auto per_value_size = spatial_size * num_heads * channels; 59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 60 | auto 
per_attn_weight_size = num_query * num_heads * num_levels * num_point; 61 | for (int n = 0; n < batch/im2col_step_; ++n) 62 | { 63 | auto columns = output_n.select(0, n); 64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 66 | value.data() + n * im2col_step_ * per_value_size, 67 | spatial_shapes.data(), 68 | level_start_index.data(), 69 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 70 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 72 | columns.data()); 73 | 74 | })); 75 | } 76 | 77 | output = output.view({batch, num_query, num_heads*channels}); 78 | 79 | return output; 80 | } 81 | 82 | 83 | std::vector ms_deform_attn_cuda_backward( 84 | const at::Tensor &value, 85 | const at::Tensor &spatial_shapes, 86 | const at::Tensor &level_start_index, 87 | const at::Tensor &sampling_loc, 88 | const at::Tensor &attn_weight, 89 | const at::Tensor &grad_output, 90 | const int im2col_step) 91 | { 92 | 93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 99 | 100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 106 | 107 | const int batch = value.size(0); 108 | const int spatial_size = value.size(1); 109 | const int num_heads = value.size(2); 110 | const int channels = value.size(3); 111 | 112 | const int num_levels = spatial_shapes.size(0); 113 | 114 | const int num_query = sampling_loc.size(1); 115 | const int num_point = sampling_loc.size(4); 116 | 117 | const int im2col_step_ = std::min(batch, im2col_step); 118 | 119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 120 | 121 | auto grad_value = at::zeros_like(value); 122 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 123 | auto grad_attn_weight = at::zeros_like(attn_weight); 124 | 125 | const int batch_n = im2col_step_; 126 | auto per_value_size = spatial_size * num_heads * channels; 127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 130 | 131 | for (int n = 0; n < batch/im2col_step_; ++n) 132 | { 133 | auto grad_output_g = grad_output_n.select(0, n); 134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 136 | 
grad_output_g.data(), 137 | value.data() + n * im2col_step_ * per_value_size, 138 | spatial_shapes.data(), 139 | level_start_index.data(), 140 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 141 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 143 | grad_value.data() + n * im2col_step_ * per_value_size, 144 | grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 145 | grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); 146 | 147 | })); 148 | } 149 | 150 | return { 151 | grad_value, grad_sampling_loc, grad_attn_weight 152 | }; 153 | } -------------------------------------------------------------------------------- /adet/utils/measures.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import unicode_literals 5 | from __future__ import print_function 6 | from __future__ import division 7 | 8 | import operator 9 | 10 | from functools import reduce 11 | 12 | 13 | def get_num_gen(gen): 14 | return sum(1 for x in gen) 15 | 16 | 17 | def is_pruned(layer): 18 | try: 19 | layer.mask 20 | return True 21 | except AttributeError: 22 | return False 23 | 24 | 25 | def is_leaf(model): 26 | return get_num_gen(model.children()) == 0 27 | 28 | 29 | def get_layer_info(layer): 30 | layer_str = str(layer) 31 | type_name = layer_str[:layer_str.find('(')].strip() 32 | return type_name 33 | 34 | 35 | def get_layer_param(model): 36 | return sum([reduce(operator.mul, i.size(), 1) for i in model.parameters()]) 37 | 38 | 39 | ### The input batch size should be 1 to call this function 40 | def measure_layer(layer, *args): 41 | global count_ops, count_params 42 | 43 | for x in args: 44 | delta_ops = 0 45 | delta_params = 0 46 | multi_add = 1 47 | type_name = get_layer_info(layer) 48 | 49 | ### ops_conv 50 | if type_name in ['Conv2d']: 51 | out_h = int((x.size()[2] + 2 * layer.padding[0] / layer.dilation[0] - layer.kernel_size[0]) / 52 | layer.stride[0] + 1) 53 | out_w = int((x.size()[3] + 2 * layer.padding[1] / layer.dilation[1] - layer.kernel_size[1]) / 54 | layer.stride[1] + 1) 55 | delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add 56 | delta_params = get_layer_param(layer) 57 | 58 | elif type_name in ['ConvTranspose2d']: 59 | _, _, in_h, in_w = x.size() 60 | out_h = int((in_h-1)*layer.stride[0] - 2 * layer.padding[0] + layer.kernel_size[0] + layer.output_padding[0]) 61 | out_w = int((in_w-1)*layer.stride[1] - 2 * layer.padding[1] + layer.kernel_size[1] + layer.output_padding[1]) 62 | delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * \ 63 | layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add 64 | delta_params = get_layer_param(layer) 65 | 66 | ### ops_learned_conv 67 | elif type_name in ['LearnedGroupConv']: 68 | measure_layer(layer.relu, x) 69 | measure_layer(layer.norm, x) 70 | conv = layer.conv 71 | out_h = int((x.size()[2] + 2 * conv.padding[0] - conv.kernel_size[0]) / 72 | conv.stride[0] + 1) 73 | out_w = int((x.size()[3] + 2 * conv.padding[1] - conv.kernel_size[1]) / 74 | conv.stride[1] + 1) 75 | delta_ops = conv.in_channels * conv.out_channels * conv.kernel_size[0] * conv.kernel_size[1] * out_h * out_w / layer.condense_factor * multi_add 76 | delta_params = get_layer_param(conv) / layer.condense_factor 77 | 78 | ### 
ops_nonlinearity 79 | elif type_name in ['ReLU', 'ReLU6']: 80 | delta_ops = x.numel() 81 | delta_params = get_layer_param(layer) 82 | 83 | ### ops_pooling 84 | elif type_name in ['AvgPool2d', 'MaxPool2d']: 85 | in_w = x.size()[2] 86 | kernel_ops = layer.kernel_size * layer.kernel_size 87 | out_w = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1) 88 | out_h = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1) 89 | delta_ops = x.size()[0] * x.size()[1] * out_w * out_h * kernel_ops 90 | delta_params = get_layer_param(layer) 91 | 92 | elif type_name in ['LastLevelMaxPool']: 93 | pass 94 | 95 | elif type_name in ['AdaptiveAvgPool2d']: 96 | delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3] 97 | delta_params = get_layer_param(layer) 98 | 99 | elif type_name in ['ZeroPad2d', 'RetinaNetPostProcessor']: 100 | pass 101 | #delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3] 102 | #delta_params = get_layer_param(layer) 103 | 104 | ### ops_linear 105 | elif type_name in ['Linear']: 106 | weight_ops = layer.weight.numel() * multi_add 107 | # bias_ops = layer.bias.numel() 108 | # delta_ops = x.size()[0] * (weight_ops + bias_ops) 109 | delta_ops = x.size()[0] * (weight_ops ) 110 | delta_params = get_layer_param(layer) 111 | 112 | ### ops_nothing 113 | elif type_name in ['BatchNorm2d', 'Dropout2d', 'DropChannel', 'Dropout', 'FrozenBatchNorm2d', 'GroupNorm']: 114 | delta_params = get_layer_param(layer) 115 | 116 | elif type_name in ['SumTwo']: 117 | delta_ops = x.numel() 118 | 119 | elif type_name in ['AggregateCell']: 120 | if not layer.pre_transform: 121 | delta_ops = 2 * x.numel() # twice for each input 122 | else: 123 | measure_layer(layer.branch_1, x) 124 | measure_layer(layer.branch_2, x) 125 | delta_params = get_layer_param(layer) 126 | 127 | elif type_name in ['Identity', 'Zero']: 128 | pass 129 | 130 | elif type_name in ['Scale']: 131 | delta_params = get_layer_param(layer) 132 | delta_ops = x.numel() 133 | 134 | elif type_name in ['FCOSPostProcessor', 'RPNPostProcessor', 'KeypointPostProcessor', 135 | 'ROIAlign', 'PostProcessor', 'KeypointRCNNPredictor', 136 | 'NaiveSyncBatchNorm', 'Upsample', 'Sequential']: 137 | pass 138 | 139 | elif type_name in ['DeformConv']: 140 | # don't count bilinear 141 | offset_conv = list(layer.parameters())[0] 142 | delta_ops = reduce(operator.mul, offset_conv.size(), x.size()[2] * x.size()[3]) 143 | out_h = int((x.size()[2] + 2 * layer.padding[0] / layer.dilation[0] 144 | - layer.kernel_size[0]) / layer.stride[0] + 1) 145 | out_w = int((x.size()[3] + 2 * layer.padding[1] / layer.dilation[1] 146 | - layer.kernel_size[1]) / layer.stride[1] + 1) 147 | delta_ops += layer.in_channels * layer.out_channels * layer.kernel_size[0] * layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add 148 | delta_params = get_layer_param(layer) 149 | 150 | ### unknown layer type 151 | else: 152 | raise TypeError('unknown layer type: %s' % type_name) 153 | 154 | count_ops += delta_ops 155 | count_params += delta_params 156 | return 157 | 158 | 159 | def measure_model(model, x): 160 | global count_ops, count_params 161 | count_ops = 0 162 | count_params = 0 163 | 164 | def should_measure(x): 165 | return is_leaf(x) or is_pruned(x) 166 | 167 | def modify_forward(model): 168 | for child in model.children(): 169 | if should_measure(child): 170 | def new_forward(m): 171 | def lambda_forward(*args): 172 | measure_layer(m, *args) 173 | return m.old_forward(*args) 174 | return lambda_forward 175 | child.old_forward = 
child.forward 176 | child.forward = new_forward(child) 177 | else: 178 | modify_forward(child) 179 | 180 | def restore_forward(model): 181 | for child in model.children(): 182 | # leaf node 183 | if is_leaf(child) and hasattr(child, 'old_forward'): 184 | child.forward = child.old_forward 185 | child.old_forward = None 186 | else: 187 | restore_forward(child) 188 | 189 | modify_forward(model) 190 | out = model.forward(x) 191 | restore_forward(model) 192 | 193 | return out, count_ops, count_params 194 | -------------------------------------------------------------------------------- /adet/data/dataset_mapper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import os.path as osp 4 | 5 | import numpy as np 6 | import torch 7 | from fvcore.common.file_io import PathManager 8 | from PIL import Image 9 | from pycocotools import mask as maskUtils 10 | 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.data.dataset_mapper import DatasetMapper 14 | from detectron2.data.detection_utils import SizeMismatchError 15 | from detectron2.structures import BoxMode 16 | 17 | from .augmentation import RandomCropWithInstance 18 | from .detection_utils import (annotations_to_instances, build_augmentation, 19 | transform_instance_annotations) 20 | 21 | """ 22 | This file contains the default mapping that's applied to "dataset dicts". 23 | """ 24 | 25 | __all__ = ["DatasetMapperWithBasis"] 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def segmToRLE(segm, img_size): 31 | h, w = img_size 32 | if type(segm) == list: 33 | # polygon -- a single object might consist of multiple parts 34 | # we merge all parts into one mask rle code 35 | rles = maskUtils.frPyObjects(segm, h, w) 36 | rle = maskUtils.merge(rles) 37 | elif type(segm["counts"]) == list: 38 | # uncompressed RLE 39 | rle = maskUtils.frPyObjects(segm, h, w) 40 | else: 41 | # rle 42 | rle = segm 43 | return rle 44 | 45 | 46 | def segmToMask(segm, img_size): 47 | rle = segmToRLE(segm, img_size) 48 | m = maskUtils.decode(rle) 49 | return m 50 | 51 | 52 | class DatasetMapperWithBasis(DatasetMapper): 53 | """ 54 | This caller enables the default Detectron2 mapper to read an additional basis semantic label 55 | """ 56 | 57 | def __init__(self, cfg, is_train=True): 58 | super().__init__(cfg, is_train) 59 | 60 | # Rebuild augmentations 61 | logger.info( 62 | "Rebuilding the augmentations. The previous augmentations will be overridden." 63 | ) 64 | self.augmentation = build_augmentation(cfg, is_train) 65 | 66 | if cfg.INPUT.CROP.ENABLED and is_train: 67 | self.augmentation.insert( 68 | 0, 69 | RandomCropWithInstance( 70 | cfg.INPUT.CROP.TYPE, 71 | cfg.INPUT.CROP.SIZE, 72 | cfg.INPUT.CROP.CROP_INSTANCE, 73 | ), 74 | ) 75 | logging.getLogger(__name__).info( 76 | "Cropping used in training: " + str(self.augmentation[0]) 77 | ) 78 | 79 | # fmt: off 80 | self.basis_loss_on = cfg.MODEL.BASIS_MODULE.LOSS_ON 81 | self.ann_set = cfg.MODEL.BASIS_MODULE.ANN_SET 82 | # fmt: on 83 | 84 | def __call__(self, dataset_dict): 85 | """ 86 | Args: 87 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
88 | 89 | Returns: 90 | dict: a format that builtin models in detectron2 accept 91 | """ 92 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 93 | # USER: Write your own image loading if it's not from a file 94 | try: 95 | image = utils.read_image( 96 | dataset_dict["file_name"], format=self.image_format 97 | ) 98 | except Exception as e: 99 | print(dataset_dict["file_name"]) 100 | print(e) 101 | raise e 102 | try: 103 | utils.check_image_size(dataset_dict, image) 104 | except SizeMismatchError as e: 105 | expected_wh = (dataset_dict["width"], dataset_dict["height"]) 106 | image_wh = (image.shape[1], image.shape[0]) 107 | if (image_wh[1], image_wh[0]) == expected_wh: 108 | print("transposing image {}".format(dataset_dict["file_name"])) 109 | image = image.transpose(1, 0, 2) 110 | else: 111 | raise e 112 | 113 | # USER: Remove if you don't do semantic/panoptic segmentation. 114 | if "sem_seg_file_name" in dataset_dict: 115 | sem_seg_gt = utils.read_image( 116 | dataset_dict.pop("sem_seg_file_name"), "L" 117 | ).squeeze(2) 118 | else: 119 | sem_seg_gt = None 120 | 121 | boxes = np.asarray( 122 | [ 123 | BoxMode.convert( 124 | instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS 125 | ) 126 | for instance in dataset_dict["annotations"] 127 | ] 128 | ) 129 | aug_input = T.StandardAugInput(image, boxes=boxes, sem_seg=sem_seg_gt) 130 | transforms = aug_input.apply_augmentations(self.augmentation) 131 | image, sem_seg_gt = aug_input.image, aug_input.sem_seg 132 | 133 | image_shape = image.shape[:2] # h, w 134 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 135 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 136 | # Therefore it's important to use torch.Tensor. 137 | dataset_dict["image"] = torch.as_tensor( 138 | np.ascontiguousarray(image.transpose(2, 0, 1)) 139 | ) 140 | if sem_seg_gt is not None: 141 | dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long")) 142 | 143 | # USER: Remove if you don't use pre-computed proposals. 144 | # Most users would not need this feature. 145 | if self.proposal_topk: 146 | utils.transform_proposals( 147 | dataset_dict, 148 | image_shape, 149 | transforms, 150 | proposal_topk=self.proposal_topk, 151 | min_box_size=self.proposal_min_box_size, 152 | ) 153 | 154 | if not self.is_train: 155 | dataset_dict.pop("annotations", None) 156 | dataset_dict.pop("sem_seg_file_name", None) 157 | dataset_dict.pop("pano_seg_file_name", None) 158 | return dataset_dict 159 | 160 | if "annotations" in dataset_dict: 161 | # USER: Modify this if you want to keep them for some reason. 162 | for anno in dataset_dict["annotations"]: 163 | if not self.use_instance_mask: 164 | anno.pop("segmentation", None) 165 | if not self.use_keypoint: 166 | anno.pop("keypoints", None) 167 | 168 | # USER: Implement additional transformations if you have other types of data 169 | annos = [ 170 | transform_instance_annotations( 171 | obj, 172 | transforms, 173 | image_shape, 174 | keypoint_hflip_indices=self.keypoint_hflip_indices, 175 | ) 176 | for obj in dataset_dict.pop("annotations") 177 | if obj.get("iscrowd", 0) == 0 178 | ] 179 | instances = annotations_to_instances( 180 | annos, image_shape, mask_format=self.instance_mask_format 181 | ) 182 | 183 | # After transforms such as cropping are applied, the bounding box may no longer 184 | # tightly bound the object. As an example, imagine a triangle object 185 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). 
The tight 186 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to the intersection of the original bounding box and the cropping box. 187 | if self.recompute_boxes: 188 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 189 | dataset_dict["instances"] = utils.filter_empty_instances(instances) 190 | 191 | if self.basis_loss_on and self.is_train: 192 | # load basis supervisions 193 | if self.ann_set == "coco": 194 | basis_sem_path = ( 195 | dataset_dict["file_name"] 196 | .replace("train2017", "thing_train2017") 197 | .replace("image/train", "thing_train") 198 | ) 199 | else: 200 | basis_sem_path = ( 201 | dataset_dict["file_name"] 202 | .replace("coco", "lvis") 203 | .replace("train2017", "thing_train") 204 | ) 205 | # change extension to npz 206 | basis_sem_path = osp.splitext(basis_sem_path)[0] + ".npz" 207 | basis_sem_gt = np.load(basis_sem_path)["mask"] 208 | basis_sem_gt = transforms.apply_segmentation(basis_sem_gt) 209 | basis_sem_gt = torch.as_tensor(basis_sem_gt.astype("long")) 210 | dataset_dict["basis_sem"] = basis_sem_gt 211 | return dataset_dict 212 | --------------------------------------------------------------------------------