├── .gitignore ├── DCNv4_op ├── DCNv4 │ ├── __init__.py │ ├── functions │ │ ├── __init__.py │ │ ├── dcnv4_func.py │ │ ├── flash_deform_attn_func.py │ │ └── table.py │ └── modules │ │ ├── __init__.py │ │ ├── dcnv4.py │ │ └── flash_deform_attn.py ├── MANIFEST.in ├── __init__.py ├── make.sh ├── scripts │ ├── find_best.py │ ├── search_bwd.sh │ ├── search_dcnv4.py │ ├── search_dcnv4_bwd.py │ ├── search_dcnv4_bwd_engine.py │ ├── search_dcnv4_engine.py │ ├── search_fwd.sh │ ├── test_dcnv4.py │ ├── test_dcnv4_bwd.py │ ├── test_flash_deform_attn.py │ └── test_flash_deform_attn_backward.py ├── setup.py └── src │ ├── cuda │ ├── common.h │ ├── dcnv4_col2im_cuda.cuh │ ├── dcnv4_cuda.cu │ ├── dcnv4_cuda.h │ ├── dcnv4_im2col_cuda.cuh │ ├── flash_deform_attn_cuda.cu │ ├── flash_deform_attn_cuda.h │ ├── flash_deform_col2im_cuda.cuh │ └── flash_deform_im2col_cuda.cuh │ ├── dcnv4.h │ └── vision.cpp ├── LICENSE ├── README.md ├── classification ├── README.md ├── config.py ├── configs │ ├── flash_intern_image_b_1k_224.yaml │ ├── flash_intern_image_l_22kto1k_384.yaml │ ├── flash_intern_image_s_1k_224.yaml │ └── flash_intern_image_t_1k_224.yaml ├── dataset │ ├── __init__.py │ ├── build.py │ ├── cached_image_folder.py │ ├── samplers.py │ └── zipreader.py ├── ddp_hooks.py ├── ema_deepspeed.py ├── eval.sh ├── export.py ├── extract_feature.py ├── logger.py ├── lr_scheduler.py ├── main.py ├── main_accelerate.py ├── main_deepspeed.py ├── meta_data │ ├── 22k_class_to_idx.json │ ├── map22kto1k.txt │ └── meta ├── models │ ├── __init__.py │ ├── build.py │ ├── flash_intern_image.py │ └── intern_image.py ├── ops_dcnv3 │ ├── functions │ │ ├── __init__.py │ │ └── dcnv3_func.py │ ├── make.sh │ ├── modules │ │ ├── __init__.py │ │ └── dcnv3.py │ ├── setup.py │ ├── src │ │ ├── cpu │ │ │ ├── dcnv3_cpu.cpp │ │ │ └── dcnv3_cpu.h │ │ ├── cuda │ │ │ ├── dcnv3_cuda.cu │ │ │ ├── dcnv3_cuda.h │ │ │ └── dcnv3_im2col_cuda.cuh │ │ ├── dcnv3.h │ │ └── vision.cpp │ └── test.py ├── optimizer.py ├── train_in1k.sh ├── train_in1k_deepspeed.sh └── utils.py ├── detection ├── README.md ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── coco_detection.py │ │ │ ├── coco_instance.py │ │ │ └── crowd_human.py │ │ ├── default_runtime.py │ │ ├── models │ │ │ ├── cascade_mask_rcnn_convnext_fpn.py │ │ │ ├── cascade_mask_rcnn_r50_fpn.py │ │ │ ├── cascade_mask_rcnn_r50_fpn_crowdhuman.py │ │ │ ├── cascade_rcnn_r50_fpn.py │ │ │ ├── fast_rcnn_r50_fpn.py │ │ │ ├── faster_rcnn_r50_caffe_c4.py │ │ │ ├── faster_rcnn_r50_caffe_dc5.py │ │ │ ├── faster_rcnn_r50_fpn.py │ │ │ ├── mask_rcnn_convnext_fpn.py │ │ │ ├── mask_rcnn_r50_caffe_c4.py │ │ │ ├── mask_rcnn_r50_fpn.py │ │ │ ├── retinanet_r50_fpn.py │ │ │ ├── rpn_r50_caffe_c4.py │ │ │ ├── rpn_r50_fpn.py │ │ │ └── ssd300.py │ │ └── schedules │ │ │ ├── schedule_1x.py │ │ │ └── schedule_3x.py │ └── coco │ │ ├── README.md │ │ ├── cascade_flash_intern_image_l_fpn_1x_coco.py │ │ ├── cascade_flash_intern_image_l_fpn_3x_coco.py │ │ ├── dino_4scale_flash_internimage_b_1x_coco.py │ │ ├── dino_4scale_flash_internimage_l_1x_coco.py │ │ ├── dino_4scale_flash_internimage_s_1x_coco.py │ │ ├── dino_4scale_flash_internimage_t_1x_coco.py │ │ ├── mask_rcnn_flash_intern_image_b_fpn_1x_coco.py │ │ ├── mask_rcnn_flash_intern_image_b_fpn_3x_coco.py │ │ ├── mask_rcnn_flash_intern_image_s_fpn_1x_coco.py │ │ ├── mask_rcnn_flash_intern_image_s_fpn_3x_coco.py │ │ ├── mask_rcnn_flash_intern_image_t_fpn_1x_coco.py │ │ └── mask_rcnn_flash_intern_image_t_fpn_3x_coco.py ├── dist_test.sh ├── dist_train.sh ├── get_flops.py ├── image_demo.py ├── 
mmcv_custom │ ├── __init__.py │ └── custom_layer_decay_optimizer_constructor.py ├── mmdet_custom │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ └── crowd_human.py │ └── models │ │ ├── __init__.py │ │ ├── backbones │ │ ├── __init__.py │ │ └── flash_intern_image.py │ │ ├── dense_heads │ │ ├── __init__.py │ │ ├── bbox_head.py │ │ ├── deformable_detr_head.py │ │ ├── detr_head.py │ │ ├── dino_head.py │ │ ├── mask_rcnn.py │ │ ├── msda.py │ │ └── two_stage.py │ │ ├── detectors │ │ ├── __init__.py │ │ └── dino.py │ │ ├── necks │ │ └── fpn.py │ │ └── utils │ │ ├── __init__.py │ │ ├── convModule_norm.py │ │ ├── query_denoising.py │ │ └── transformer.py ├── ops_dcnv3 │ ├── functions │ │ ├── __init__.py │ │ └── dcnv3_func.py │ ├── make.sh │ ├── modules │ │ ├── __init__.py │ │ └── dcnv3.py │ ├── setup.py │ ├── src │ │ ├── cpu │ │ │ ├── dcnv3_cpu.cpp │ │ │ └── dcnv3_cpu.h │ │ ├── cuda │ │ │ ├── dcnv3_cuda.cu │ │ │ ├── dcnv3_cuda.h │ │ │ └── dcnv3_im2col_cuda.cuh │ │ ├── dcnv3.h │ │ └── vision.cpp │ └── test.py ├── slurm_test.sh ├── slurm_train.sh ├── test.py ├── tools │ ├── create_crowd_anno.py │ └── evaluate │ │ └── __init__.py └── train.py └── segmentation ├── README.md ├── configs ├── _base_ │ ├── datasets │ │ ├── ade20k.py │ │ ├── ade20k_640x640.py │ │ ├── chase_db1.py │ │ ├── cityscapes.py │ │ ├── cityscapes_1024x1024.py │ │ ├── cityscapes_extra.py │ │ ├── coco-stuff10k.py │ │ ├── coco-stuff164k.py │ │ ├── drive.py │ │ ├── hrf.py │ │ ├── loveda.py │ │ ├── mapillary.py │ │ ├── mapillary_1024x1024.py │ │ ├── nyu_depth_v2.py │ │ ├── pascal_context.py │ │ ├── pascal_context_59.py │ │ ├── pascal_voc12.py │ │ ├── pascal_voc12_aug.py │ │ ├── potsdam.py │ │ └── stare.py │ ├── default_runtime.py │ ├── models │ │ ├── mask2former_beit.py │ │ ├── segformer_mit-b0.py │ │ ├── upernet_convnext.py │ │ ├── upernet_r50.py │ │ └── upernet_swin.py │ └── schedules │ │ ├── schedule_160k.py │ │ ├── schedule_20k.py │ │ ├── schedule_320k.py │ │ ├── schedule_40k.py │ │ └── schedule_80k.py └── ade20k │ ├── README.md │ ├── mask2former_flash_internimage_b_640_160k_ade20k_ss.py │ ├── mask2former_flash_internimage_l_640_160k_ade20k_ss.py │ ├── mask2former_flash_internimage_s_640_160k_ade20k_ss.py │ ├── mask2former_flash_internimage_s_640_160k_ade20k_ss_nsmx.py │ ├── mask2former_flash_internimage_t_512_160k_ade20k_ss.py │ ├── upernet_flash_internimage_b_512_160k_ade20k.py │ ├── upernet_flash_internimage_l_640_160k_ade20k.py │ ├── upernet_flash_internimage_s_512_160k_ade20k.py │ └── upernet_flash_internimage_t_512_160k_ade20k.py ├── deploy ├── configs │ ├── _base_ │ │ ├── backends │ │ │ └── tensorrt.py │ │ └── onnx_config.py │ └── mmseg │ │ ├── segmentation_static.py │ │ └── segmentation_tensorrt_static-512x512.py └── demo.png ├── dist_test.sh ├── dist_train.sh ├── get_flops.py ├── image_demo.py ├── mmcv_custom ├── __init__.py ├── custom_layer_decay_optimizer_constructor.py ├── layer_decay.py └── layer_decay_vit.py ├── mmseg_custom ├── __init__.py ├── core │ ├── __init__.py │ ├── anchor │ │ ├── __init__.py │ │ ├── builder.py │ │ └── point_generator.py │ ├── box │ │ ├── __init__.py │ │ ├── builder.py │ │ └── samplers │ │ │ ├── __init__.py │ │ │ ├── base_sampler.py │ │ │ ├── mask_pseudo_sampler.py │ │ │ ├── mask_sampling_result.py │ │ │ └── sampling_result.py │ ├── evaluation │ │ ├── __init__.py │ │ └── panoptic_utils.py │ ├── mask │ │ ├── __init__.py │ │ └── utils.py │ └── utils │ │ ├── __init__.py │ │ ├── dist_utils.py │ │ └── misc.py ├── datasets │ ├── __init__.py │ ├── dataset_wrappers.py │ ├── mapillary.py │ ├── 
nyu_depth_v2.py │ └── pipelines │ │ ├── __init__.py │ │ ├── formatting.py │ │ └── transform.py └── models │ ├── __init__.py │ ├── backbones │ ├── __init__.py │ └── flash_intern_image.py │ ├── builder.py │ ├── decode_heads │ ├── __init__.py │ ├── mask2former_head.py │ ├── maskformer_head.py │ └── msda.py │ ├── losses │ ├── __init__.py │ ├── cross_entropy_loss.py │ ├── dice_loss.py │ ├── focal_loss.py │ ├── match_costs.py │ └── match_loss.py │ ├── plugins │ ├── __init__.py │ ├── msdeformattn_pixel_decoder.py │ └── pixel_decoder.py │ ├── segmentors │ ├── __init__.py │ ├── encoder_decoder_mask2former.py │ └── encoder_decoder_mask2former_aug.py │ └── utils │ ├── __init__.py │ ├── assigner.py │ ├── point_sample.py │ ├── positional_encoding.py │ └── transformer.py ├── ops_dcnv3 ├── functions │ ├── __init__.py │ └── dcnv3_func.py ├── make.sh ├── modules │ ├── __init__.py │ └── dcnv3.py ├── setup.py ├── src │ ├── cpu │ │ ├── dcnv3_cpu.cpp │ │ └── dcnv3_cpu.h │ ├── cuda │ │ ├── dcnv3_cuda.cu │ │ ├── dcnv3_cuda.h │ │ └── dcnv3_im2col_cuda.cuh │ ├── dcnv3.h │ └── vision.cpp └── test.py ├── slurm_test.sh ├── slurm_train.sh ├── test.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .idea/ 3 | .DS_Store 4 | __pycache__/ 5 | classification/convertor/ 6 | segmentation/convertor/ 7 | checkpoint_dir/ 8 | demo/ 9 | detection/work_dirs 10 | *.pth 11 | ops_dcnv3/build 12 | ops_dcnv3/dist 13 | ops_dcnv3/DCNv3.egg-info 14 | DCNv4_op/DCNv4.egg-info 15 | build/ 16 | dist/ 17 | ckpts/ 18 | ckpts 19 | data 20 | data/ 21 | detection/data 22 | detection/ckpts 23 | segmentation/data 24 | segmentation/ckpts 25 | work_dirs/ 26 | output -------------------------------------------------------------------------------- /DCNv4_op/DCNv4/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import DCNv4Function, FlashDeformAttnFunction 2 | from .modules import DCNv4, FlashDeformAttn -------------------------------------------------------------------------------- /DCNv4_op/DCNv4/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # from .ms_flash_deform_attn_func import FlashMSDeformAttnFunction 10 | from .flash_deform_attn_func import FlashDeformAttnFunction 11 | from .dcnv4_func import DCNv4Function -------------------------------------------------------------------------------- /DCNv4_op/DCNv4/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .flash_deform_attn import FlashDeformAttn 10 | from .dcnv4 import DCNv4 11 | -------------------------------------------------------------------------------- /DCNv4_op/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/* 2 | include src/cuda/* 3 | -------------------------------------------------------------------------------- /DCNv4_op/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/DCNv4/4b848f7dd7da74ff03f7d278f902c6fd05b391b5/DCNv4_op/__init__.py -------------------------------------------------------------------------------- /DCNv4_op/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | python setup.py build install 11 | -------------------------------------------------------------------------------- /DCNv4_op/scripts/find_best.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | class LineParser: 4 | def __init__(self) -> None: 5 | self.data = {} 6 | 7 | def parse(self, line): 8 | def startswith(line, lst): 9 | for ele in lst: 10 | if line.startswith(ele): 11 | return True 12 | return False 13 | 14 | if not startswith(line, ['1', '2', '3', '4', '5', '6', '7', '8', '9']): 15 | return 16 | 17 | eles = line.strip().split() 18 | key = eles[0] 19 | if key not in self.data: 20 | self.data[key] = [] 21 | 22 | self.data[key].append([eles[1], float(eles[2])]) 23 | 24 | def sort(self): 25 | for k, v in self.data.items(): 26 | nv = sorted(v, key=lambda x: x[1]) 27 | self.data[k] = nv 28 | 29 | def display_best(self): 30 | for k, v in self.data.items(): 31 | print(f'{k} \t {v[0][0]} \t {v[0][1]:.4f} \t {v[1][0]} \t {v[1][1]:.4f}') 32 | 33 | def display_best_python(self, output): 34 | res = {} 35 | def parse(spec): 36 | d_stride = int(spec.split('/')[0]) 37 | thread = int(spec.split('/')[1].split('(')[0]) 38 | m = int(spec.split('(')[1].split(')')[0]) 39 | return d_stride, thread, m 40 | 41 | for k, v in self.data.items(): 42 | res[k] = parse(v[0][0]) 43 | 44 | with open(output, "w") as f: 45 | json.dump(res, f, indent=4) 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument('--input', type=str) 50 | parser.add_argument('--output', type=str) 51 | args = parser.parse_args() 52 | 53 | with open(args.input) as f: 54 | lines = f.readlines() 55 | 56 | lineparser = LineParser() 57 | for line in lines: 58 | 
lineparser.parse(line) 59 | lineparser.sort() 60 | lineparser.display_best() 61 | lineparser.display_best_python(args.output) -------------------------------------------------------------------------------- /DCNv4_op/scripts/search_bwd.sh: -------------------------------------------------------------------------------- 1 | python search_dcnv4_bwd_engine.py > res_bwd.txt 2 | python find_best.py --input res_bwd.txt --output table_bwd.py -------------------------------------------------------------------------------- /DCNv4_op/scripts/search_dcnv4_bwd_engine.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def factors(N): 4 | res = [] 5 | for i in range(1, N+1): 6 | if N % i == 0: 7 | res.append(i) 8 | return res 9 | 10 | if __name__ == '__main__': 11 | BATCH=64 12 | for N, Hin, Win in [(BATCH, 56, 56), (BATCH, 28, 28), (BATCH, 14, 14), (BATCH, 7, 7), 13 | (1, 200, 320), (1, 100, 160), (1, 50, 80), (1, 25, 40), (1, 64, 64)]: 14 | for group_channel in [16, 32, 64]: 15 | for group in [4, 5, 6, 7, 8]: 16 | for d_stride in [1, 2, 4]: 17 | for m in factors(N*Hin*Win): 18 | if m > 64: 19 | break 20 | block_thread = group * (group_channel//d_stride) * m 21 | if block_thread > 1024: 22 | break 23 | cmd = f"python search_dcnv4_bwd.py --n {N} --h {Hin} --w {Win} --g {group} --c {group_channel} --dstride {d_stride} --blockthread {block_thread} --multiplier {m}" 24 | os.system(cmd) -------------------------------------------------------------------------------- /DCNv4_op/scripts/search_dcnv4_engine.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def factors(N): 4 | res = [] 5 | for i in range(1, N+1): 6 | if N % i == 0: 7 | res.append(i) 8 | return res 9 | 10 | if __name__ == '__main__': 11 | BATCH=64 12 | for group_channel in [16, 32, 64]: 13 | for group in [4, 5, 6, 7, 8]: 14 | for N, Hin, Win in [(BATCH, 56, 56), (BATCH, 28, 28), (BATCH, 14, 14), (BATCH, 7, 7), 15 | (1, 200, 320), (1, 100, 160), (1, 50, 80), (1, 25, 40), (1, 64, 64)]: 16 | for d_stride in [2, 4, 8, 16]: 17 | for m in factors(N*Hin*Win): 18 | if m > 64: 19 | break 20 | block_thread = group * (group_channel//d_stride) * m 21 | if block_thread > 1024: 22 | break 23 | cmd = f"python search_dcnv4.py --n {N} --h {Hin} --w {Win} --g {group} --c {group_channel} --dstride {d_stride} --blockthread {block_thread} --multiplier {m}" 24 | os.system(cmd) -------------------------------------------------------------------------------- /DCNv4_op/scripts/search_fwd.sh: -------------------------------------------------------------------------------- 1 | python search_dcnv4_engine.py > res.txt 2 | python find_best.py --input res.txt --output table.py -------------------------------------------------------------------------------- /DCNv4_op/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable Convolution v4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 
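# The C++/CUDA sources under src/ (plus src/cuda/*.cu when CUDA is available)
# are compiled into a single extension module exposed as DCNv4.ext (see
# get_extensions() below); building requires a CUDA-enabled PyTorch with
# CUDA_HOME set, otherwise NotImplementedError is raised.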
14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | "-O3", 46 | ] 47 | else: 48 | raise NotImplementedError('Cuda is not available') 49 | 50 | sources = [os.path.join(extensions_dir, s) for s in sources] 51 | include_dirs = [extensions_dir] 52 | ext_modules = [ 53 | extension( 54 | "DCNv4.ext", 55 | sources, 56 | include_dirs=include_dirs, 57 | define_macros=define_macros, 58 | extra_compile_args=extra_compile_args, 59 | ) 60 | ] 61 | return ext_modules 62 | 63 | setup( 64 | name="DCNv4", 65 | version="1.0.0.post2", 66 | author="Yuwen Xiong, Feng Wang", 67 | url="", 68 | description="PyTorch Wrapper for CUDA Functions of DCNv4", 69 | packages=['DCNv4', 'DCNv4/functions', 'DCNv4/modules'], 70 | ext_modules=get_extensions(), 71 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 72 | ) 73 | -------------------------------------------------------------------------------- /DCNv4_op/src/cuda/dcnv4_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | 15 | at::Tensor dcnv4_cuda_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &p_offset, 18 | const int kernel_h, const int kernel_w, const int stride_h, 19 | const int stride_w, const int pad_h, const int pad_w, const int dilation_h, 20 | const int dilation_w, const int group, const int group_channels, 21 | const float offset_scale, const int im2col_step, const int remove_center, 22 | const int d_stride, const int block_thread, const bool softmax); 23 | 24 | std::vector<at::Tensor> 25 | dcnv4_cuda_backward( 26 | const at::Tensor &value, 27 | const at::Tensor &p_offset, 28 | const int kernel_h, const int kernel_w, const int stride_h, 29 | const int stride_w, const int pad_h, const int pad_w, const int dilation_h, 30 | const int dilation_w, const int group, const int group_channels, 31 | const float offset_scale, const int im2col_step, const at::Tensor &grad_output, 32 | const int remove_center, const int d_stride, const int block_thread, 33 | const bool softmax); -------------------------------------------------------------------------------- /DCNv4_op/src/cuda/flash_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | 15 | at::Tensor flash_deform_attn_cuda_forward( 16 | const at::Tensor &value, const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, const at::Tensor &sampling_loc_attn, 18 | const int im2col_step, const int K, const int d_stride, const int block_thread); 19 | 20 | std::vector<at::Tensor> 21 | flash_deform_attn_cuda_backward( 22 | const at::Tensor &value, const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, const at::Tensor &sampling_loc_attn, 24 | const at::Tensor &grad_output, const int im2col_step, const int K, 25 | const int d_stride, const int block_thread); -------------------------------------------------------------------------------- /DCNv4_op/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #include "dcnv4.h" 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 15 | m.def("flash_deform_attn_forward", &flash_deform_attn_forward, 16 | "flash_deform_attn_forward"); 17 | m.def("flash_deform_attn_backward", &flash_deform_attn_backward, 18 | "flash_deform_attn_backward"); 19 | m.def("dcnv4_forward", &dcnv4_forward, "dcnv4_forward"); 20 | m.def("dcnv4_backward", &dcnv4_backward, "dcnv4_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 OpenGVLab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
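Before the classification code, a quick smoke test of the `DCNv4` operator built from `DCNv4_op` above can catch installation problems early. The following is a hedged sketch, not repository code: it assumes the extension was built via `DCNv4_op/make.sh` (or installed with `pip install DCNv4`), and the constructor arguments `channels`/`group`, the flattened channel-last `(N, H*W, C)` input layout, and the separate `shape` argument are assumptions to verify against `DCNv4_op/DCNv4/modules/dcnv4.py`.

```python
# Minimal smoke test for the DCNv4 module (sketch; argument names and input
# layout are assumptions -- verify against DCNv4/modules/dcnv4.py).
import torch
from DCNv4 import DCNv4

N, H, W, C = 2, 56, 56, 64
x = torch.randn(N, H * W, C, device="cuda")  # flattened channel-last tokens
op = DCNv4(channels=C, group=4).cuda()       # 4 groups of C // 4 = 16 channels each
y = op(x, shape=(H, W))                      # spatial extent passed explicitly
assert y.shape == x.shape                    # the operator preserves the token shape
```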
22 | -------------------------------------------------------------------------------- /classification/configs/flash_intern_image_b_1k_224.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | MODEL: 4 | TYPE: flash_intern_image 5 | DROP_PATH_RATE: 0.5 6 | FLASH_INTERN_IMAGE: 7 | CORE_OP: 'DCNv4' 8 | DEPTHS: [4, 4, 21, 4] 9 | GROUPS: [7, 14, 28, 56] 10 | CHANNELS: 112 11 | LAYER_SCALE: 1e-5 12 | OFFSET_SCALE: 0.5 13 | MLP_RATIO: 4.0 14 | POST_NORM: True 15 | DW_KERNEL_SIZE: 3 16 | TRAIN: 17 | EMA: 18 | ENABLE: True 19 | DECAY: 0.9999 20 | BASE_LR: 5e-4 -------------------------------------------------------------------------------- /classification/configs/flash_intern_image_l_22kto1k_384.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_SIZE: 384 3 | IMG_ON_MEMORY: False 4 | AUG: 5 | MIXUP: 0.0 6 | CUTMIX: 0.0 7 | REPROB: 0.0 8 | MODEL: 9 | TYPE: flash_intern_image 10 | DROP_PATH_RATE: 0.1 11 | LABEL_SMOOTHING: 0.3 12 | FLASH_INTERN_IMAGE: 13 | CORE_OP: 'DCNv4' 14 | DEPTHS: [5, 5, 22, 5] 15 | GROUPS: [10, 20, 40, 80] 16 | CHANNELS: 160 17 | LAYER_SCALE: 1e-5 18 | OFFSET_SCALE: 2.0 19 | MLP_RATIO: 4.0 20 | POST_NORM: True 21 | DW_KERNEL_SIZE: 3 22 | DCN_OUTPUT_BIAS: True 23 | MLP_FC2_BIAS: True 24 | TRAIN: 25 | EMA: 26 | ENABLE: true 27 | DECAY: 0.9999 28 | EPOCHS: 20 29 | WARMUP_EPOCHS: 2 30 | WEIGHT_DECAY: 0.05 31 | BASE_LR: 2e-05 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: true 35 | LR_LAYER_DECAY_RATIO: 0.9 36 | USE_CHECKPOINT: true 37 | OPTIMIZER: 38 | DCN_LR_MUL: 0.1 39 | AMP_OPT_LEVEL: O0 40 | EVAL_FREQ: 1 -------------------------------------------------------------------------------- /classification/configs/flash_intern_image_s_1k_224.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | MODEL: 4 | TYPE: flash_intern_image 5 | DROP_PATH_RATE: 0.4 6 | FLASH_INTERN_IMAGE: 7 | CORE_OP: 'DCNv4' 8 | DEPTHS: [4, 4, 21, 4] 9 | GROUPS: [5, 10, 20, 40] 10 | CHANNELS: 80 11 | LAYER_SCALE: 1e-5 12 | OFFSET_SCALE: 1.0 13 | MLP_RATIO: 4.0 14 | POST_NORM: True 15 | DW_KERNEL_SIZE: 3 16 | TRAIN: 17 | EMA: 18 | ENABLE: True 19 | DECAY: 0.9999 20 | BASE_LR: 5e-4 21 | -------------------------------------------------------------------------------- /classification/configs/flash_intern_image_t_1k_224.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | MODEL: 4 | TYPE: flash_intern_image 5 | DROP_PATH_RATE: 0.1 6 | FLASH_INTERN_IMAGE: 7 | CORE_OP: 'DCNv4' 8 | DEPTHS: [4, 4, 18, 4] 9 | GROUPS: [4, 8, 16, 32] 10 | CHANNELS: 64 11 | OFFSET_SCALE: 1.0 12 | MLP_RATIO: 4.0 13 | TRAIN: 14 | EMA: 15 | ENABLE: True 16 | DECAY: 0.9999 17 | BASE_LR: 5e-4 18 | -------------------------------------------------------------------------------- /classification/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .build import build_loader, build_loader2 -------------------------------------------------------------------------------- /classification/dataset/zipreader.py: 
-------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import os 8 | import zipfile 9 | import io 10 | import numpy as np 11 | from PIL import Image 12 | from PIL import ImageFile 13 | 14 | ImageFile.LOAD_TRUNCATED_IMAGES = True 15 | 16 | 17 | def is_zip_path(img_or_path): 18 | """Judge whether this is a zip-style path""" 19 | return '.zip@' in img_or_path 20 | 21 | 22 | class ZipReader(object): 23 | """A class to read zipped files""" 24 | zip_bank = dict() 25 | 26 | def __init__(self): 27 | super(ZipReader, self).__init__() 28 | 29 | @staticmethod 30 | def get_zipfile(path): 31 | zip_bank = ZipReader.zip_bank 32 | if path not in zip_bank: 33 | zfile = zipfile.ZipFile(path, 'r') 34 | zip_bank[path] = zfile 35 | return zip_bank[path] 36 | 37 | @staticmethod 38 | def split_zip_style_path(path): 39 | pos_at = path.find('@') 40 | assert pos_at != -1, "character '@' is not found in the given path '%s'" % path 41 | 42 | zip_path = path[0:pos_at] 43 | folder_path = path[pos_at + 1:] 44 | folder_path = str.strip(folder_path, '/') 45 | return zip_path, folder_path 46 | 47 | @staticmethod 48 | def list_folder(path): 49 | zip_path, folder_path = ZipReader.split_zip_style_path(path) 50 | 51 | zfile = ZipReader.get_zipfile(zip_path) 52 | folder_list = [] 53 | for file_folder_name in zfile.namelist(): 54 | file_folder_name = str.strip(file_folder_name, '/') 55 | if file_folder_name.startswith(folder_path) and \ 56 | len(os.path.splitext(file_folder_name)[-1]) == 0 and \ 57 | file_folder_name != folder_path: 58 | if len(folder_path) == 0: 59 | folder_list.append(file_folder_name) 60 | else: 61 | folder_list.append(file_folder_name[len(folder_path) + 1:]) 62 | 63 | return folder_list 64 | 65 | @staticmethod 66 | def list_files(path, extension=None): 67 | if extension is None: 68 | extension = ['.*'] 69 | zip_path, folder_path = ZipReader.split_zip_style_path(path) 70 | 71 | zfile = ZipReader.get_zipfile(zip_path) 72 | file_lists = [] 73 | for file_folder_name in zfile.namelist(): 74 | file_folder_name = str.strip(file_folder_name, '/') 75 | if file_folder_name.startswith(folder_path) and \ 76 | str.lower(os.path.splitext(file_folder_name)[-1]) in extension: 77 | if len(folder_path) == 0: 78 | file_lists.append(file_folder_name) 79 | else: 80 | file_lists.append(file_folder_name[len(folder_path) + 1:]) 81 | 82 | return file_lists 83 | 84 | @staticmethod 85 | def read(path): 86 | zip_path, path_img = ZipReader.split_zip_style_path(path) 87 | zfile = ZipReader.get_zipfile(zip_path) 88 | data = zfile.read(path_img) 89 | return data 90 | 91 | @staticmethod 92 | def imread(path): 93 | zip_path, path_img = ZipReader.split_zip_style_path(path) 94 | zfile = ZipReader.get_zipfile(zip_path) 95 | data = zfile.read(path_img) 96 | try: 97 | im = Image.open(io.BytesIO(data)) 98 | except Exception: 99 | print("ERROR IMG LOADED: ", path_img) 100 | random_img = np.random.rand(224, 224, 3) * 255 101 | im = Image.fromarray(np.uint8(random_img)) 102 | return im 103 | -------------------------------------------------------------------------------- /classification/eval.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch --nproc_per_node 1 --master_port 12345 main.py --eval \ 2 | --cfg 
configs/flash_intern_image_l_22kto1k_384.yaml --data-path /path/to/imagenet1k 3 | -------------------------------------------------------------------------------- /classification/export.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | 8 | import os 9 | import time 10 | import argparse 11 | 12 | import torch 13 | from tqdm import tqdm 14 | 15 | from config import get_config 16 | from models import build_model 17 | 18 | def get_args(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--model_name', type=str, 21 | default='internimage_t_1k_224') 22 | parser.add_argument('--ckpt_dir', type=str, 23 | default='/mnt/petrelfs/share_data/huangzhenhang/code/internimage/checkpoint_dir/new/cls') 24 | parser.add_argument('--onnx', default=False, action='store_true') 25 | parser.add_argument('--trt', default=False, action='store_true') 26 | 27 | args = parser.parse_args() 28 | args.cfg = os.path.join('./configs', f'{args.model_name}.yaml') 29 | args.ckpt = os.path.join(args.ckpt_dir, f'{args.model_name}.pth') 30 | args.size = int(args.model_name.split('.')[0].split('_')[-1]) 31 | 32 | cfg = get_config(args) 33 | return args, cfg 34 | 35 | def get_model(args, cfg): 36 | model = build_model(cfg) 37 | ckpt = torch.load(args.ckpt, map_location='cpu')['model'] 38 | 39 | model.load_state_dict(ckpt) 40 | return model 41 | 42 | def speed_test(model, input): 43 | # warmup 44 | for _ in tqdm(range(100)): 45 | _ = model(input) 46 | 47 | # speed test 48 | torch.cuda.synchronize() 49 | start = time.time() 50 | for _ in tqdm(range(100)): 51 | _ = model(input) 52 | end = time.time() 53 | th = 100 / (end - start) 54 | print(f"using time: {end - start}, throughput {th}") 55 | 56 | def torch2onnx(args, cfg): 57 | model = get_model(args, cfg).cuda() 58 | 59 | # speed_test(model) 60 | 61 | onnx_name = f'{args.model_name}.onnx' 62 | torch.onnx.export(model, 63 | torch.rand(1, 3, args.size, args.size).cuda(), 64 | onnx_name, 65 | input_names=['input'], 66 | output_names=['output']) 67 | 68 | return model 69 | 70 | def onnx2trt(args): 71 | from mmdeploy.backend.tensorrt import from_onnx 72 | 73 | onnx_name = f'{args.model_name}.onnx' 74 | from_onnx( 75 | onnx_name, 76 | args.model_name, 77 | dict( 78 | input=dict( 79 | min_shape=[1, 3, args.size, args.size], 80 | opt_shape=[1, 3, args.size, args.size], 81 | max_shape=[1, 3, args.size, args.size], 82 | ) 83 | ), 84 | max_workspace_size=2**30, 85 | ) 86 | 87 | def check(args, cfg): 88 | from mmdeploy.backend.tensorrt.wrapper import TRTWrapper 89 | 90 | model = get_model(args, cfg).cuda() 91 | model.eval() 92 | trt_model = TRTWrapper(f'{args.model_name}.engine', 93 | ['output']) 94 | 95 | x = torch.randn(1, 3, args.size, args.size).cuda() 96 | 97 | torch_out = model(x) 98 | trt_out = trt_model(dict(input=x))['output'] 99 | 100 | print('torch out shape:', torch_out.shape) 101 | print('trt out shape:', trt_out.shape) 102 | 103 | print('max delta:', (torch_out - trt_out).abs().max()) 104 | print('mean delta:', (torch_out - trt_out).abs().mean()) 105 | 106 | speed_test(model, x) 107 | speed_test(trt_model, dict(input=x)) 108 | 109 | def main(): 110 | args, cfg = get_args() 111 | 112 | if args.onnx or args.trt: 113 | torch2onnx(args, cfg) 114 | print('torch -> onnx: success') 115 | 116 | if 
args.trt: 117 | onnx2trt(args) 118 | print('onnx -> trt: success') 119 | check(args, cfg) 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /classification/logger.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import os 8 | import sys 9 | import logging 10 | import functools 11 | from termcolor import colored 12 | 13 | 14 | @functools.lru_cache() 15 | def create_logger(output_dir, dist_rank=0, name=''): 16 | # create logger 17 | logger = logging.getLogger(name) 18 | logger.setLevel(logging.DEBUG) 19 | logger.propagate = False 20 | 21 | # create formatter 22 | fmt = '[%(asctime)s %(name)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s' 23 | color_fmt = colored('[%(asctime)s %(name)s]', 'green') + \ 24 | colored('(%(filename)s %(lineno)d)', 'yellow') + \ 25 | ': %(levelname)s %(message)s' 26 | 27 | # create console handlers for master process 28 | if dist_rank == 0: 29 | console_handler = logging.StreamHandler(sys.stdout) 30 | console_handler.setLevel(logging.DEBUG) 31 | console_handler.setFormatter( 32 | logging.Formatter(fmt=color_fmt, datefmt='%Y-%m-%d %H:%M:%S')) 33 | logger.addHandler(console_handler) 34 | 35 | # create file handlers 36 | file_handler = logging.FileHandler(os.path.join( 37 | output_dir, f'log_rank{dist_rank}.txt'), 38 | mode='a') 39 | file_handler.setLevel(logging.DEBUG) 40 | file_handler.setFormatter( 41 | logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S')) 42 | logger.addHandler(file_handler) 43 | 44 | return logger 45 | -------------------------------------------------------------------------------- /classification/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | 8 | import torch 9 | from timm.scheduler.cosine_lr import CosineLRScheduler 10 | from timm.scheduler.step_lr import StepLRScheduler 11 | from timm.scheduler.scheduler import Scheduler 12 | 13 | 14 | def build_scheduler(config, optimizer, n_iter_per_epoch): 15 | num_steps = int(config.TRAIN.EPOCHS * n_iter_per_epoch) 16 | warmup_steps = int(config.TRAIN.WARMUP_EPOCHS * n_iter_per_epoch) 17 | decay_steps = int(config.TRAIN.LR_SCHEDULER.DECAY_EPOCHS * 18 | n_iter_per_epoch) 19 | 20 | lr_scheduler = None 21 | if config.TRAIN.LR_SCHEDULER.NAME == 'cosine': 22 | lr_scheduler = CosineLRScheduler( 23 | optimizer, 24 | t_initial=num_steps, 25 | # t_mul=1., 26 | lr_min=config.TRAIN.MIN_LR, 27 | warmup_lr_init=config.TRAIN.WARMUP_LR, 28 | warmup_t=warmup_steps, 29 | cycle_limit=1, 30 | t_in_epochs=False, 31 | ) 32 | elif config.TRAIN.LR_SCHEDULER.NAME == 'linear': 33 | lr_scheduler = LinearLRScheduler( 34 | optimizer, 35 | t_initial=num_steps, 36 | lr_min_rate=0.01, 37 | warmup_lr_init=config.TRAIN.WARMUP_LR, 38 | warmup_t=warmup_steps, 39 | t_in_epochs=False, 40 | ) 41 | elif config.TRAIN.LR_SCHEDULER.NAME == 'step': 42 | lr_scheduler = StepLRScheduler( 43 | optimizer, 44 | decay_t=decay_steps, 45 | decay_rate=config.TRAIN.LR_SCHEDULER.DECAY_RATE, 46 | 
warmup_lr_init=config.TRAIN.WARMUP_LR, 47 | warmup_t=warmup_steps, 48 | t_in_epochs=False, 49 | ) 50 | 51 | return lr_scheduler 52 | 53 | 54 | class LinearLRScheduler(Scheduler): 55 | 56 | def __init__( 57 | self, 58 | optimizer: torch.optim.Optimizer, 59 | t_initial: int, 60 | lr_min_rate: float, 61 | warmup_t=0, 62 | warmup_lr_init=0., 63 | t_in_epochs=True, 64 | noise_range_t=None, 65 | noise_pct=0.67, 66 | noise_std=1.0, 67 | noise_seed=42, 68 | initialize=True, 69 | ) -> None: 70 | super().__init__(optimizer, 71 | param_group_field="lr", 72 | noise_range_t=noise_range_t, 73 | noise_pct=noise_pct, 74 | noise_std=noise_std, 75 | noise_seed=noise_seed, 76 | initialize=initialize) 77 | 78 | self.t_initial = t_initial 79 | self.lr_min_rate = lr_min_rate 80 | self.warmup_t = warmup_t 81 | self.warmup_lr_init = warmup_lr_init 82 | self.t_in_epochs = t_in_epochs 83 | if self.warmup_t: 84 | self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t 85 | for v in self.base_values] 86 | super().update_groups(self.warmup_lr_init) 87 | else: 88 | self.warmup_steps = [1 for _ in self.base_values] 89 | 90 | def _get_lr(self, t): 91 | if t < self.warmup_t: 92 | lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] 93 | else: 94 | t = t - self.warmup_t 95 | total_t = self.t_initial - self.warmup_t 96 | lrs = [ 97 | v - ((v - v * self.lr_min_rate) * (t / total_t)) 98 | for v in self.base_values 99 | ] 100 | return lrs 101 | 102 | def get_epoch_values(self, epoch: int): 103 | if self.t_in_epochs: 104 | return self._get_lr(epoch) 105 | else: 106 | return None 107 | 108 | def get_update_values(self, num_updates: int): 109 | if not self.t_in_epochs: 110 | return self._get_lr(num_updates) 111 | else: 112 | return None 113 | -------------------------------------------------------------------------------- /classification/meta_data/meta: -------------------------------------------------------------------------------- 1 | /mnt/petrelfs/share/images/meta/ -------------------------------------------------------------------------------- /classification/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .build import build_model -------------------------------------------------------------------------------- /classification/models/build.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | 8 | from .intern_image import InternImage 9 | from .flash_intern_image import FlashInternImage 10 | 11 | def build_model(config): 12 | model_type = config.MODEL.TYPE 13 | if model_type == 'intern_image': 14 | model = InternImage( 15 | core_op=config.MODEL.INTERN_IMAGE.CORE_OP, 16 | num_classes=config.MODEL.NUM_CLASSES, 17 | channels=config.MODEL.INTERN_IMAGE.CHANNELS, 18 | depths=config.MODEL.INTERN_IMAGE.DEPTHS, 19 | groups=config.MODEL.INTERN_IMAGE.GROUPS, 20 | layer_scale=config.MODEL.INTERN_IMAGE.LAYER_SCALE, 21 | offset_scale=config.MODEL.INTERN_IMAGE.OFFSET_SCALE, 22 | post_norm=config.MODEL.INTERN_IMAGE.POST_NORM, 23 | 
mlp_ratio=config.MODEL.INTERN_IMAGE.MLP_RATIO, 24 | with_cp=config.TRAIN.USE_CHECKPOINT, 25 | drop_path_rate=config.MODEL.DROP_PATH_RATE, 26 | res_post_norm=config.MODEL.INTERN_IMAGE.RES_POST_NORM, # for InternImage-H/G 27 | dw_kernel_size=config.MODEL.INTERN_IMAGE.DW_KERNEL_SIZE, # for InternImage-H/G 28 | use_clip_projector=config.MODEL.INTERN_IMAGE.USE_CLIP_PROJECTOR, # for InternImage-H/G 29 | level2_post_norm=config.MODEL.INTERN_IMAGE.LEVEL2_POST_NORM, # for InternImage-H/G 30 | level2_post_norm_block_ids=config.MODEL.INTERN_IMAGE.LEVEL2_POST_NORM_BLOCK_IDS, # for InternImage-H/G 31 | center_feature_scale=config.MODEL.INTERN_IMAGE.CENTER_FEATURE_SCALE # for InternImage-H/G 32 | ) 33 | elif model_type == 'flash_intern_image': 34 | model = FlashInternImage( 35 | core_op=config.MODEL.FLASH_INTERN_IMAGE.CORE_OP, 36 | num_classes=config.MODEL.NUM_CLASSES, 37 | channels=config.MODEL.FLASH_INTERN_IMAGE.CHANNELS, 38 | depths=config.MODEL.FLASH_INTERN_IMAGE.DEPTHS, 39 | groups=config.MODEL.FLASH_INTERN_IMAGE.GROUPS, 40 | layer_scale=config.MODEL.FLASH_INTERN_IMAGE.LAYER_SCALE, 41 | offset_scale=config.MODEL.FLASH_INTERN_IMAGE.OFFSET_SCALE, 42 | post_norm=config.MODEL.FLASH_INTERN_IMAGE.POST_NORM, 43 | mlp_ratio=config.MODEL.FLASH_INTERN_IMAGE.MLP_RATIO, 44 | with_cp=config.TRAIN.USE_CHECKPOINT, 45 | drop_path_rate=config.MODEL.DROP_PATH_RATE, 46 | mlp_fc2_bias=config.MODEL.FLASH_INTERN_IMAGE.MLP_FC2_BIAS, 47 | dcn_output_bias=config.MODEL.FLASH_INTERN_IMAGE.DCN_OUTPUT_BIAS, 48 | res_post_norm=config.MODEL.FLASH_INTERN_IMAGE.RES_POST_NORM, # for InternImage-H/G 49 | dw_kernel_size=config.MODEL.FLASH_INTERN_IMAGE.DW_KERNEL_SIZE, 50 | use_clip_projector=config.MODEL.FLASH_INTERN_IMAGE.USE_CLIP_PROJECTOR, # for InternImage-H/G 51 | level2_post_norm=config.MODEL.FLASH_INTERN_IMAGE.LEVEL2_POST_NORM, # for InternImage-H/G 52 | level2_post_norm_block_ids=config.MODEL.FLASH_INTERN_IMAGE.LEVEL2_POST_NORM_BLOCK_IDS, # for InternImage-H/G 53 | center_feature_scale=config.MODEL.FLASH_INTERN_IMAGE.CENTER_FEATURE_SCALE # for InternImage-H/G 54 | ) 55 | else: 56 | raise NotImplementedError(f"Unknown model: {model_type}") 57 | 58 | return model 59 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch 8 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # -------------------------------------------------------- 3 | # DCNv4 4 | # Copyright (c) 2024 OpenGVLab 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | python setup.py build install 9 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for 
details] 5 | # -------------------------------------------------------- 6 | 7 | from .dcnv3 import DCNv3, DCNv3_pytorch -------------------------------------------------------------------------------- /classification/ops_dcnv3/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import os 8 | import glob 9 | 10 | import torch 11 | 12 | from torch.utils.cpp_extension import CUDA_HOME 13 | from torch.utils.cpp_extension import CppExtension 14 | from torch.utils.cpp_extension import CUDAExtension 15 | 16 | from setuptools import find_packages 17 | from setuptools import setup 18 | 19 | requirements = ["torch", "torchvision"] 20 | 21 | 22 | def get_extensions(): 23 | this_dir = os.path.dirname(os.path.abspath(__file__)) 24 | extensions_dir = os.path.join(this_dir, "src") 25 | 26 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 27 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 28 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 29 | 30 | sources = main_file + source_cpu 31 | extension = CppExtension 32 | extra_compile_args = {"cxx": []} 33 | define_macros = [] 34 | 35 | if torch.cuda.is_available() and CUDA_HOME is not None: 36 | extension = CUDAExtension 37 | sources += source_cuda 38 | define_macros += [("WITH_CUDA", None)] 39 | extra_compile_args["nvcc"] = [ 40 | # "-DCUDA_HAS_FP16=1", 41 | # "-D__CUDA_NO_HALF_OPERATORS__", 42 | # "-D__CUDA_NO_HALF_CONVERSIONS__", 43 | # "-D__CUDA_NO_HALF2_OPERATORS__", 44 | ] 45 | else: 46 | raise NotImplementedError('Cuda is not available') 47 | 48 | sources = [os.path.join(extensions_dir, s) for s in sources] 49 | include_dirs = [extensions_dir] 50 | ext_modules = [ 51 | extension( 52 | "DCNv3", 53 | sources, 54 | include_dirs=include_dirs, 55 | define_macros=define_macros, 56 | extra_compile_args=extra_compile_args, 57 | ) 58 | ] 59 | return ext_modules 60 | 61 | 62 | setup( 63 | name="DCNv3", 64 | version="1.1", 65 | author="InternImage", 66 | url="https://github.com/OpenGVLab/InternImage", 67 | description= 68 | "PyTorch Wrapper for CUDA Functions of DCNv3", 69 | packages=find_packages(exclude=( 70 | "configs", 71 | "tests", 72 | )), 73 | ext_modules=get_extensions(), 74 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 75 | ) 76 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/src/cpu/dcnv3_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #include <vector> 13 | 14 | #include <ATen/ATen.h> 15 | #include <ATen/cuda/CUDAContext.h> 16 | 17 | at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset, 18 | const at::Tensor &mask, const int kernel_h, 19 | const int kernel_w, const int stride_h, 20 | const int stride_w, const int pad_h, 21 | const int pad_w, const int dilation_h, 22 | const int dilation_w, const int group, 23 | const int group_channels, const float offset_scale, 24 | const int im2col_step) { 25 | AT_ERROR("Not implemented on the CPU"); 26 | } 27 | 28 | std::vector<at::Tensor> 29 | dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset, 30 | const at::Tensor &mask, const int kernel_h, 31 | const int kernel_w, const int stride_h, const int stride_w, 32 | const int pad_h, const int pad_w, const int dilation_h, 33 | const int dilation_w, const int group, 34 | const int group_channels, const float offset_scale, 35 | const at::Tensor &grad_output, const int im2col_step) { 36 | AT_ERROR("Not implemented on the CPU"); 37 | } 38 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/src/cpu/dcnv3_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | 15 | at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset, 16 | const at::Tensor &mask, const int kernel_h, 17 | const int kernel_w, const int stride_h, 18 | const int stride_w, const int pad_h, 19 | const int pad_w, const int dilation_h, 20 | const int dilation_w, const int group, 21 | const int group_channels, const float offset_scale, 22 | const int im2col_step); 23 | 24 | std::vector<at::Tensor> 25 | dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset, 26 | const at::Tensor &mask, const int kernel_h, 27 | const int kernel_w, const int stride_h, const int stride_w, 28 | const int pad_h, const int pad_w, const int dilation_h, 29 | const int dilation_w, const int group, 30 | const int group_channels, const float offset_scale, 31 | const at::Tensor &grad_output, const int im2col_step); 32 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/src/cuda/dcnv3_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | 15 | at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset, 16 | const at::Tensor &mask, const int kernel_h, 17 | const int kernel_w, const int stride_h, 18 | const int stride_w, const int pad_h, 19 | const int pad_w, const int dilation_h, 20 | const int dilation_w, const int group, 21 | const int group_channels, 22 | const float offset_scale, const int im2col_step, const int remove_center); 23 | 24 | std::vector<at::Tensor> 25 | dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, 26 | const at::Tensor &mask, const int kernel_h, 27 | const int kernel_w, const int stride_h, const int stride_w, 28 | const int pad_h, const int pad_w, const int dilation_h, 29 | const int dilation_w, const int group, 30 | const int group_channels, const float offset_scale, 31 | const at::Tensor &grad_output, const int im2col_step, const int remove_center); 32 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/src/dcnv3.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | 14 | #include "cpu/dcnv3_cpu.h" 15 | 16 | #ifdef WITH_CUDA 17 | #include "cuda/dcnv3_cuda.h" 18 | #endif 19 | 20 | at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset, 21 | const at::Tensor &mask, const int kernel_h, 22 | const int kernel_w, const int stride_h, 23 | const int stride_w, const int pad_h, const int pad_w, 24 | const int dilation_h, const int dilation_w, 25 | const int group, const int group_channels, 26 | const float offset_scale, const int im2col_step, const int remove_center) { 27 | if (input.type().is_cuda()) { 28 | #ifdef WITH_CUDA 29 | return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w, 30 | stride_h, stride_w, pad_h, pad_w, dilation_h, 31 | dilation_w, group, group_channels, 32 | offset_scale, im2col_step, remove_center); 33 | #else 34 | AT_ERROR("Not compiled with GPU support"); 35 | #endif 36 | } 37 | AT_ERROR("Not implemented on the CPU"); 38 | } 39 | 40 | std::vector<at::Tensor> 41 | dcnv3_backward(const at::Tensor &input, const at::Tensor &offset, 42 | const at::Tensor &mask, const int kernel_h, const int kernel_w, 43 | const int stride_h, const int stride_w, const int pad_h, 44 | const int pad_w, const int dilation_h, const int dilation_w, 45 | const int group, const int group_channels, 46 | const float offset_scale, const at::Tensor 
&grad_output, 47 | const int im2col_step, const int remove_center) { 48 | if (input.type().is_cuda()) { 49 | #ifdef WITH_CUDA 50 | return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w, 51 | stride_h, stride_w, pad_h, pad_w, dilation_h, 52 | dilation_w, group, group_channels, 53 | offset_scale, grad_output, im2col_step, remove_center); 54 | #else 55 | AT_ERROR("Not compiled with GPU support"); 56 | #endif 57 | } 58 | AT_ERROR("Not implemented on the CPU"); 59 | } 60 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #include "dcnv3.h" 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 15 | m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward"); 16 | m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward"); 17 | } 18 | -------------------------------------------------------------------------------- /classification/train_in1k.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | WORK_DIR=$4 9 | GPUS=${GPUS:-1} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-1} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 12 | SRUN_ARGS=${SRUN_ARGS:-""} 13 | PY_ARGS=${@:5} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | --quotatype=reserved \ 24 | ${SRUN_ARGS} \ 25 | python -u main.py \ 26 | --cfg ${CONFIG} \ 27 | --accumulation-steps 1 \ 28 | --local-rank 0 \ 29 | --batch-size 128 \ 30 | --data-path /mnt/petrelfs/share/images \ 31 | --output work_dirs ${@:4} --launcher="slurm" 32 | -------------------------------------------------------------------------------- /classification/train_in1k_deepspeed.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | GPUS=${GPUS:-8} 9 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 10 | CPUS_PER_TASK=${CPUS_PER_TASK:-12} 11 | SRUN_ARGS=${SRUN_ARGS:-""} 12 | 13 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 14 | srun -p ${PARTITION} \ 15 | --job-name=${JOB_NAME} \ 16 | --gres=gpu:${GPUS_PER_NODE} \ 17 | --ntasks=${GPUS} \ 18 | --ntasks-per-node=${GPUS_PER_NODE} \ 19 | --cpus-per-task=${CPUS_PER_TASK} \ 20 | --kill-on-bad-exit=1 \ 21 | --quotatype=spot \ 22 | ${SRUN_ARGS} \ 23 | python -u main_deepspeed.py \ 24 | --cfg ${CONFIG} \ 25 | --local-rank 0 \ 26 | --data-path /mnt/lustre/share/images \ 27 | --output work_dirs_deepspeed ${@:4} 28 | -------------------------------------------------------------------------------- /detection/README.md: 
-------------------------------------------------------------------------------- 1 | # FlashInternImage for Object Detection 2 | 3 | This folder contains the implementation of FlashInternImage for object detection. 4 | 5 | Our detection code is developed on top of [MMDetection v2.28.1](https://github.com/open-mmlab/mmdetection/tree/v2.28.1). 6 | 7 | 8 | ## Usage 9 | 10 | ### Install 11 | 12 | - Clone this repo: 13 | 14 | ```bash 15 | git clone https://github.com/OpenGVLab/DCNv4.git 16 | cd DCNv4 17 | ``` 18 | 19 | - Create a conda virtual environment and activate it: 20 | 21 | ```bash 22 | conda create -n dcnv4 python=3.7 -y 23 | conda activate dcnv4 24 | ``` 25 | 26 | - Install `CUDA>=10.2` with `cudnn>=7` following 27 | the [official installation instructions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) 28 | - Install `PyTorch>=1.10.0` and `torchvision>=0.9.0` with `CUDA>=10.2`: 29 | 30 | For example, to install torch==1.11 with CUDA==11.3: 31 | ```bash 32 | pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html 33 | ``` 34 | 35 | - Install `timm==0.6.11` and `mmcv-full==1.5.0`: 36 | 37 | ```bash 38 | pip install -U openmim 39 | mim install mmcv-full==1.5.0 40 | pip install timm==0.6.11 mmdet==2.28.1 41 | ``` 42 | 43 | - Install other requirements: 44 | 45 | ```bash 46 | pip install opencv-python termcolor yacs pyyaml scipy 47 | ``` 48 | 49 | - Install DCNv4 50 | ```bash 51 | pip install DCNv4 52 | ``` 53 | 54 | 55 | ### Data Preparation 56 | 57 | Prepare COCO according to the guidelines in [MMDetection v2.28.1](https://github.com/open-mmlab/mmdetection/resolve/master/docs/en/1_exist_data_model.md). 58 | 59 | 60 | ### Evaluation 61 | 62 | To evaluate our `FlashInternImage` on COCO val, run: 63 | 64 | ```bash 65 | sh dist_test.sh <config-file> <checkpoint> <gpu-num> --eval bbox segm 66 | ``` 67 | 68 | For example, to evaluate the `FlashInternImage-T` with a single GPU: 69 | 70 | ```bash 71 | python test.py configs/coco/mask_rcnn_flash_intern_image_t_fpn_1x_coco.py checkpoint_dir/det/mask_rcnn_flash_internimage_t_fpn_1x_coco.pth --eval bbox segm 72 | ``` 73 | 74 | For example, to evaluate the `FlashInternImage-B` on a single node with 8 GPUs: 75 | 76 | ```bash 77 | sh dist_test.sh configs/coco/mask_rcnn_flash_intern_image_b_fpn_1x_coco.py checkpoint_dir/det/mask_rcnn_flash_internimage_b_fpn_1x_coco.pth 8 --eval bbox segm 78 | ``` 79 | 80 | ### Training on COCO 81 | 82 | To train a `FlashInternImage` on COCO, run: 83 | 84 | ```bash 85 | sh dist_train.sh <config-file> <gpu-num> 86 | ``` 87 | 88 | For example, to train `FlashInternImage-T` with 8 GPUs on 1 node, run: 89 | 90 | ```bash 91 | sh dist_train.sh configs/coco/mask_rcnn_flash_intern_image_t_fpn_1x_coco.py 8 92 | ``` 93 | 94 | -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/coco_detection.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CocoDataset' 3 | data_root = 'data/coco/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True), 9 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img',
'gt_bboxes', 'gt_labels']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(1333, 800), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict( 35 | type=dataset_type, 36 | ann_file=data_root + 'annotations/instances_train2017.json', 37 | img_prefix=data_root + 'train2017/', 38 | pipeline=train_pipeline), 39 | val=dict( 40 | type=dataset_type, 41 | ann_file=data_root + 'annotations/instances_val2017.json', 42 | img_prefix=data_root + 'val2017/', 43 | pipeline=test_pipeline), 44 | test=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 'annotations/instances_val2017.json', 47 | img_prefix=data_root + 'val2017/', 48 | pipeline=test_pipeline)) 49 | evaluation = dict(interval=1, metric='bbox', classwise=True) -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/coco_instance.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CocoDataset' 3 | data_root = 'data/coco/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 9 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(1333, 800), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict( 35 | type=dataset_type, 36 | ann_file=data_root + 'annotations/instances_train2017.json', 37 | img_prefix=data_root + 'train2017/', 38 | pipeline=train_pipeline), 39 | val=dict( 40 | type=dataset_type, 41 | ann_file=data_root + 'annotations/instances_val2017.json', 42 | img_prefix=data_root + 'val2017/', 43 | pipeline=test_pipeline), 44 | test=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 'annotations/instances_val2017.json', 47 | img_prefix=data_root + 'val2017/', 48 | pipeline=test_pipeline)) 49 | evaluation = dict(metric=['bbox', 'segm'], classwise=True) 50 | -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/crowd_human.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CrowdHumanDataset' 3 | data_root = 'data/CrowdHuman/' 4 | classes = ('person',) 5 | img_norm_cfg = dict( 6 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 
9 | dict(type='LoadAnnotations', with_bbox=True), 10 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size_divisor=32), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict( 20 | type='MultiScaleFlipAug', 21 | img_scale=(1333, 800), 22 | flip=False, 23 | transforms=[ 24 | dict(type='Resize', keep_ratio=True), 25 | dict(type='RandomFlip'), 26 | dict(type='Normalize', **img_norm_cfg), 27 | dict(type='Pad', size_divisor=32), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | samples_per_gpu=2, 34 | workers_per_gpu=2, 35 | train=dict( 36 | type=dataset_type, 37 | classes=classes, 38 | filter_empty_gt=True, 39 | ann_file=data_root + 'annotations/annotation_train.json', 40 | img_prefix=data_root + 'Images', 41 | pipeline=train_pipeline), 42 | val=dict( 43 | type=dataset_type, 44 | classes=classes, 45 | ann_file=data_root + 'annotations/annotation_val.json', 46 | img_prefix=data_root + 'Images', 47 | pipeline=test_pipeline), 48 | test=dict( 49 | type=dataset_type, 50 | classes=classes, 51 | ann_file=data_root + 'annotations/annotation_val.json', 52 | img_prefix=data_root + 'Images', 53 | pipeline=test_pipeline)) 54 | evaluation = dict(interval=100, metric='bbox') 55 | -------------------------------------------------------------------------------- /detection/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | checkpoint_config = dict(interval=1) 2 | # yapf:disable 3 | log_config = dict( 4 | interval=50, 5 | hooks=[ 6 | dict(type='TextLoggerHook'), 7 | # dict(type='TensorboardLoggerHook') 8 | ]) 9 | # yapf:enable 10 | custom_hooks = [dict(type='NumClassCheckHook')] 11 | 12 | dist_params = dict(backend='nccl') 13 | log_level = 'INFO' 14 | load_from = None 15 | resume_from = None 16 | workflow = [('train', 1)] 17 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/fast_rcnn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='FastRCNN', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=4, 8 | out_indices=(0, 1, 2, 3), 9 | frozen_stages=1, 10 | norm_cfg=dict(type='BN', requires_grad=True), 11 | norm_eval=True, 12 | style='pytorch', 13 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | roi_head=dict( 20 | type='StandardRoIHead', 21 | bbox_roi_extractor=dict( 22 | type='SingleRoIExtractor', 23 | roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), 24 | out_channels=256, 25 | featmap_strides=[4, 8, 16, 32]), 26 | bbox_head=dict( 27 | type='Shared2FCBBoxHead', 28 | in_channels=256, 29 | fc_out_channels=1024, 30 | roi_feat_size=7, 31 | num_classes=80, 32 | bbox_coder=dict( 33 | type='DeltaXYWHBBoxCoder', 34 | target_means=[0., 0., 0., 0.], 35 | target_stds=[0.1, 0.1, 0.2, 0.2]), 36 | reg_class_agnostic=False, 37 | loss_cls=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 39 | loss_bbox=dict(type='L1Loss', loss_weight=1.0))), 40 | # model training and testing settings 41 
| train_cfg=dict( 42 | rcnn=dict( 43 | assigner=dict( 44 | type='MaxIoUAssigner', 45 | pos_iou_thr=0.5, 46 | neg_iou_thr=0.5, 47 | min_pos_iou=0.5, 48 | match_low_quality=False, 49 | ignore_iof_thr=-1), 50 | sampler=dict( 51 | type='RandomSampler', 52 | num=512, 53 | pos_fraction=0.25, 54 | neg_pos_ub=-1, 55 | add_gt_as_proposals=True), 56 | pos_weight=-1, 57 | debug=False)), 58 | test_cfg=dict( 59 | rcnn=dict( 60 | score_thr=0.05, 61 | nms=dict(type='nms', iou_threshold=0.5), 62 | max_per_img=100))) 63 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/faster_rcnn_r50_caffe_dc5.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='BN', requires_grad=False) 3 | model = dict( 4 | type='FasterRCNN', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | strides=(1, 2, 2, 1), 10 | dilations=(1, 1, 1, 2), 11 | out_indices=(3, ), 12 | frozen_stages=1, 13 | norm_cfg=norm_cfg, 14 | norm_eval=True, 15 | style='caffe', 16 | init_cfg=dict( 17 | type='Pretrained', 18 | checkpoint='open-mmlab://detectron2/resnet50_caffe')), 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=2048, 22 | feat_channels=2048, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[2, 4, 8, 16, 32], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[16]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | roi_head=dict( 36 | type='StandardRoIHead', 37 | bbox_roi_extractor=dict( 38 | type='SingleRoIExtractor', 39 | roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), 40 | out_channels=2048, 41 | featmap_strides=[16]), 42 | bbox_head=dict( 43 | type='Shared2FCBBoxHead', 44 | in_channels=2048, 45 | fc_out_channels=1024, 46 | roi_feat_size=7, 47 | num_classes=80, 48 | bbox_coder=dict( 49 | type='DeltaXYWHBBoxCoder', 50 | target_means=[0., 0., 0., 0.], 51 | target_stds=[0.1, 0.1, 0.2, 0.2]), 52 | reg_class_agnostic=False, 53 | loss_cls=dict( 54 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 55 | loss_bbox=dict(type='L1Loss', loss_weight=1.0))), 56 | # model training and testing settings 57 | train_cfg=dict( 58 | rpn=dict( 59 | assigner=dict( 60 | type='MaxIoUAssigner', 61 | pos_iou_thr=0.7, 62 | neg_iou_thr=0.3, 63 | min_pos_iou=0.3, 64 | match_low_quality=True, 65 | ignore_iof_thr=-1), 66 | sampler=dict( 67 | type='RandomSampler', 68 | num=256, 69 | pos_fraction=0.5, 70 | neg_pos_ub=-1, 71 | add_gt_as_proposals=False), 72 | allowed_border=0, 73 | pos_weight=-1, 74 | debug=False), 75 | rpn_proposal=dict( 76 | nms_pre=12000, 77 | max_per_img=2000, 78 | nms=dict(type='nms', iou_threshold=0.7), 79 | min_bbox_size=0), 80 | rcnn=dict( 81 | assigner=dict( 82 | type='MaxIoUAssigner', 83 | pos_iou_thr=0.5, 84 | neg_iou_thr=0.5, 85 | min_pos_iou=0.5, 86 | match_low_quality=False, 87 | ignore_iof_thr=-1), 88 | sampler=dict( 89 | type='RandomSampler', 90 | num=512, 91 | pos_fraction=0.25, 92 | neg_pos_ub=-1, 93 | add_gt_as_proposals=True), 94 | pos_weight=-1, 95 | debug=False)), 96 | test_cfg=dict( 97 | rpn=dict( 98 | nms=dict(type='nms', iou_threshold=0.7), 99 | nms_pre=6000, 100 | max_per_img=1000, 101 | min_bbox_size=0), 102 | rcnn=dict( 103 | score_thr=0.05, 104 | nms=dict(type='nms', iou_threshold=0.5), 105 | 
max_per_img=100))) 106 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/retinanet_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=4, 8 | out_indices=(0, 1, 2, 3), 9 | frozen_stages=1, 10 | norm_cfg=dict(type='BN', requires_grad=True), 11 | norm_eval=True, 12 | style='pytorch', 13 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | start_level=1, 19 | add_extra_convs='on_input', 20 | num_outs=5), 21 | bbox_head=dict( 22 | type='RetinaHead', 23 | num_classes=80, 24 | in_channels=256, 25 | stacked_convs=4, 26 | feat_channels=256, 27 | anchor_generator=dict( 28 | type='AnchorGenerator', 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | ratios=[0.5, 1.0, 2.0], 32 | strides=[8, 16, 32, 64, 128]), 33 | bbox_coder=dict( 34 | type='DeltaXYWHBBoxCoder', 35 | target_means=[.0, .0, .0, .0], 36 | target_stds=[1.0, 1.0, 1.0, 1.0]), 37 | loss_cls=dict( 38 | type='FocalLoss', 39 | use_sigmoid=True, 40 | gamma=2.0, 41 | alpha=0.25, 42 | loss_weight=1.0), 43 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 44 | # model training and testing settings 45 | train_cfg=dict( 46 | assigner=dict( 47 | type='MaxIoUAssigner', 48 | pos_iou_thr=0.5, 49 | neg_iou_thr=0.4, 50 | min_pos_iou=0, 51 | ignore_iof_thr=-1), 52 | allowed_border=-1, 53 | pos_weight=-1, 54 | debug=False), 55 | test_cfg=dict( 56 | nms_pre=1000, 57 | min_bbox_size=0, 58 | score_thr=0.05, 59 | nms=dict(type='nms', iou_threshold=0.5), 60 | max_per_img=100)) 61 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/rpn_r50_caffe_c4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=3, 8 | strides=(1, 2, 2), 9 | dilations=(1, 1, 1), 10 | out_indices=(2, ), 11 | frozen_stages=1, 12 | norm_cfg=dict(type='BN', requires_grad=False), 13 | norm_eval=True, 14 | style='caffe', 15 | init_cfg=dict( 16 | type='Pretrained', 17 | checkpoint='open-mmlab://detectron2/resnet50_caffe')), 18 | neck=None, 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=1024, 22 | feat_channels=1024, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[2, 4, 8, 16, 32], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[16]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | # model training and testing settings 36 | train_cfg=dict( 37 | rpn=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.7, 41 | neg_iou_thr=0.3, 42 | min_pos_iou=0.3, 43 | ignore_iof_thr=-1), 44 | sampler=dict( 45 | type='RandomSampler', 46 | num=256, 47 | pos_fraction=0.5, 48 | neg_pos_ub=-1, 49 | add_gt_as_proposals=False), 50 | allowed_border=0, 51 | pos_weight=-1, 52 | debug=False)), 53 | test_cfg=dict( 54 | rpn=dict( 55 | nms_pre=12000, 56 | max_per_img=2000, 57 | nms=dict(type='nms', iou_threshold=0.7), 58 | min_bbox_size=0))) 59 | 
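The RPN heads above expand their `scales`/`ratios`/`strides` settings into a fixed set of base anchors per feature-map location (5 scales x 3 ratios = 15 anchors for the C4 config above). A rough, self-contained sketch of that expansion using the usual sqrt-ratio convention -- illustrative, not MMDetection's exact `AnchorGenerator` implementation:

```python
# Sketch: how scales x ratios become base anchors at one stride.
# For the rpn_r50_caffe_c4 settings above this yields 15 anchors.
import itertools
import math

stride = 16
scales = [2, 4, 8, 16, 32]
ratios = [0.5, 1.0, 2.0]

base_anchors = []
for scale, ratio in itertools.product(scales, ratios):
    base = scale * stride                 # anchor edge before aspect ratio
    w = base * math.sqrt(1 / ratio)       # wider for ratio < 1
    h = base * math.sqrt(ratio)           # taller for ratio > 1
    base_anchors.append((-w / 2, -h / 2, w / 2, h / 2))  # centered box

print(len(base_anchors))  # 15 anchors per feature-map location
```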
-------------------------------------------------------------------------------- /detection/configs/_base_/models/rpn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=4, 8 | out_indices=(0, 1, 2, 3), 9 | frozen_stages=1, 10 | norm_cfg=dict(type='BN', requires_grad=True), 11 | norm_eval=True, 12 | style='pytorch', 13 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=256, 22 | feat_channels=256, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[8], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[4, 8, 16, 32, 64]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | # model training and testing settings 36 | train_cfg=dict( 37 | rpn=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.7, 41 | neg_iou_thr=0.3, 42 | min_pos_iou=0.3, 43 | ignore_iof_thr=-1), 44 | sampler=dict( 45 | type='RandomSampler', 46 | num=256, 47 | pos_fraction=0.5, 48 | neg_pos_ub=-1, 49 | add_gt_as_proposals=False), 50 | allowed_border=0, 51 | pos_weight=-1, 52 | debug=False)), 53 | test_cfg=dict( 54 | rpn=dict( 55 | nms_pre=2000, 56 | max_per_img=1000, 57 | nms=dict(type='nms', iou_threshold=0.7), 58 | min_bbox_size=0))) 59 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/ssd300.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | input_size = 300 3 | model = dict( 4 | type='SingleStageDetector', 5 | backbone=dict( 6 | type='SSDVGG', 7 | depth=16, 8 | with_last_pool=False, 9 | ceil_mode=True, 10 | out_indices=(3, 4), 11 | out_feature_indices=(22, 34), 12 | init_cfg=dict( 13 | type='Pretrained', checkpoint='open-mmlab://vgg16_caffe')), 14 | neck=dict( 15 | type='SSDNeck', 16 | in_channels=(512, 1024), 17 | out_channels=(512, 1024, 512, 256, 256, 256), 18 | level_strides=(2, 2, 1, 1), 19 | level_paddings=(1, 1, 0, 0), 20 | l2_norm_scale=20), 21 | bbox_head=dict( 22 | type='SSDHead', 23 | in_channels=(512, 1024, 512, 256, 256, 256), 24 | num_classes=80, 25 | anchor_generator=dict( 26 | type='SSDAnchorGenerator', 27 | scale_major=False, 28 | input_size=input_size, 29 | basesize_ratio_range=(0.15, 0.9), 30 | strides=[8, 16, 32, 64, 100, 300], 31 | ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), 32 | bbox_coder=dict( 33 | type='DeltaXYWHBBoxCoder', 34 | target_means=[.0, .0, .0, .0], 35 | target_stds=[0.1, 0.1, 0.2, 0.2])), 36 | # model training and testing settings 37 | train_cfg=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.5, 41 | neg_iou_thr=0.5, 42 | min_pos_iou=0., 43 | ignore_iof_thr=-1, 44 | gt_max_assign_all=False), 45 | smoothl1_beta=1., 46 | allowed_border=-1, 47 | pos_weight=-1, 48 | neg_pos_ratio=3, 49 | debug=False), 50 | test_cfg=dict( 51 | nms_pre=1000, 52 | nms=dict(type='nms', iou_threshold=0.45), 53 | min_bbox_size=0, 54 | score_thr=0.02, 55 | max_per_img=200)) 56 | cudnn_benchmark = True 57 | 
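The `_base_` model, dataset, and schedule fragments in this directory are never run directly; MMDetection composes them through config inheritance. A minimal sketch of how a top-level config under `configs/coco/` ties them together -- the backbone fields here are illustrative stand-ins, not values copied from the shipped FlashInternImage configs (see `configs/coco/mask_rcnn_flash_intern_image_t_fpn_1x_coco.py` for the real ones):

```python
# Hypothetical top-level config built on the _base_ fragments above.
_base_ = [
    '../_base_/models/mask_rcnn_r50_fpn.py',
    '../_base_/datasets/coco_instance.py',
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
]
model = dict(
    backbone=dict(
        _delete_=True,  # discard the inherited ResNet-50 settings entirely
        type='FlashInternImage',
        core_op='DCNv4',
        depths=[4, 4, 18, 4],           # illustrative stage depths
        out_indices=(0, 1, 2, 3)),
    neck=dict(in_channels=[64, 128, 256, 512]))  # match backbone widths
```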
-------------------------------------------------------------------------------- /detection/configs/_base_/schedules/schedule_1x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[8, 11]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=12) 12 | -------------------------------------------------------------------------------- /detection/configs/_base_/schedules/schedule_3x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[27, 33]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=36) 12 | -------------------------------------------------------------------------------- /detection/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29511} 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 10 | -------------------------------------------------------------------------------- /detection/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | CONFIG=$1 3 | GPUS=$2 4 | PORT=${PORT:-29500} 5 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 6 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 7 | $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} -------------------------------------------------------------------------------- /detection/image_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import asyncio 3 | from argparse import ArgumentParser 4 | 5 | from mmdet.apis import (async_inference_detector, inference_detector, 6 | init_detector, show_result_pyplot) 7 | import mmcv 8 | import mmcv_custom # noqa: F401,F403 9 | import mmdet_custom # noqa: F401,F403 10 | import os.path as osp 11 | 12 | 13 | def parse_args(): 14 | parser = ArgumentParser() 15 | parser.add_argument('img', help='Image file') 16 | parser.add_argument('config', help='Config file') 17 | parser.add_argument('checkpoint', help='Checkpoint file') 18 | parser.add_argument('--out', type=str, default="demo", help='out dir') 19 | parser.add_argument( 20 | '--device', default='cuda:0', help='Device used for inference') 21 | parser.add_argument( 22 | '--palette', 23 | default='coco', 24 | choices=['coco', 'voc', 'citys', 'random'], 25 | help='Color palette used for visualization') 26 | parser.add_argument( 27 | '--score-thr', type=float, default=0.3, help='bbox score threshold') 28 | parser.add_argument( 29 | '--async-test', 30 | action='store_true', 31 | help='whether to set async options for async inference.') 32 | args = parser.parse_args() 33 | return args 34 | 35 | 36 | def main(args): 37 | # build the model from a config file and a checkpoint file 38 | model = init_detector(args.config, args.checkpoint, device=args.device) 39 | # test a single image 40 | result = inference_detector(model, args.img) 41 | 42 | mmcv.mkdir_or_exist(args.out) 43 | out_file = osp.join(args.out, osp.basename(args.img)) 44 | # show the results 45 | model.show_result( 46 | args.img, 47 | result, 48 | score_thr=args.score_thr, 49 | show=False, 50 | bbox_color=args.palette, 51 | text_color=(200, 200, 200), 52 | mask_color=args.palette, 53 | out_file=out_file 54 | ) 55 | print(f"Result is saved at {out_file}") 56 | 57 | 58 | 59 | if __name__ == '__main__': 60 | args = parse_args() 61 | main(args) -------------------------------------------------------------------------------- /detection/mmcv_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | # -*- coding: utf-8 -*- 8 | from .custom_layer_decay_optimizer_constructor import CustomLayerDecayOptimizerConstructor 9 | __all__ = ['CustomLayerDecayOptimizerConstructor'] 10 | -------------------------------------------------------------------------------- /detection/mmdet_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .models import * # noqa: F401,F403 8 | from .datasets import * -------------------------------------------------------------------------------- /detection/mmdet_custom/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .crowd_human import CrowdHumanDataset
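These wildcard imports are not cosmetic: importing `mmcv_custom`/`mmdet_custom` (as `image_demo.py` does above) is what registers the custom datasets and models with mmcv's registries, so configs can refer to them by `type` name. A toy sketch of that registry mechanism, not the repo's code:

```python
# Rough sketch of mmcv-style registration: classes are collected at
# import time by the decorator, and configs name them by 'type'.
from mmcv.utils import Registry

MODELS_SKETCH = Registry('models_sketch')


@MODELS_SKETCH.register_module()
class ToyBackbone:
    def __init__(self, depth=50):
        self.depth = depth


# A config dict names the class; the registry instantiates it.
toy = MODELS_SKETCH.build(dict(type='ToyBackbone', depth=18))
assert isinstance(toy, ToyBackbone) and toy.depth == 18
```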
-------------------------------------------------------------------------------- /detection/mmdet_custom/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .backbones import * # noqa: F401,F403 8 | from .dense_heads import * # noqa: F401,F403 9 | from .detectors import * # noqa: F401,F403 10 | from .utils import * # noqa: F401,F403 11 | from .necks.fpn import * -------------------------------------------------------------------------------- /detection/mmdet_custom/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from .flash_intern_image import FlashInternImage 7 | 8 | __all__ = ['FlashInternImage'] 9 | -------------------------------------------------------------------------------- /detection/mmdet_custom/models/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .deformable_detr_head import DeformableDETRHead 8 | from .detr_head import DETRHead 9 | from .dino_head import DINOHead 10 | from .msda import FlashMultiScaleDeformableAttention 11 | from .bbox_head import DCNv4FCBBoxHead 12 | from .mask_rcnn import MaskRCNN_ 13 | __all__ = ['DeformableDETRHead', 'DETRHead', 'DINOHead', 'FlashMultiScaleDeformableAttention', 'DCNv4FCBBoxHead', 'MaskRCNN_'] -------------------------------------------------------------------------------- /detection/mmdet_custom/models/dense_heads/mask_rcnn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from mmdet.models.builder import DETECTORS 3 | from .two_stage import TwoStageDetector 4 | 5 | 6 | @DETECTORS.register_module() 7 | class MaskRCNN_(TwoStageDetector): 8 | """Implementation of `Mask R-CNN <https://arxiv.org/abs/1703.06870>`_""" 9 | 10 | def __init__(self, 11 | backbone, 12 | rpn_head, 13 | roi_head, 14 | train_cfg, 15 | test_cfg, 16 | neck=None, 17 | pretrained=None, 18 | init_cfg=None): 19 | super(MaskRCNN_, self).__init__( 20 | backbone=backbone, 21 | neck=neck, 22 | rpn_head=rpn_head, 23 | roi_head=roi_head, 24 | train_cfg=train_cfg, 25 | test_cfg=test_cfg, 26 | pretrained=pretrained, 27 | init_cfg=init_cfg) 28 | -------------------------------------------------------------------------------- /detection/mmdet_custom/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .dino import DINO 8 | 9 | __all__ = ['DINO'] -------------------------------------------------------------------------------- /detection/mmdet_custom/models/detectors/dino.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmdet.models.builder import DETECTORS 3 | from mmdet.models.detectors.detr import DETR 4 | 5 | 6 | @DETECTORS.register_module() 7 | class DINO(DETR): 8 | 9 | def __init__(self, *args, **kwargs): 10 | super(DETR, self).__init__(*args, **kwargs) -------------------------------------------------------------------------------- /detection/mmdet_custom/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .query_denoising import build_dn_generator 2 | from .transformer import (DinoTransformer, DinoTransformerDecoder) 3 | from .convModule_norm import ConvModule_Norm 4 | 5 | 6 | __all__ = ['build_dn_generator', 'DinoTransformer', 'DinoTransformerDecoder', 'ConvModule_Norm'] -------------------------------------------------------------------------------- /detection/mmdet_custom/models/utils/convModule_norm.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from mmcv.cnn.bricks.conv_module import ConvModule 8 | 9 | class ConvModule_Norm(ConvModule): 10 | def __init__(self, in_channels, 11 | out_channels, 12 | kernel, **kwargs): 13 | super().__init__(in_channels, out_channels, kernel, **kwargs) 14 | 15 | self.normType = kwargs.get('norm_cfg', {'type':''}) 16 | if self.normType is not None: 17 | self.normType = self.normType['type'] 18 | 19 | def forward(self, x, activate=True, norm=True): 20 | for layer in self.order: 21 | if layer == 'conv': 22 | if self.with_explicit_padding: 23 | x = self.padding_layer(x) 24 | x = self.conv(x) 25 | elif layer == 'norm' and norm and self.with_norm: 26 | if 'LN' in self.normType: 27 | x = x.permute(0, 2, 3, 1) 28 | x = self.norm(x) 29 | x = x.permute(0, 3, 1, 2).contiguous() 30 | else: 31 | x = self.norm(x) 32 | elif layer == 'act' and activate and self.with_activation: 33 | x = self.activate(x) 34 | return x
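`ConvModule_Norm` special-cases LayerNorm because `nn.LayerNorm` normalizes over trailing dimensions while conv feature maps are channels-first; a self-contained sketch of that permute round-trip:

```python
# Standalone illustration of the NCHW <-> NHWC dance ConvModule_Norm
# performs for LayerNorm: permute so channels are last, normalize,
# permute back, and restore contiguity for subsequent conv layers.
import torch
import torch.nn as nn

x = torch.randn(2, 64, 32, 32)            # NCHW feature map
ln = nn.LayerNorm(64)                      # normalizes over the last dim
y = ln(x.permute(0, 2, 3, 1))              # NCHW -> NHWC
y = y.permute(0, 3, 1, 2).contiguous()     # NHWC -> NCHW
assert y.shape == x.shape
```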
-------------------------------------------------------------------------------- /detection/ops_dcnv3/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch 8 | -------------------------------------------------------------------------------- /detection/ops_dcnv3/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # -------------------------------------------------------- 3 | # DCNv4 4 | # Copyright (c) 2024 OpenGVLab 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | python setup.py build install 9 | -------------------------------------------------------------------------------- /detection/ops_dcnv3/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .dcnv3 import DCNv3, DCNv3_pytorch -------------------------------------------------------------------------------- /detection/ops_dcnv3/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import os 8 | import glob 9 | 10 | import torch 11 | 12 | from torch.utils.cpp_extension import CUDA_HOME 13 | from torch.utils.cpp_extension import CppExtension 14 | from torch.utils.cpp_extension import CUDAExtension 15 | 16 | from setuptools import find_packages 17 | from setuptools import setup 18 | 19 | requirements = ["torch", "torchvision"] 20 | 21 | 22 | def get_extensions(): 23 | this_dir = os.path.dirname(os.path.abspath(__file__)) 24 | extensions_dir = os.path.join(this_dir, "src") 25 | 26 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 27 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 28 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 29 | 30 | sources = main_file + source_cpu 31 | extension = CppExtension 32 | extra_compile_args = {"cxx": []} 33 | define_macros = [] 34 | 35 | if torch.cuda.is_available() and CUDA_HOME is not None: 36 | extension = CUDAExtension 37 | sources += source_cuda 38 | define_macros += [("WITH_CUDA", None)] 39 | extra_compile_args["nvcc"] = [ 40 | # "-DCUDA_HAS_FP16=1", 41 | # "-D__CUDA_NO_HALF_OPERATORS__", 42 | # "-D__CUDA_NO_HALF_CONVERSIONS__", 43 | # "-D__CUDA_NO_HALF2_OPERATORS__", 44 | ] 45 | else: 46 | raise NotImplementedError('CUDA is not available') 47 | 48 | sources = [os.path.join(extensions_dir, s) for s in sources] 49 | include_dirs = [extensions_dir] 50 | ext_modules = [ 51 | extension( 52 | "DCNv3", 53 | sources, 54 | include_dirs=include_dirs, 55 | define_macros=define_macros, 56 | extra_compile_args=extra_compile_args, 57 | ) 58 | ] 59 | return ext_modules 60 | 61 | 62 | setup(
63 | name="DCNv3", 64 | version="1.0", 65 | author="InternImage", 66 | url="https://github.com/OpenGVLab/InternImage", 67 | description= 68 | "PyTorch Wrapper for CUDA Functions of DCNv3", 69 | packages=find_packages(exclude=( 70 | "configs", 71 | "tests", 72 | )), 73 | ext_modules=get_extensions(), 74 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 75 | ) 76 | -------------------------------------------------------------------------------- /detection/ops_dcnv3/src/cpu/dcnv3_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #include <vector> 13 | 14 | #include <ATen/ATen.h> 15 | #include <ATen/cuda/CUDAContext.h> 16 | 17 | at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset, 18 | const at::Tensor &mask, const int kernel_h, 19 | const int kernel_w, const int stride_h, 20 | const int stride_w, const int pad_h, 21 | const int pad_w, const int dilation_h, 22 | const int dilation_w, const int group, 23 | const int group_channels, const float offset_scale, 24 | const int im2col_step) { 25 | AT_ERROR("Not implemented on the CPU"); 26 | } 27 | 28 | std::vector<at::Tensor> 29 | dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset, 30 | const at::Tensor &mask, const int kernel_h, 31 | const int kernel_w, const int stride_h, const int stride_w, 32 | const int pad_h, const int pad_w, const int dilation_h, 33 | const int dilation_w, const int group, 34 | const int group_channels, const float offset_scale, 35 | const at::Tensor &grad_output, const int im2col_step) { 36 | AT_ERROR("Not implemented on the CPU"); 37 | } 38 | -------------------------------------------------------------------------------- /detection/ops_dcnv3/src/cpu/dcnv3_cpu.h: -------------------------------------------------------------------------------- 1 | /*!
2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | 15 | at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset, 16 | const at::Tensor &mask, const int kernel_h, 17 | const int kernel_w, const int stride_h, 18 | const int stride_w, const int pad_h, 19 | const int pad_w, const int dilation_h, 20 | const int dilation_w, const int group, 21 | const int group_channels, const float offset_scale, 22 | const int im2col_step); 23 | 24 | std::vector<at::Tensor> 25 | dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset, 26 | const at::Tensor &mask, const int kernel_h, 27 | const int kernel_w, const int stride_h, const int stride_w, 28 | const int pad_h, const int pad_w, const int dilation_h, 29 | const int dilation_w, const int group, 30 | const int group_channels, const float offset_scale, 31 | const at::Tensor &grad_output, const int im2col_step); 32 | -------------------------------------------------------------------------------- /detection/ops_dcnv3/src/cuda/dcnv3_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | 15 | at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset, 16 | const at::Tensor &mask, const int kernel_h, 17 | const int kernel_w, const int stride_h, 18 | const int stride_w, const int pad_h, 19 | const int pad_w, const int dilation_h, 20 | const int dilation_w, const int group, 21 | const int group_channels, 22 | const float offset_scale, const int im2col_step); 23 | 24 | std::vector<at::Tensor> 25 | dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, 26 | const at::Tensor &mask, const int kernel_h, 27 | const int kernel_w, const int stride_h, const int stride_w, 28 | const int pad_h, const int pad_w, const int dilation_h, 29 | const int dilation_w, const int group, 30 | const int group_channels, const float offset_scale, 31 | const at::Tensor &grad_output, const int im2col_step); 32 | -------------------------------------------------------------------------------- /detection/ops_dcnv3/src/dcnv3.h: -------------------------------------------------------------------------------- 1 | /*!
2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | 14 | #include "cpu/dcnv3_cpu.h" 15 | 16 | #ifdef WITH_CUDA 17 | #include "cuda/dcnv3_cuda.h" 18 | #endif 19 | 20 | at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset, 21 | const at::Tensor &mask, const int kernel_h, 22 | const int kernel_w, const int stride_h, 23 | const int stride_w, const int pad_h, const int pad_w, 24 | const int dilation_h, const int dilation_w, 25 | const int group, const int group_channels, 26 | const float offset_scale, const int im2col_step) { 27 | if (input.type().is_cuda()) { 28 | #ifdef WITH_CUDA 29 | return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w, 30 | stride_h, stride_w, pad_h, pad_w, dilation_h, 31 | dilation_w, group, group_channels, 32 | offset_scale, im2col_step); 33 | #else 34 | AT_ERROR("Not compiled with GPU support"); 35 | #endif 36 | } 37 | AT_ERROR("Not implemented on the CPU"); 38 | } 39 | 40 | std::vector<at::Tensor> 41 | dcnv3_backward(const at::Tensor &input, const at::Tensor &offset, 42 | const at::Tensor &mask, const int kernel_h, const int kernel_w, 43 | const int stride_h, const int stride_w, const int pad_h, 44 | const int pad_w, const int dilation_h, const int dilation_w, 45 | const int group, const int group_channels, 46 | const float offset_scale, const at::Tensor &grad_output, 47 | const int im2col_step) { 48 | if (input.type().is_cuda()) { 49 | #ifdef WITH_CUDA 50 | return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w, 51 | stride_h, stride_w, pad_h, pad_w, dilation_h, 52 | dilation_w, group, group_channels, 53 | offset_scale, grad_output, im2col_step); 54 | #else 55 | AT_ERROR("Not compiled with GPU support"); 56 | #endif 57 | } 58 | AT_ERROR("Not implemented on the CPU"); 59 | } 60 |
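On the Python side these entry points are wrapped in a `torch.autograd.Function` (the repo's real wrapper lives in `ops_dcnv3/functions/dcnv3_func.py`, not shown here). A simplified sketch of that wrapping, with arguments ordered as in the C++ declarations above and the extension module name taken from `setup.py`:

```python
# Simplified sketch, not the repo's dcnv3_func.py: forwards to the
# "DCNv3" extension built by ops_dcnv3/setup.py.
import torch
import DCNv3  # compiled extension module


class DCNv3FunctionSketch(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, offset, mask, kernel_h, kernel_w, stride_h,
                stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                group_channels, offset_scale, im2col_step):
        ctx.hparams = (kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                       dilation_h, dilation_w, group, group_channels,
                       offset_scale, im2col_step)
        ctx.save_for_backward(input, offset, mask)
        return DCNv3.dcnv3_forward(input, offset, mask, kernel_h, kernel_w,
                                   stride_h, stride_w, pad_h, pad_w,
                                   dilation_h, dilation_w, group,
                                   group_channels, offset_scale, im2col_step)

    @staticmethod
    def backward(ctx, grad_output):
        input, offset, mask = ctx.saved_tensors
        *conv_args, offset_scale, im2col_step = ctx.hparams
        grad_input, grad_offset, grad_mask = DCNv3.dcnv3_backward(
            input, offset, mask, *conv_args, offset_scale,
            grad_output.contiguous(), im2col_step)
        # No gradients flow to the 12 integer/float hyperparameters.
        return (grad_input, grad_offset, grad_mask) + (None,) * 12
```
-------------------------------------------------------------------------------- /detection/ops_dcnv3/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*!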
2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #include "dcnv3.h" 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 15 | m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward"); 16 | m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward"); 17 | } 18 | -------------------------------------------------------------------------------- /detection/slurm_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | CHECKPOINT=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | PY_ARGS=${@:5} 13 | SRUN_ARGS=${SRUN_ARGS:-""} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | --quotatype=spot \ 24 | ${SRUN_ARGS} \ 25 | python -u test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} 26 | -------------------------------------------------------------------------------- /detection/slurm_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | WORK_DIR=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 12 | SRUN_ARGS=${SRUN_ARGS:-""} 13 | PY_ARGS=${@:5} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | --quotatype=reserved \ 24 | ${SRUN_ARGS} \ 25 | python -u train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} -------------------------------------------------------------------------------- /detection/tools/create_crowd_anno.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle as pkl 4 | import numpy as np 5 | import random 6 | from PIL import Image 7 | import concurrent.futures 8 | import json 9 | import mmcv 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser(description='Generate MMDetection Annotations for Crowdhuman-like dataset') 13 | parser.add_argument('--dataset', help='dataset name', type=str) 14 | parser.add_argument('--dataset-split', help='dataset split, e.g. 
train, val', type=str) 15 | 16 | args = parser.parse_args() 17 | return args.dataset, args.dataset_split 18 | 19 | def load_func(fpath): 20 | assert os.path.exists(fpath) 21 | with open(fpath, 'r') as fid: 22 | lines = fid.readlines() 23 | records = [json.loads(line.strip('\n')) for line in lines] 24 | return records 25 | 26 | def decode_annotations(records, dataset_path): 27 | rec_ids = list(range(len(records))) 28 | img_list = [] 29 | ann_list = [] 30 | ann_id = 1 31 | for idx, rec_id in enumerate(rec_ids): 32 | img_id = records[rec_id]['ID'] 33 | img_url = dataset_path + 'Images/' + img_id + '.jpg' 34 | assert os.path.exists(img_url) 35 | im = Image.open(img_url) 36 | im_w, im_h = im.width, im.height 37 | 38 | gt_box = records[rec_id]['gtboxes'] 39 | gt_box_len = len(gt_box) 40 | img_dict = dict( 41 | file_name=img_id + '.jpg', 42 | height=im_h, 43 | width=im_w, 44 | id=idx 45 | ) 46 | img_list.append(img_dict) 47 | for ii in range(gt_box_len): 48 | each_data = gt_box[ii] 49 | x, y, w, h = each_data['fbox'] 50 | 51 | if w <= 0 or h <= 0: 52 | continue 53 | # x1 = x; y1 = y; x2 = x + w; y2 = y + h 54 | 55 | valid_bbox = [x, y, w, h] 56 | if each_data['tag'] == 'person': 57 | tag = 1 58 | else: 59 | tag = -2 60 | if 'extra' in each_data: 61 | if 'ignore' in each_data['extra']: 62 | if each_data['extra']['ignore'] != 0: 63 | tag = -2 64 | ann_dict = dict( 65 | area=w * h, 66 | iscrowd=1 if tag == -2 else 0, 67 | image_id=idx, 68 | bbox=[x, y, w, h], 69 | category_id=1, 70 | id=ann_id, 71 | # ignore=1 if tag == -2 else 1, 72 | ) 73 | ann_id += 1 74 | ann_list.append(ann_dict) 75 | cate_list = [{'supercategory': 'none', 'id': 1, 'name': 'person'}] 76 | json_dict = dict( 77 | images=img_list, 78 | annotations=ann_list, 79 | categories=cate_list 80 | ) 81 | return json_dict 82 | 83 | if __name__ == "__main__": 84 | dataset_name, dataset_type = parse_args() 85 | dataset_path = 'data/%s/' % dataset_name 86 | ch_file_path = dataset_path + 'annotations/annotation_%s.odgt' % dataset_type 87 | json_file_path = dataset_path + 'annotations/annotation_%s.json' % dataset_type 88 | 89 | records = load_func(ch_file_path) 90 | print("Loading Annotations Done") 91 | 92 | json_dict = decode_annotations(records, dataset_path) 93 | 94 | print("Parsing Bbox Number: %d" % len(json_dict['annotations'])) 95 | mmcv.dump(json_dict, json_file_path) 96 | -------------------------------------------------------------------------------- /detection/tools/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .compute_APMR import compute_APMR 2 | from .compute_JI import compute_JI_with_ignore -------------------------------------------------------------------------------- /segmentation/README.md: -------------------------------------------------------------------------------- 1 | # FlashInternImage for Semantic Segmentation 2 | 3 | This folder contains the implementation of FlashInternImage for semantic segmentation. 4 | 5 | Our segmentation code is developed on top of [MMSegmentation v0.27.0](https://github.com/open-mmlab/mmsegmentation/tree/v0.27.0).
6 | 7 | ## Usage 8 | 9 | ### Install 10 | 11 | - Clone this repo: 12 | 13 | ```bash 14 | git clone https://github.com/OpenGVLab/DCNv4.git 15 | cd DCNv4 16 | ``` 17 | 18 | - Create a conda virtual environment and activate it: 19 | 20 | ```bash 21 | conda create -n dcnv4 python=3.7 -y 22 | conda activate dcnv4 23 | ``` 24 | 25 | - Install `CUDA>=10.2` with `cudnn>=7` following 26 | the [official installation instructions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) 27 | - Install `PyTorch>=1.10.0` and `torchvision>=0.9.0` with `CUDA>=10.2`: 28 | 29 | For example, to install torch==1.11 with CUDA==11.3 and nvcc: 30 | ```bash 31 | conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch -y 32 | conda install -c conda-forge cudatoolkit-dev=11.3 -y # to install nvcc 33 | ``` 34 | 35 | - Install other requirements: 36 | 37 | Note: the conda build of OpenCV can break torchvision's GPU support, so install OpenCV via pip. 38 | 39 | ```bash 40 | conda install -c conda-forge termcolor yacs pyyaml scipy pip -y 41 | pip install opencv-python 42 | ``` 43 | 44 | - Install `timm`, `mmcv-full`, and `mmsegmentation`: 45 | 46 | ```bash 47 | pip install -U openmim 48 | mim install mmcv-full==1.5.0 49 | mim install mmsegmentation==0.27.0 50 | pip install timm==0.6.11 mmdet==2.28.1 51 | ``` 52 | 53 | - Install DCNv4 54 | ```bash 55 | pip install DCNv4 56 | ``` 57 | 58 | ### Data Preparation 59 | 60 | Prepare datasets according to the [guidelines](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#prepare-datasets) in MMSegmentation. 61 | 62 | 63 | ### Evaluation 64 | 65 | To evaluate our `FlashInternImage` on ADE20K val, run: 66 | 67 | ```bash 68 | sh dist_test.sh <config-file> <checkpoint> <gpu-num> --eval mIoU 69 | ``` 70 | You can download checkpoint files from [here](https://huggingface.co/OpenGVLab/DCNv4). Then place them under `segmentation/checkpoint_dir/seg`.
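If you prefer to script the download, a minimal sketch using `huggingface_hub` (the `repo_id` is real; the filename below is an illustrative guess, so check the model hub page for the exact names):

```python
# Hypothetical download helper -- verify the exact checkpoint filename
# at https://huggingface.co/OpenGVLab/DCNv4 before use.
from huggingface_hub import hf_hub_download

ckpt = hf_hub_download(
    repo_id='OpenGVLab/DCNv4',
    filename='upernet_flash_internimage_t_512_160k_ade20k.pth',
    local_dir='checkpoint_dir/seg')
print('checkpoint saved to', ckpt)
```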
71 | 72 | For example, to evaluate the `FlashInternImage-T` with a single GPU: 73 | 74 | ```bash 75 | python test.py configs/ade20k/upernet_flash_internimage_t_512_160k_ade20k.py checkpoint_dir/seg/upernet_flash_internimage_t_512_160k_ade20k.pth --eval mIoU 76 | ``` 77 | 78 | For example, to evaluate the `FlashInternImage-B` on a single node with 8 GPUs: 79 | 80 | ```bash 81 | sh dist_test.sh configs/ade20k/upernet_flash_internimage_b_512_160k_ade20k.py checkpoint_dir/seg/upernet_flash_internimage_b_512_160k_ade20k.pth 8 --eval mIoU 82 | ``` 83 | 84 | ### Training 85 | 86 | To train a `FlashInternImage` on ADE20K, run: 87 | 88 | ```bash 89 | sh dist_train.sh <config-file> <gpu-num> 90 | ``` 91 | 92 | For example, to train `FlashInternImage-T` with 8 GPUs on 1 node (total batch size 16), run: 93 | 94 | ```bash 95 | sh dist_train.sh configs/ade20k/upernet_flash_internimage_t_512_160k_ade20k.py 8 96 | ``` 97 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/ade20k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ADE20KDataset' 3 | data_root = 'data/ADEChallengeData2016' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', reduce_zero_label=True), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 512), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='images/training', 41 | ann_dir='annotations/training', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='images/validation', 47 | ann_dir='annotations/validation', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='images/validation', 53 | ann_dir='annotations/validation', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/ade20k_640x640.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ADE20KDataset' 3 | data_root = 'data/ADEChallengeData2016' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (640, 640) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', reduce_zero_label=True), 10 | dict(type='Resize', img_scale=(2560, 640), ratio_range=(0.5,
2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2560, 640), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='images/training', 41 | ann_dir='annotations/training', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='images/validation', 47 | ann_dir='annotations/validation', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='images/validation', 53 | ann_dir='annotations/validation', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/chase_db1.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ChaseDB1Dataset' 3 | data_root = 'data/CHASE_DB1' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | img_scale = (960, 999) 7 | crop_size = (128, 128) 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']) 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=img_scale, 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']) 33 | ]) 34 | ] 35 | 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type='RepeatDataset', 41 | times=40000, 42 | dataset=dict( 43 | type=dataset_type, 44 | data_root=data_root, 45 | img_dir='images/training', 46 | ann_dir='annotations/training', 47 | pipeline=train_pipeline)), 48 | val=dict( 49 | type=dataset_type, 50 | data_root=data_root, 51 | img_dir='images/validation', 52 | ann_dir='annotations/validation', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='images/validation', 58 | ann_dir='annotations/validation', 59 | pipeline=test_pipeline)) 60 | 
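The CHASE_DB1 config above wraps its small training split in a `RepeatDataset` with `times=40000`, so iteration-based runners never exhaust the dataset mid-schedule. Functionally the wrapper is just modular index arithmetic; a rough sketch of mmseg's `RepeatDataset` behavior:

```python
# Rough sketch of mmseg's RepeatDataset: a fixed-length virtual dataset
# that cycles through the wrapped dataset by modular indexing.
class RepeatDatasetSketch:
    def __init__(self, dataset, times):
        self.dataset = dataset
        self.times = times

    def __getitem__(self, idx):
        return self.dataset[idx % len(self.dataset)]

    def __len__(self):
        return self.times * len(self.dataset)
```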
-------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/cityscapes.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CityscapesDataset' 3 | data_root = 'data/cityscapes/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 1024) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 1024), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=2, 36 | workers_per_gpu=2, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='leftImg8bit/train', 41 | ann_dir='gtFine/train', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='leftImg8bit/val', 47 | ann_dir='gtFine/val', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='leftImg8bit/val', 53 | ann_dir='gtFine/val', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/cityscapes_1024x1024.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | img_norm_cfg = dict( 3 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 4 | crop_size = (1024, 1024) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), 9 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 10 | dict(type='RandomFlip', prob=0.5), 11 | dict(type='PhotoMetricDistortion'), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict( 20 | type='MultiScaleFlipAug', 21 | img_scale=(2048, 1024), 22 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 23 | flip=False, 24 | transforms=[ 25 | dict(type='Resize', keep_ratio=True), 26 | dict(type='RandomFlip'), 27 | dict(type='Normalize', **img_norm_cfg), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | train=dict(pipeline=train_pipeline), 34 | val=dict(pipeline=test_pipeline), 35 | test=dict(pipeline=test_pipeline)) 36 | -------------------------------------------------------------------------------- 
/segmentation/configs/_base_/datasets/cityscapes_extra.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CityscapesDataset' 3 | data_root = 'data/cityscapes/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 1024) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 1024), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=2, 36 | workers_per_gpu=2, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir=['leftImg8bit/train', 'leftImg8bit/train_extra'], 41 | ann_dir=['gtFine/train', 'refinement_final/train_extra'], 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='leftImg8bit/val', 47 | ann_dir='gtFine/val', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='leftImg8bit/val', 53 | ann_dir='gtFine/val', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/coco-stuff10k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'COCOStuffDataset' 3 | data_root = 'data/coco_stuff10k' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', reduce_zero_label=True), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 512), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | 
reduce_zero_label=True, 41 | img_dir='images/train2014', 42 | ann_dir='annotations/train2014', 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type=dataset_type, 46 | data_root=data_root, 47 | reduce_zero_label=True, 48 | img_dir='images/test2014', 49 | ann_dir='annotations/test2014', 50 | pipeline=test_pipeline), 51 | test=dict( 52 | type=dataset_type, 53 | data_root=data_root, 54 | reduce_zero_label=True, 55 | img_dir='images/test2014', 56 | ann_dir='annotations/test2014', 57 | pipeline=test_pipeline)) 58 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/coco-stuff164k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'COCOStuffDataset' 3 | data_root = 'data/coco_stuff164k' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 512), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='images/train2017', 41 | ann_dir='annotations/train2017', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='images/val2017', 47 | ann_dir='annotations/val2017', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='images/val2017', 53 | ann_dir='annotations/val2017', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/drive.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'DRIVEDataset' 3 | data_root = 'data/DRIVE' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | img_scale = (584, 565) 7 | crop_size = (64, 64) 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']) 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 
| dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=img_scale, 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']) 33 | ]) 34 | ] 35 | 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type='RepeatDataset', 41 | times=40000, 42 | dataset=dict( 43 | type=dataset_type, 44 | data_root=data_root, 45 | img_dir='images/training', 46 | ann_dir='annotations/training', 47 | pipeline=train_pipeline)), 48 | val=dict( 49 | type=dataset_type, 50 | data_root=data_root, 51 | img_dir='images/validation', 52 | ann_dir='annotations/validation', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='images/validation', 58 | ann_dir='annotations/validation', 59 | pipeline=test_pipeline)) 60 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/hrf.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'HRFDataset' 3 | data_root = 'data/HRF' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | img_scale = (2336, 3504) 7 | crop_size = (256, 256) 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']) 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=img_scale, 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']) 33 | ]) 34 | ] 35 | 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type='RepeatDataset', 41 | times=40000, 42 | dataset=dict( 43 | type=dataset_type, 44 | data_root=data_root, 45 | img_dir='images/training', 46 | ann_dir='annotations/training', 47 | pipeline=train_pipeline)), 48 | val=dict( 49 | type=dataset_type, 50 | data_root=data_root, 51 | img_dir='images/validation', 52 | ann_dir='annotations/validation', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='images/validation', 58 | ann_dir='annotations/validation', 59 | pipeline=test_pipeline)) 60 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/loveda.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'LoveDADataset' 3 | data_root = 'data/loveDA' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | 
dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', reduce_zero_label=True), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(1024, 1024), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='img_dir/train', 41 | ann_dir='ann_dir/train', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='img_dir/val', 47 | ann_dir='ann_dir/val', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='img_dir/val', 53 | ann_dir='ann_dir/val', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/mapillary.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'MapillaryDataset' 3 | data_root = 'data/Mapillary/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 1024) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='MapillaryHack'), 11 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 1.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=(2048, 1024), 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']), 33 | ]) 34 | ] 35 | data = dict( 36 | samples_per_gpu=2, 37 | workers_per_gpu=2, 38 | train=dict( 39 | type=dataset_type, 40 | data_root='data/Mapillary/', 41 | img_dir=['training/images', 'validation/images'], 42 | ann_dir=['training/labels', 'validation/labels'], 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type='CityscapesDataset', 46 | data_root='data/cityscapes/', 47 | img_dir='leftImg8bit/val', 48 | ann_dir='gtFine/val', 49 | pipeline=test_pipeline), 50 | test=dict( 51 | type='CityscapesDataset', 52 | data_root='data/cityscapes/', 53 | img_dir='leftImg8bit/val', 54 | ann_dir='gtFine/val', 55 | 
pipeline=test_pipeline)) 56 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/mapillary_1024x1024.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'MapillaryDataset' 3 | data_root = 'data/Mapillary/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (1024, 1024) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='MapillaryHack'), 11 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 1.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=(2048, 1024), 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']), 33 | ]) 34 | ] 35 | data = dict( 36 | samples_per_gpu=2, 37 | workers_per_gpu=2, 38 | train=dict( 39 | type=dataset_type, 40 | data_root='data/Mapillary/', 41 | img_dir=['training/images', 'validation/images'], 42 | ann_dir=['training/labels', 'validation/labels'], 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type='CityscapesDataset', 46 | data_root='data/cityscapes/', 47 | img_dir='leftImg8bit/val', 48 | ann_dir='gtFine/val', 49 | pipeline=test_pipeline), 50 | test=dict( 51 | type='CityscapesDataset', 52 | data_root='data/cityscapes/', 53 | img_dir='leftImg8bit/val', 54 | ann_dir='gtFine/val', 55 | pipeline=test_pipeline)) 56 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/nyu_depth_v2.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'NYUDepthV2Dataset' 3 | data_root = 'data/nyu_depth_v2/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | 7 | crop_size = (480, 480) 8 | 9 | train_pipeline = [ 10 | dict(type='LoadImageFromFile'), 11 | dict(type='LoadAnnotations', reduce_zero_label=True), 12 | dict(type='Resize', img_scale=(640, 480), ratio_range=(0.5, 2.0)), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='Normalize', **img_norm_cfg), 17 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 18 | dict(type='DefaultFormatBundle'), 19 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 20 | ] 21 | test_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict( 24 | type='MultiScaleFlipAug', 25 | img_scale=(640, 480), 26 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 27 | flip=False, 28 | transforms=[ 29 | dict(type='Resize', keep_ratio=True), 30 | dict(type='RandomFlip'), 31 | dict(type='Normalize', **img_norm_cfg), 32 | dict(type='ImageToTensor', keys=['img']), 33 | dict(type='Collect', 
keys=['img']), 34 | ]) 35 | ] 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type=dataset_type, 41 | data_root=data_root, 42 | img_dir='image', 43 | ann_dir='label40', 44 | split='train.txt', 45 | pipeline=train_pipeline), 46 | val=dict( 47 | type=dataset_type, 48 | data_root=data_root, 49 | img_dir='image', 50 | ann_dir='label40', 51 | split='test.txt', 52 | pipeline=test_pipeline), 53 | test=dict( 54 | type=dataset_type, 55 | data_root=data_root, 56 | img_dir='image', 57 | ann_dir='label40', 58 | split='test.txt', 59 | pipeline=test_pipeline)) 60 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_context.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalContextDataset' 3 | data_root = 'data/VOCdevkit/VOC2010/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | 7 | img_scale = (520, 520) 8 | crop_size = (480, 480) 9 | 10 | train_pipeline = [ 11 | dict(type='LoadImageFromFile'), 12 | dict(type='LoadAnnotations'), 13 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 14 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 15 | dict(type='RandomFlip', prob=0.5), 16 | dict(type='PhotoMetricDistortion'), 17 | dict(type='Normalize', **img_norm_cfg), 18 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 19 | dict(type='DefaultFormatBundle'), 20 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 21 | ] 22 | test_pipeline = [ 23 | dict(type='LoadImageFromFile'), 24 | dict( 25 | type='MultiScaleFlipAug', 26 | img_scale=img_scale, 27 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 28 | flip=False, 29 | transforms=[ 30 | dict(type='Resize', keep_ratio=True), 31 | dict(type='RandomFlip'), 32 | dict(type='Normalize', **img_norm_cfg), 33 | dict(type='ImageToTensor', keys=['img']), 34 | dict(type='Collect', keys=['img']), 35 | ]) 36 | ] 37 | data = dict( 38 | samples_per_gpu=4, 39 | workers_per_gpu=4, 40 | train=dict( 41 | type=dataset_type, 42 | data_root=data_root, 43 | img_dir='JPEGImages', 44 | ann_dir='SegmentationClassContext', 45 | split='ImageSets/SegmentationContext/train.txt', 46 | pipeline=train_pipeline), 47 | val=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | img_dir='JPEGImages', 51 | ann_dir='SegmentationClassContext', 52 | split='ImageSets/SegmentationContext/val.txt', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='JPEGImages', 58 | ann_dir='SegmentationClassContext', 59 | split='ImageSets/SegmentationContext/val.txt', 60 | pipeline=test_pipeline)) 61 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_context_59.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalContextDataset59' 3 | data_root = 'data/VOCdevkit/VOC2010/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | 7 | img_scale = (520, 520) 8 | crop_size = (480, 480) 9 | 10 | train_pipeline = [ 11 | dict(type='LoadImageFromFile'), 12 | dict(type='LoadAnnotations', reduce_zero_label=True), 13 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 14 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 15 | 
dict(type='RandomFlip', prob=0.5), 16 | dict(type='PhotoMetricDistortion'), 17 | dict(type='Normalize', **img_norm_cfg), 18 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 19 | dict(type='DefaultFormatBundle'), 20 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 21 | ] 22 | test_pipeline = [ 23 | dict(type='LoadImageFromFile'), 24 | dict( 25 | type='MultiScaleFlipAug', 26 | img_scale=img_scale, 27 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 28 | flip=False, 29 | transforms=[ 30 | dict(type='Resize', keep_ratio=True), 31 | dict(type='RandomFlip'), 32 | dict(type='Normalize', **img_norm_cfg), 33 | dict(type='ImageToTensor', keys=['img']), 34 | dict(type='Collect', keys=['img']), 35 | ]) 36 | ] 37 | data = dict( 38 | samples_per_gpu=4, 39 | workers_per_gpu=4, 40 | train=dict( 41 | type=dataset_type, 42 | data_root=data_root, 43 | img_dir='JPEGImages', 44 | ann_dir='SegmentationClassContext', 45 | split='ImageSets/SegmentationContext/train.txt', 46 | pipeline=train_pipeline), 47 | val=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | img_dir='JPEGImages', 51 | ann_dir='SegmentationClassContext', 52 | split='ImageSets/SegmentationContext/val.txt', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='JPEGImages', 58 | ann_dir='SegmentationClassContext', 59 | split='ImageSets/SegmentationContext/val.txt', 60 | pipeline=test_pipeline)) 61 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_voc12.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalVOCDataset' 3 | data_root = 'data/VOCdevkit/VOC2012' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 512), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='JPEGImages', 41 | ann_dir='SegmentationClass', 42 | split='ImageSets/Segmentation/train.txt', 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type=dataset_type, 46 | data_root=data_root, 47 | img_dir='JPEGImages', 48 | ann_dir='SegmentationClass', 49 | split='ImageSets/Segmentation/val.txt', 50 | pipeline=test_pipeline), 51 | test=dict( 52 | type=dataset_type, 53 | data_root=data_root, 54 | img_dir='JPEGImages', 55 | ann_dir='SegmentationClass', 56 | split='ImageSets/Segmentation/val.txt', 57 | pipeline=test_pipeline)) 58 
| -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_voc12_aug.py: -------------------------------------------------------------------------------- 1 | _base_ = './pascal_voc12.py' 2 | # dataset settings 3 | data = dict( 4 | train=dict( 5 | ann_dir=['SegmentationClass', 'SegmentationClassAug'], 6 | split=[ 7 | 'ImageSets/Segmentation/train.txt', 8 | 'ImageSets/Segmentation/aug.txt' 9 | ])) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/potsdam.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/DCNv4/4b848f7dd7da74ff03f7d278f902c6fd05b391b5/segmentation/configs/_base_/datasets/potsdam.py -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/stare.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'STAREDataset' 3 | data_root = 'data/STARE' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | img_scale = (605, 700) 7 | crop_size = (128, 128) 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']) 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=img_scale, 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']) 33 | ]) 34 | ] 35 | 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type='RepeatDataset', 41 | times=40000, 42 | dataset=dict( 43 | type=dataset_type, 44 | data_root=data_root, 45 | img_dir='images/training', 46 | ann_dir='annotations/training', 47 | pipeline=train_pipeline)), 48 | val=dict( 49 | type=dataset_type, 50 | data_root=data_root, 51 | img_dir='images/validation', 52 | ann_dir='annotations/validation', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='images/validation', 58 | ann_dir='annotations/validation', 59 | pipeline=test_pipeline)) 60 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # yapf:disable 2 | log_config = dict( 3 | interval=50, 4 | hooks=[ 5 | dict(type='TextLoggerHook', by_epoch=False), 6 | # dict(type='TensorboardLoggerHook') 7 | ]) 8 | # yapf:enable 9 | dist_params = dict(backend='nccl') 10 | log_level = 'INFO' 11 | load_from = None 12 | resume_from = None 13 | workflow = [('train', 1)] 14 | cudnn_benchmark = True 15 | -------------------------------------------------------------------------------- 
/segmentation/configs/_base_/models/segformer_mit-b0.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained=None, 6 | backbone=dict( 7 | type='MixVisionTransformer', 8 | in_channels=3, 9 | embed_dims=32, 10 | num_stages=4, 11 | num_layers=[2, 2, 2, 2], 12 | num_heads=[1, 2, 5, 8], 13 | patch_sizes=[7, 3, 3, 3], 14 | sr_ratios=[8, 4, 2, 1], 15 | out_indices=(0, 1, 2, 3), 16 | mlp_ratio=4, 17 | qkv_bias=True, 18 | drop_rate=0.0, 19 | attn_drop_rate=0.0, 20 | drop_path_rate=0.1), 21 | decode_head=dict( 22 | type='SegformerHead', 23 | in_channels=[32, 64, 160, 256], 24 | in_index=[0, 1, 2, 3], 25 | channels=256, 26 | dropout_ratio=0.1, 27 | num_classes=19, 28 | norm_cfg=norm_cfg, 29 | align_corners=False, 30 | loss_decode=dict( 31 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 32 | # model training and testing settings 33 | train_cfg=dict(), 34 | test_cfg=dict(mode='whole')) -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/upernet_convnext.py: -------------------------------------------------------------------------------- 1 | norm_cfg = dict(type='SyncBN', requires_grad=True) 2 | custom_imports = dict(imports='mmcls.models', allow_failed_imports=False) 3 | # checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-base_3rdparty_32xb128-noema_in1k_20220301-2a0ee547.pth' # noqa 4 | model = dict( 5 | type='EncoderDecoder', 6 | pretrained=None, 7 | backbone=dict( 8 | type='mmcls.ConvNeXt', 9 | arch='base', 10 | norm_cfg=dict(type='LN2dv2', eps=1e-6), 11 | out_indices=[0, 1, 2, 3], 12 | drop_path_rate=0.4, 13 | layer_scale_init_value=1.0, 14 | gap_before_final_norm=False, 15 | # init_cfg=dict( 16 | # type='Pretrained', checkpoint=checkpoint_file, 17 | # prefix='backbone.') 18 | ), 19 | decode_head=dict( 20 | type='UPerHead', 21 | in_channels=[128, 256, 512, 1024], 22 | in_index=[0, 1, 2, 3], 23 | pool_scales=(1, 2, 3, 6), 24 | channels=512, 25 | dropout_ratio=0.1, 26 | num_classes=19, 27 | norm_cfg=norm_cfg, 28 | align_corners=False, 29 | loss_decode=dict( 30 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 31 | auxiliary_head=dict( 32 | type='FCNHead', 33 | in_channels=384, 34 | in_index=2, 35 | channels=256, 36 | num_convs=1, 37 | concat_input=False, 38 | dropout_ratio=0.1, 39 | num_classes=19, 40 | norm_cfg=norm_cfg, 41 | align_corners=False, 42 | loss_decode=dict( 43 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 44 | # model training and testing settings 45 | train_cfg=dict(), 46 | test_cfg=dict(mode='whole')) 47 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/upernet_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 1, 1), 12 | strides=(1, 2, 2, 2), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='UPerHead', 19 | in_channels=[256, 512, 1024, 2048], 20 | in_index=[0, 1, 2, 3], 21 | 
pool_scales=(1, 2, 3, 6), 22 | channels=512, 23 | dropout_ratio=0.1, 24 | num_classes=19, 25 | norm_cfg=norm_cfg, 26 | align_corners=False, 27 | loss_decode=dict( 28 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 29 | auxiliary_head=dict( 30 | type='FCNHead', 31 | in_channels=1024, 32 | in_index=2, 33 | channels=256, 34 | num_convs=1, 35 | concat_input=False, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 42 | # model training and testing settings 43 | train_cfg=dict(), 44 | test_cfg=dict(mode='whole')) 45 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/upernet_swin.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | backbone_norm_cfg = dict(type='LN', requires_grad=True) 4 | model = dict( 5 | type='EncoderDecoder', 6 | pretrained=None, 7 | backbone=dict( 8 | type='SwinTransformer', 9 | pretrain_img_size=224, 10 | embed_dims=96, 11 | patch_size=4, 12 | window_size=7, 13 | mlp_ratio=4, 14 | depths=[2, 2, 6, 2], 15 | num_heads=[3, 6, 12, 24], 16 | strides=(4, 2, 2, 2), 17 | out_indices=(0, 1, 2, 3), 18 | qkv_bias=True, 19 | qk_scale=None, 20 | patch_norm=True, 21 | drop_rate=0., 22 | attn_drop_rate=0., 23 | drop_path_rate=0.3, 24 | use_abs_pos_embed=False, 25 | act_cfg=dict(type='GELU'), 26 | norm_cfg=backbone_norm_cfg), 27 | decode_head=dict( 28 | type='UPerHead', 29 | in_channels=[96, 192, 384, 768], 30 | in_index=[0, 1, 2, 3], 31 | pool_scales=(1, 2, 3, 6), 32 | channels=512, 33 | dropout_ratio=0.1, 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=384, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_160k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=160000) 8 | checkpoint_config = dict(by_epoch=False, interval=16000) 9 | evaluation = dict(interval=16000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_20k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=20000) 8 | 
checkpoint_config = dict(by_epoch=False, interval=2000) 9 | evaluation = dict(interval=2000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_320k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=320000) 8 | checkpoint_config = dict(by_epoch=False, interval=32000) 9 | evaluation = dict(interval=32000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_40k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=40000) 8 | checkpoint_config = dict(by_epoch=False, interval=4000) 9 | evaluation = dict(interval=4000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_80k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=80000) 8 | checkpoint_config = dict(by_epoch=False, interval=8000) 9 | evaluation = dict(interval=8000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/README.md: -------------------------------------------------------------------------------- 1 | # ADE20K 2 | 3 | Introduced by Zhou et al. in [Scene Parsing Through ADE20K Dataset](https://paperswithcode.com/paper/scene-parsing-through-ade20k-dataset). 4 | 5 | The ADE20K semantic segmentation dataset contains more than 20K scene-centric images exhaustively annotated with pixel-level object and object-part labels. There are 150 semantic categories in total, covering stuff classes such as sky, road, and grass, as well as discrete objects such as person, car, and bed.
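Based on the `data_root`, `img_dir`, and `ann_dir` values in `configs/_base_/datasets/ade20k.py`, the dataset is expected to be unpacked under `data/` as follows (a layout sketch inferred from that config, not an official listing):

```
data/ADEChallengeData2016/
├── images/
│   ├── training/
│   └── validation/
└── annotations/
    ├── training/
    └── validation/
```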
6 | 7 | 8 | ## Model Zoo 9 | 10 | ### UperNet + FlashInternImage 11 | 12 | 13 | | backbone | resolution | mIoU (ss/ms) | Config | Download | 14 | |:--------------:|:----------:|:-----------:|:-----------:|:----------:| 15 | | FlashInternImage-T | 512x512 | 49.3 / 50.3 | [config](./upernet_flash_internimage_t_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_t_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_t_512_160k_ade20k.log) | 16 | | FlashInternImage-S | 512x512 | 50.6 / 51.6 | [config](./upernet_flash_internimage_s_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_s_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_s_512_160k_ade20k.log) | 17 | | FlashInternImage-B | 512x512 | 52.0 / 52.6 | [config](./upernet_flash_internimage_b_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_b_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_s_512_160k_ade20k.log) | 18 | | FlashInternImage-L | 640x640 | 55.6 / 56.0 | [config](./upernet_flash_internimage_l_640_160k_ade20k.py)| [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_l_640_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_l_640_160k_ade20k.log) | 19 | 20 | - Training speed is measured on A100 GPUs. 21 | - Please set `with_cp=True` to save memory if you encounter out-of-memory issues (see the override sketch below). 22 | - The logs are from our recent retraining runs; the results may differ slightly from those reported in the paper.
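As a concrete example for the `with_cp` note above, any of these configs can be extended with a small override file, following the `_base_`-inheritance pattern used by the other configs in this repo (the file name below is hypothetical):

```python
# upernet_flash_internimage_t_512_160k_ade20k_cp.py (hypothetical override)
# Enable activation checkpointing in the backbone to reduce GPU memory usage,
# at the cost of recomputing activations during the backward pass.
_base_ = ['./upernet_flash_internimage_t_512_160k_ade20k.py']
model = dict(backbone=dict(with_cp=True))
```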
23 | 24 | 25 | ### Mask2Former + FlashInternImage 26 | 27 | | backbone | resolution | mIoU (ss) | Config | Download | 28 | |:--------------:|:----------:|:-----------:|:-----------:|:----------:| 29 | | FlashInternImage-T | 512x512 | 51.2 | [config](./mask2former_flash_internimage_t_512_160k_ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_t_512_160k_ade20k_ss.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_t_512_160k_ade20k_ss.log) | 30 | | FlashInternImage-S | 640x640 | 52.2 | [config](./mask2former_flash_internimage_s_640_160k_ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_s_640_160k_ade20k_ss.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_s_640_160k_ade20k_ss.log) | 31 | | FlashInternImage-B | 640x640 | 53.4 | [config](./mask2former_flash_internimage_b_640_160k_ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_b_640_160k_ade20k_ss.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_b_640_160k_ade20k_ss.log) | 32 | | FlashInternImage-L | 640x640 | 56.7 | [config](./mask2former_flash_internimage_l_640_160k_ade20k_ss.py)| [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_l_640_160k_ade20k_ss.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_l_640_160k_ade20k_ss.log) | 33 | 34 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/upernet_flash_internimage_b_512_160k_ade20k.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | _base_ = [ 7 | '../_base_/models/upernet_r50.py', '../_base_/datasets/ade20k.py', 8 | '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' 9 | ] 10 | pretrained = 'https://huggingface.co/OpenGVLab/DCNv4/resolve/main/flash_intern_image_b_1k_224.pth' 11 | model = dict( 12 | backbone=dict( 13 | _delete_=True, 14 | type='FlashInternImage', 15 | core_op='DCNv4', 16 | channels=112, 17 | depths=[4, 4, 21, 4], 18 | groups=[7, 14, 28, 56], 19 | mlp_ratio=4., 20 | drop_path_rate=0.3, 21 | norm_layer='LN', 22 | layer_scale=1.0, 23 | offset_scale=0.5, 24 | post_norm=True, 25 | with_cp=False, 26 | dw_kernel_size=3, 27 | out_indices=(0, 1, 2, 3), 28 | init_cfg=dict(type='Pretrained', checkpoint=pretrained)), 29 | decode_head=dict(num_classes=150, in_channels=[112, 224, 448, 896]), 30 | auxiliary_head=dict(num_classes=150, in_channels=448), 31 | test_cfg=dict(mode='whole') 32 | ) 33 | img_norm_cfg = dict( 34 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 35 | test_pipeline = [ 36 | dict(type='LoadImageFromFile'), 37 | dict( 38 | type='MultiScaleFlipAug', 39 | img_scale=(2048, 512), 40 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 41 | flip=False, 42 | transforms=[ 43 | dict(type='Resize', keep_ratio=True), 44 | dict(type='ResizeToMultiple', size_divisor=32), 45 | dict(type='RandomFlip'), 46 | dict(type='Normalize', **img_norm_cfg), 47 | dict(type='ImageToTensor', keys=['img']), 48 | dict(type='Collect', keys=['img']), 49 | ]) 50
| ] 51 | optimizer = dict( 52 | _delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05, 53 | constructor='CustomLayerDecayOptimizerConstructor', 54 | paramwise_cfg=dict(num_layers=33, layer_decay_rate=1.0, 55 | depths=[4, 4, 21, 4])) 56 | lr_config = dict(_delete_=True, policy='poly', 57 | warmup='linear', 58 | warmup_iters=1500, 59 | warmup_ratio=1e-6, 60 | power=1.0, min_lr=0.0, by_epoch=False) 61 | # By default, models are trained on 8 GPUs with 2 images per GPU 62 | data=dict(samples_per_gpu=2, 63 | val=dict(pipeline=test_pipeline), 64 | test=dict(pipeline=test_pipeline)) 65 | runner = dict(type='IterBasedRunner') 66 | checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1) 67 | evaluation = dict(interval=16000, metric='mIoU', save_best='mIoU') 68 | # fp16 = dict(loss_scale=dict(init_scale=512)) 69 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/upernet_flash_internimage_l_640_160k_ade20k.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | _base_ = [ 7 | '../_base_/models/upernet_r50.py', '../_base_/datasets/ade20k.py', 8 | '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' 9 | ] 10 | pretrained = 'https://huggingface.co/OpenGVLab/DCNv4/resolve/main/flash_intern_image_l_22k_384.pth' 11 | model = dict( 12 | backbone=dict( 13 | _delete_=True, 14 | type='FlashInternImage', 15 | core_op='DCNv4', 16 | channels=160, 17 | depths=[5, 5, 22, 5], 18 | groups=[10, 20, 40, 80], 19 | mlp_ratio=4., 20 | drop_path_rate=0.4, 21 | norm_layer='LN', 22 | layer_scale=1.0, 23 | offset_scale=2.0, 24 | post_norm=True, 25 | with_cp=False, 26 | dcn_output_bias=True, 27 | mlp_fc2_bias=True, 28 | dw_kernel_size=3, 29 | out_indices=(0, 1, 2, 3), 30 | init_cfg=dict(type='Pretrained', checkpoint=pretrained)), 31 | decode_head=dict(num_classes=150, in_channels=[160, 320, 640, 1280]), 32 | auxiliary_head=dict(num_classes=150, in_channels=640), 33 | test_cfg=dict(mode='whole')) 34 | img_norm_cfg = dict( 35 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 36 | crop_size = (640, 640) 37 | train_pipeline = [ 38 | dict(type='LoadImageFromFile'), 39 | dict(type='LoadAnnotations', reduce_zero_label=True), 40 | dict(type='Resize', img_scale=(2560, 640), ratio_range=(0.5, 2.0)), 41 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 42 | dict(type='RandomFlip', prob=0.5), 43 | dict(type='PhotoMetricDistortion'), 44 | dict(type='Normalize', **img_norm_cfg), 45 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 46 | dict(type='DefaultFormatBundle'), 47 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 48 | ] 49 | test_pipeline = [ 50 | dict(type='LoadImageFromFile'), 51 | dict( 52 | type='MultiScaleFlipAug', 53 | img_scale=(2560, 640), 54 | # img_ratios=[0.75, 1.0, 1.25], 55 | flip=False, 56 | transforms=[ 57 | dict(type='Resize', keep_ratio=True), 58 | dict(type='ResizeToMultiple', size_divisor=32), 59 | dict(type='RandomFlip'), 60 | dict(type='Normalize', **img_norm_cfg), 61 | dict(type='ImageToTensor', keys=['img']), 62 | dict(type='Collect', keys=['img']), 63 | ]) 64 | ] 65 | optimizer = dict( 66 | _delete_=True, type='AdamW', lr=0.00002, betas=(0.9, 0.999), 
weight_decay=0.05, 67 | constructor='CustomLayerDecayOptimizerConstructor', 68 | paramwise_cfg=dict(num_layers=37, layer_decay_rate=0.94, 69 | depths=[5, 5, 22, 5], offset_lr_scale=1.0)) 70 | lr_config = dict(_delete_=True, policy='poly', 71 | warmup='linear', 72 | warmup_iters=1500, 73 | warmup_ratio=1e-6, 74 | power=1.0, min_lr=0.0, by_epoch=False) 75 | # By default, models are trained on 8 GPUs with 2 images per GPU 76 | data = dict(samples_per_gpu=2, 77 | train=dict(pipeline=train_pipeline), 78 | val=dict(pipeline=test_pipeline), 79 | test=dict(pipeline=test_pipeline)) 80 | runner = dict(type='IterBasedRunner') 81 | optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2)) 82 | checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1) 83 | evaluation = dict(interval=16000, metric='mIoU', save_best='mIoU') 84 | # fp16 = dict(loss_scale=dict(init_scale=512)) 85 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/upernet_flash_internimage_s_512_160k_ade20k.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | _base_ = [ 7 | '../_base_/models/upernet_r50.py', '../_base_/datasets/ade20k.py', 8 | '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' 9 | ] 10 | pretrained = 'https://huggingface.co/OpenGVLab/DCNv4/resolve/main/flash_intern_image_s_1k_224.pth' 11 | model = dict( 12 | backbone=dict( 13 | _delete_=True, 14 | type='FlashInternImage', 15 | core_op='DCNv4', 16 | channels=80, 17 | depths=[4, 4, 21, 4], 18 | groups=[5, 10, 20, 40], 19 | mlp_ratio=4., 20 | drop_path_rate=0.3, 21 | norm_layer='LN', 22 | layer_scale=1.0, 23 | offset_scale=1.0, 24 | post_norm=True, 25 | with_cp=True, 26 | dw_kernel_size=3, 27 | out_indices=(0, 1, 2, 3), 28 | init_cfg=dict(type='Pretrained', checkpoint=pretrained)), 29 | decode_head=dict(num_classes=150, in_channels=[80, 160, 320, 640]), 30 | auxiliary_head=dict(num_classes=150, in_channels=320), 31 | test_cfg=dict(mode='whole') 32 | ) 33 | img_norm_cfg = dict( 34 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 35 | test_pipeline = [ 36 | dict(type='LoadImageFromFile'), 37 | dict( 38 | type='MultiScaleFlipAug', 39 | img_scale=(2048, 512), 40 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 41 | flip=False, 42 | transforms=[ 43 | dict(type='Resize', keep_ratio=True), 44 | dict(type='ResizeToMultiple', size_divisor=32), 45 | dict(type='RandomFlip'), 46 | dict(type='Normalize', **img_norm_cfg), 47 | dict(type='ImageToTensor', keys=['img']), 48 | dict(type='Collect', keys=['img']), 49 | ]) 50 | ] 51 | optimizer = dict( 52 | _delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05, 53 | constructor='CustomLayerDecayOptimizerConstructor', 54 | paramwise_cfg=dict(num_layers=33, layer_decay_rate=1.0, 55 | depths=[4, 4, 21, 4])) 56 | lr_config = dict(_delete_=True, policy='poly', 57 | warmup='linear', 58 | warmup_iters=1500, 59 | warmup_ratio=1e-6, 60 | power=1.0, min_lr=0.0, by_epoch=False) 61 | # By default, models are trained on 8 GPUs with 2 images per GPU 62 | data=dict(samples_per_gpu=2, 63 | val=dict(pipeline=test_pipeline), 64 | test=dict(pipeline=test_pipeline)) 65 | runner = dict(type='IterBasedRunner') 66 | 
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1) 67 | evaluation = dict(interval=16000, metric='mIoU', save_best='mIoU') 68 | # fp16 = dict(loss_scale=dict(init_scale=512)) 69 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/upernet_flash_internimage_t_512_160k_ade20k.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | _base_ = [ 7 | '../_base_/models/upernet_r50.py', '../_base_/datasets/ade20k.py', 8 | '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' 9 | ] 10 | pretrained = 'https://huggingface.co/OpenGVLab/DCNv4/resolve/main/flash_intern_image_t_1k_224.pth' 11 | model = dict( 12 | backbone=dict( 13 | _delete_=True, 14 | type='FlashInternImage', 15 | core_op='DCNv4', 16 | channels=64, 17 | depths=[4, 4, 18, 4], 18 | groups=[4, 8, 16, 32], 19 | mlp_ratio=4., 20 | drop_path_rate=0.2, 21 | norm_layer='LN', 22 | layer_scale=1.0, 23 | offset_scale=1.0, 24 | post_norm=False, 25 | with_cp=True, 26 | out_indices=(0, 1, 2, 3), 27 | init_cfg=dict(type='Pretrained', checkpoint=pretrained)), 28 | decode_head=dict(num_classes=150, in_channels=[64, 128, 256, 512]), 29 | auxiliary_head=dict(num_classes=150, in_channels=256), 30 | test_cfg=dict(mode='whole') 31 | ) 32 | img_norm_cfg = dict( 33 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 34 | test_pipeline = [ 35 | dict(type='LoadImageFromFile'), 36 | dict( 37 | type='MultiScaleFlipAug', 38 | img_scale=(2048, 512), 39 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 40 | flip=False, 41 | transforms=[ 42 | dict(type='Resize', keep_ratio=True), 43 | dict(type='ResizeToMultiple', size_divisor=32), 44 | dict(type='RandomFlip'), 45 | dict(type='Normalize', **img_norm_cfg), 46 | dict(type='ImageToTensor', keys=['img']), 47 | dict(type='Collect', keys=['img']), 48 | ]) 49 | ] 50 | optimizer = dict( 51 | _delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05, 52 | constructor='CustomLayerDecayOptimizerConstructor', 53 | paramwise_cfg=dict(num_layers=30, layer_decay_rate=1.0, 54 | depths=[4, 4, 18, 4])) 55 | lr_config = dict(_delete_=True, policy='poly', 56 | warmup='linear', 57 | warmup_iters=1500, 58 | warmup_ratio=1e-6, 59 | power=1.0, min_lr=0.0, by_epoch=False) 60 | # By default, models are trained on 8 GPUs with 2 images per GPU 61 | data=dict(samples_per_gpu=2, 62 | # val=dict(pipeline=test_pipeline), 63 | # test=dict(pipeline=test_pipeline) 64 | ) 65 | runner = dict(type='IterBasedRunner') 66 | checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1) 67 | evaluation = dict(interval=16000, metric='mIoU', save_best='mIoU') 68 | # fp16 = dict(loss_scale=dict(init_scale=512)) 69 | -------------------------------------------------------------------------------- /segmentation/deploy/configs/_base_/backends/tensorrt.py: -------------------------------------------------------------------------------- 1 | backend_config = dict( 2 | type='tensorrt', common_config=dict(fp16_mode=False, max_workspace_size=0)) 3 | -------------------------------------------------------------------------------- /segmentation/deploy/configs/_base_/onnx_config.py: 
-------------------------------------------------------------------------------- 1 | onnx_config = dict( 2 | type='onnx', 3 | export_params=True, 4 | keep_initializers_as_inputs=False, 5 | opset_version=11, 6 | save_file='end2end.onnx', 7 | input_names=['input'], 8 | output_names=['output'], 9 | input_shape=None, 10 | optimize=True) 11 | -------------------------------------------------------------------------------- /segmentation/deploy/configs/mmseg/segmentation_static.py: -------------------------------------------------------------------------------- 1 | _base_ = ['../_base_/onnx_config.py'] 2 | codebase_config = dict(type='mmseg', task='Segmentation', with_argmax=True) 3 | -------------------------------------------------------------------------------- /segmentation/deploy/configs/mmseg/segmentation_tensorrt_static-512x512.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./segmentation_static.py', '../_base_/backends/tensorrt.py'] 2 | 3 | onnx_config = dict(input_shape=[512, 512]) 4 | backend_config = dict( 5 | common_config=dict(max_workspace_size=1 << 30), 6 | model_inputs=[ 7 | dict( 8 | input_shapes=dict( 9 | input=dict( 10 | min_shape=[1, 3, 512, 512], 11 | opt_shape=[1, 3, 512, 512], 12 | max_shape=[1, 3, 512, 512]))) 13 | ]) 14 | -------------------------------------------------------------------------------- /segmentation/deploy/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/DCNv4/4b848f7dd7da74ff03f7d278f902c6fd05b391b5/segmentation/deploy/demo.png -------------------------------------------------------------------------------- /segmentation/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29510} 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 10 | -------------------------------------------------------------------------------- /segmentation/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | PORT=${PORT:-29300} 6 | 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} 10 | -------------------------------------------------------------------------------- /segmentation/image_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
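# Illustrative usage (the checkpoint filename is a placeholder; `img` may be
# a single image file or a directory of images, see main() below):
#   python image_demo.py demo.jpg \
#       configs/ade20k/upernet_flash_internimage_s_512_160k_ade20k.py \
#       flash_intern_image_s.pth --out demo --palette ade20k --opacity 0.5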
2 | from argparse import ArgumentParser
3 | 
4 | import mmcv
5 | 
6 | import mmcv_custom  # noqa: F401,F403
7 | import mmseg_custom  # noqa: F401,F403
8 | from mmseg.apis import inference_segmentor, init_segmentor, show_result_pyplot
9 | from mmseg.core.evaluation import get_palette
10 | from mmcv.runner import load_checkpoint
11 | from mmseg.core import get_classes
12 | import cv2
13 | import os.path as osp
14 | import os
15 | 
16 | 
17 | def test_single_image(model, img_name, out_dir, color_palette, opacity):
18 |     result = inference_segmentor(model, img_name)
19 | 
20 |     # show the results
21 |     if hasattr(model, 'module'):
22 |         model = model.module
23 |     img = model.show_result(img_name, result,
24 |                             palette=color_palette,
25 |                             show=False, opacity=opacity)
26 | 
27 |     # save the results
28 |     mmcv.mkdir_or_exist(out_dir)
29 |     out_path = osp.join(out_dir, osp.basename(img_name))
30 |     cv2.imwrite(out_path, img)
31 |     print(f"Result is saved at {out_path}")
32 | 
33 | 
34 | def main():
35 |     parser = ArgumentParser()
36 |     parser.add_argument('img', help='Image file or a directory containing images')
37 |     parser.add_argument('config', help='Config file')
38 |     parser.add_argument('checkpoint', help='Checkpoint file')
39 |     parser.add_argument('--out', type=str, default="demo", help='output directory')
40 |     parser.add_argument(
41 |         '--device', default='cuda:0', help='Device used for inference')
42 |     parser.add_argument(
43 |         '--palette',
44 |         default='ade20k',
45 |         choices=['ade20k', 'cityscapes', 'cocostuff'],
46 |         help='Color palette used for segmentation map')
47 |     parser.add_argument(
48 |         '--opacity',
49 |         type=float,
50 |         default=0.5,
51 |         help='Opacity of painted segmentation map. In (0, 1] range.')
52 |     args = parser.parse_args()
53 | 
54 |     # build the model from a config file and a checkpoint file
55 |     model = init_segmentor(args.config, checkpoint=None, device=args.device)
56 |     checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
57 |     if 'CLASSES' in checkpoint.get('meta', {}):
58 |         model.CLASSES = checkpoint['meta']['CLASSES']
59 |     else:
60 |         model.CLASSES = get_classes(args.palette)
61 | 
62 |     # check whether args.img is a directory or a single image
63 |     if osp.isdir(args.img):
64 |         for img in os.listdir(args.img):
65 |             test_single_image(model, osp.join(args.img, img), args.out, get_palette(args.palette), args.opacity)
66 |     else:
67 |         test_single_image(model, args.img, args.out, get_palette(args.palette), args.opacity)
68 | 
69 | if __name__ == '__main__':
70 |     main()
--------------------------------------------------------------------------------
/segmentation/mmcv_custom/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DCNv4
3 | # Copyright (c) 2024 OpenGVLab
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # --------------------------------------------------------
6 | 
7 | # -*- coding: utf-8 -*-
8 | from .custom_layer_decay_optimizer_constructor import CustomLayerDecayOptimizerConstructor
9 | from .layer_decay import LearningRateDecayOptimizerConstructor
10 | from .layer_decay_vit import LayerDecayOptimizerConstructor_vit
11 | __all__ = ['CustomLayerDecayOptimizerConstructor',
12 |            'LearningRateDecayOptimizerConstructor',
13 |            'LayerDecayOptimizerConstructor_vit']
14 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DCNv4
3 | # Copyright (c) 2024 OpenGVLab
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # --------------------------------------------------------
6 | 
7 | from .models import *  # noqa: F401,F403
8 | from .datasets import *  # noqa: F401,F403
9 | from .core import *  # noqa: F401,F403
10 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Shanghai AI Lab. All rights reserved.
2 | from mmseg.core.evaluation import *  # noqa: F401, F403
3 | from mmseg.core.seg import *  # noqa: F401, F403
4 | 
5 | from .anchor import *  # noqa: F401,F403
6 | from .box import *  # noqa: F401,F403
7 | from .evaluation import *  # noqa: F401,F403
8 | from .mask import *  # noqa: F401,F403
9 | from .utils import *  # noqa: F401, F403
10 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/anchor/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Shanghai AI Lab. All rights reserved.
2 | from .point_generator import MlvlPointGenerator  # noqa: F401,F403
3 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/anchor/builder.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
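# Illustrative note: PRIOR_GENERATORS and its alias ANCHOR_GENERATORS (both
# defined below) share one registry, so a hypothetical config such as
#   prior_cfg = dict(type='MlvlPointGenerator', strides=[4, 8, 16, 32])
# can be instantiated with build_prior_generator(prior_cfg);
# build_anchor_generator(prior_cfg) still works but emits a deprecation warning.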
2 | import warnings
3 | 
4 | from mmcv.utils import Registry, build_from_cfg
5 | 
6 | PRIOR_GENERATORS = Registry('Generator for anchors and points')
7 | 
8 | ANCHOR_GENERATORS = PRIOR_GENERATORS
9 | 
10 | 
11 | def build_prior_generator(cfg, default_args=None):
12 |     return build_from_cfg(cfg, PRIOR_GENERATORS, default_args)
13 | 
14 | 
15 | def build_anchor_generator(cfg, default_args=None):
16 |     warnings.warn(
17 |         '``build_anchor_generator`` is deprecated, please use '
18 |         '``build_prior_generator`` instead')
19 |     return build_prior_generator(cfg, default_args=default_args)
20 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/box/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Shanghai AI Lab. All rights reserved.
2 | from .builder import *  # noqa: F401,F403
3 | from .samplers import MaskPseudoSampler  # noqa: F401,F403
4 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/box/builder.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from mmcv.utils import Registry, build_from_cfg
3 | 
4 | BBOX_SAMPLERS = Registry('bbox_sampler')
5 | BBOX_CODERS = Registry('bbox_coder')
6 | 
7 | 
8 | def build_sampler(cfg, **default_args):
9 |     """Builder of box sampler."""
10 |     return build_from_cfg(cfg, BBOX_SAMPLERS, default_args)
11 | 
12 | 
13 | def build_bbox_coder(cfg, **default_args):
14 |     """Builder of box coder."""
15 |     return build_from_cfg(cfg, BBOX_CODERS, default_args)
16 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/box/samplers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Shanghai AI Lab. All rights reserved.
2 | from .mask_pseudo_sampler import MaskPseudoSampler  # noqa: F401,F403
3 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/box/samplers/mask_pseudo_sampler.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | """copy from
3 | https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py."""
4 | 
5 | import torch
6 | 
7 | from ..builder import BBOX_SAMPLERS
8 | from .base_sampler import BaseSampler
9 | from .mask_sampling_result import MaskSamplingResult
10 | 
11 | 
12 | @BBOX_SAMPLERS.register_module()
13 | class MaskPseudoSampler(BaseSampler):
14 |     """A pseudo sampler that does not actually do any sampling."""
15 |     def __init__(self, **kwargs):
16 |         pass
17 | 
18 |     def _sample_pos(self, **kwargs):
19 |         """Sample positive samples."""
20 |         raise NotImplementedError
21 | 
22 |     def _sample_neg(self, **kwargs):
23 |         """Sample negative samples."""
24 |         raise NotImplementedError
25 | 
26 |     def sample(self, assign_result, masks, gt_masks, **kwargs):
27 |         """Directly returns the positive and negative indices of samples.
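        Positive samples are the proposals matched to a ground-truth instance
        (``assign_result.gt_inds > 0``); negatives are those assigned to
        background (``assign_result.gt_inds == 0``), as computed below.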
28 | 
29 |         Args:
30 |             assign_result (:obj:`AssignResult`): Assigned results
31 |             masks (torch.Tensor): Predicted masks
32 |             gt_masks (torch.Tensor): Ground truth masks
33 |         Returns:
34 |             :obj:`SamplingResult`: sampler results
35 |         """
36 |         pos_inds = torch.nonzero(assign_result.gt_inds > 0,
37 |                                  as_tuple=False).squeeze(-1).unique()
38 |         neg_inds = torch.nonzero(assign_result.gt_inds == 0,
39 |                                  as_tuple=False).squeeze(-1).unique()
40 |         gt_flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8)
41 |         sampling_result = MaskSamplingResult(pos_inds, neg_inds, masks,
42 |                                              gt_masks, assign_result, gt_flags)
43 |         return sampling_result
44 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/box/samplers/mask_sampling_result.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | """copy from
3 | https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py."""
4 | 
5 | import torch
6 | 
7 | from .sampling_result import SamplingResult
8 | 
9 | 
10 | class MaskSamplingResult(SamplingResult):
11 |     """Mask sampling result."""
12 |     def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result,
13 |                  gt_flags):
14 |         self.pos_inds = pos_inds
15 |         self.neg_inds = neg_inds
16 |         self.pos_masks = masks[pos_inds]
17 |         self.neg_masks = masks[neg_inds]
18 |         self.pos_is_gt = gt_flags[pos_inds]
19 | 
20 |         self.num_gts = gt_masks.shape[0]
21 |         self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
22 | 
23 |         if gt_masks.numel() == 0:
24 |             # hack for index error case
25 |             assert self.pos_assigned_gt_inds.numel() == 0
26 |             self.pos_gt_masks = torch.empty_like(gt_masks)
27 |         else:
28 |             self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :]
29 | 
30 |         if assign_result.labels is not None:
31 |             self.pos_gt_labels = assign_result.labels[pos_inds]
32 |         else:
33 |             self.pos_gt_labels = None
34 | 
35 |     @property
36 |     def masks(self):
37 |         """torch.Tensor: concatenated positive and negative masks"""
38 |         return torch.cat([self.pos_masks, self.neg_masks])
39 | 
40 |     def __nice__(self):
41 |         data = self.info.copy()
42 |         data['pos_masks'] = data.pop('pos_masks').shape
43 |         data['neg_masks'] = data.pop('neg_masks').shape
44 |         parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())]
45 |         body = '    ' + ',\n    '.join(parts)
46 |         return '{\n' + body + '\n}'
47 | 
48 |     @property
49 |     def info(self):
50 |         """Returns a dictionary of info about the object."""
51 |         return {
52 |             'pos_inds': self.pos_inds,
53 |             'neg_inds': self.neg_inds,
54 |             'pos_masks': self.pos_masks,
55 |             'neg_masks': self.neg_masks,
56 |             'pos_is_gt': self.pos_is_gt,
57 |             'num_gts': self.num_gts,
58 |             'pos_assigned_gt_inds': self.pos_assigned_gt_inds,
59 |         }
60 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Shanghai AI Lab. All rights reserved.
2 | from .panoptic_utils import INSTANCE_OFFSET  # noqa: F401,F403
3 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/evaluation/panoptic_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | # A custom value to distinguish instance ID and category ID; it needs to
3 | # be greater than the number of categories.
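# For example, with INSTANCE_OFFSET = 1000 as set below, the third instance
# (ins_id = 2) of category 17 encodes to pan_id = 2 * 1000 + 17 = 2017; a
# pixel decodes back via cat_id = pan_id % INSTANCE_OFFSET and
# ins_id = pan_id // INSTANCE_OFFSET.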
4 | # For a pixel in the panoptic result map:
5 | # pan_id = ins_id * INSTANCE_OFFSET + cat_id
6 | INSTANCE_OFFSET = 1000
7 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/mask/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Shanghai AI Lab. All rights reserved.
2 | from .utils import mask2bbox  # noqa: F401,F403
3 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/mask/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import mmcv
3 | import numpy as np
4 | import pycocotools.mask as mask_util
5 | import torch
6 | 
7 | 
8 | def split_combined_polys(polys, poly_lens, polys_per_mask):
9 |     """Split the combined 1-D polys into masks.
10 | 
11 |     A mask is represented as a list of polys, and a poly is represented as
12 |     a 1-D array. In dataset, all masks are concatenated into a single 1-D
13 |     tensor. Here we need to split the tensor into original representations.
14 | 
15 |     Args:
16 |         polys (list): a list (length = image num) of 1-D tensors
17 |         poly_lens (list): a list (length = image num) of poly length
18 |         polys_per_mask (list): a list (length = image num) of poly number
19 |             of each mask
20 | 
21 |     Returns:
22 |         list: a list (length = image num) of list (length = mask num) of \
23 |             list (length = poly num) of numpy array.
24 |     """
25 |     mask_polys_list = []
26 |     for img_id in range(len(polys)):
27 |         polys_single = polys[img_id]
28 |         polys_lens_single = poly_lens[img_id].tolist()
29 |         polys_per_mask_single = polys_per_mask[img_id].tolist()
30 | 
31 |         split_polys = mmcv.slice_list(polys_single, polys_lens_single)
32 |         mask_polys = mmcv.slice_list(split_polys, polys_per_mask_single)
33 |         mask_polys_list.append(mask_polys)
34 |     return mask_polys_list
35 | 
36 | 
37 | # TODO: move this function to a more proper place
38 | def encode_mask_results(mask_results):
39 |     """Encode bitmap mask to RLE code.
40 | 
41 |     Args:
42 |         mask_results (list | tuple[list]): bitmap mask results.
43 |             In mask scoring rcnn, mask_results is a tuple of (segm_results,
44 |             segm_cls_score).
45 | 
46 |     Returns:
47 |         list | tuple: RLE encoded mask.
48 |     """
49 |     if isinstance(mask_results, tuple):  # mask scoring
50 |         cls_segms, cls_mask_scores = mask_results
51 |     else:
52 |         cls_segms = mask_results
53 |     num_classes = len(cls_segms)
54 |     encoded_mask_results = [[] for _ in range(num_classes)]
55 |     for i in range(len(cls_segms)):
56 |         for cls_segm in cls_segms[i]:
57 |             encoded_mask_results[i].append(
58 |                 mask_util.encode(
59 |                     np.array(
60 |                         cls_segm[:, :, np.newaxis], order='F',
61 |                         dtype='uint8'))[0])  # encoded with RLE
62 |     if isinstance(mask_results, tuple):
63 |         return encoded_mask_results, cls_mask_scores
64 |     else:
65 |         return encoded_mask_results
66 | 
67 | 
68 | def mask2bbox(masks):
69 |     """Obtain tight bounding boxes of binary masks.
70 | 
71 |     Args:
72 |         masks (Tensor): Binary mask of shape (n, h, w).
73 | 
74 |     Returns:
75 |         Tensor: Bbox with shape (n, 4) of \
76 |             positive region in binary mask.
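        Example (illustrative): for a single 4x4 mask whose ones occupy rows
        1-2 and columns 1-2, ``x`` and ``y`` below are both [1, 2], so the
        returned box is [1., 1., 3., 3.] in (x1, y1, x2, y2) form with
        exclusive end coordinates (hence the ``+ 1``).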
77 | """ 78 | N = masks.shape[0] 79 | bboxes = masks.new_zeros((N, 4), dtype=torch.float32) 80 | x_any = torch.any(masks, dim=1) 81 | y_any = torch.any(masks, dim=2) 82 | for i in range(N): 83 | x = torch.where(x_any[i, :])[0] 84 | y = torch.where(y_any[i, :])[0] 85 | if len(x) > 0 and len(y) > 0: 86 | bboxes[i, :] = bboxes.new_tensor( 87 | [x[0], y[0], x[-1] + 1, y[-1] + 1]) 88 | 89 | return bboxes 90 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dist_utils import (DistOptimizerHook, all_reduce_dict, allreduce_grads, 3 | reduce_mean) 4 | from .misc import add_prefix, multi_apply 5 | 6 | __all__ = [ 7 | 'add_prefix', 'multi_apply', 'DistOptimizerHook', 'allreduce_grads', 8 | 'all_reduce_dict', 'reduce_mean' 9 | ] 10 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/core/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def multi_apply(func, *args, **kwargs): 3 | """Apply function to a list of arguments. 4 | 5 | Note: 6 | This function applies the ``func`` to multiple inputs and 7 | map the multiple outputs of the ``func`` into different 8 | list. Each list contains the same type of outputs corresponding 9 | to different inputs. 10 | 11 | Args: 12 | func (Function): A function that will be applied to a list of 13 | arguments 14 | 15 | Returns: 16 | tuple(list): A tuple containing multiple list, each list contains \ 17 | a kind of returned results by the function 18 | """ 19 | pfunc = partial(func, **kwargs) if kwargs else func 20 | map_results = map(pfunc, *args) 21 | return tuple(map(list, zip(*map_results))) 22 | 23 | 24 | def add_prefix(inputs, prefix): 25 | """Add prefix for dict. 26 | 27 | Args: 28 | inputs (dict): The input dict with str keys. 29 | prefix (str): The prefix to add. 30 | 31 | Returns: 32 | 33 | dict: The dict with keys updated with ``prefix``. 34 | """ 35 | 36 | outputs = dict() 37 | for name, value in inputs.items(): 38 | outputs[f'{prefix}.{name}'] = value 39 | 40 | return outputs 41 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .mapillary import MapillaryDataset # noqa: F401,F403 3 | from .nyu_depth_v2 import NYUDepthV2Dataset # noqa: F401,F403 4 | from .pipelines import * # noqa: F401,F403 5 | from .dataset_wrappers import ConcatDataset 6 | 7 | 8 | __all__ = [ 9 | 'MapillaryDataset', 'NYUDepthV2Dataset', 'ConcatDataset' 10 | ] -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/mapillary.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from mmseg.datasets.builder import DATASETS 7 | from mmseg.datasets.custom import CustomDataset 8 | 9 | 10 | @DATASETS.register_module() 11 | class MapillaryDataset(CustomDataset): 12 | """Mapillary dataset. 13 | """ 14 | CLASSES = ('Bird', 'Ground Animal', 'Curb', 'Fence', 'Guard Rail', 'Barrier', 15 | 'Wall', 'Bike Lane', 'Crosswalk - Plain', 'Curb Cut', 'Parking', 'Pedestrian Area', 16 | 'Rail Track', 'Road', 'Service Lane', 'Sidewalk', 'Bridge', 'Building', 'Tunnel', 17 | 'Person', 'Bicyclist', 'Motorcyclist', 'Other Rider', 'Lane Marking - Crosswalk', 18 | 'Lane Marking - General', 'Mountain', 'Sand', 'Sky', 'Snow', 'Terrain', 'Vegetation', 19 | 'Water', 'Banner', 'Bench', 'Bike Rack', 'Billboard', 'Catch Basin', 'CCTV Camera', 20 | 'Fire Hydrant', 'Junction Box', 'Mailbox', 'Manhole', 'Phone Booth', 'Pothole', 21 | 'Street Light', 'Pole', 'Traffic Sign Frame', 'Utility Pole', 'Traffic Light', 22 | 'Traffic Sign (Back)', 'Traffic Sign (Front)', 'Trash Can', 'Bicycle', 'Boat', 23 | 'Bus', 'Car', 'Caravan', 'Motorcycle', 'On Rails', 'Other Vehicle', 'Trailer', 24 | 'Truck', 'Wheeled Slow', 'Car Mount', 'Ego Vehicle', 'Unlabeled') 25 | 26 | PALETTE = [[165, 42, 42], [0, 192, 0], [196, 196, 196], [190, 153, 153], 27 | [180, 165, 180], [90, 120, 150], [102, 102, 156], [128, 64, 255], 28 | [140, 140, 200], [170, 170, 170], [250, 170, 160], [96, 96, 96], 29 | [230, 150, 140], [128, 64, 128], [110, 110, 110], [244, 35, 232], 30 | [150, 100, 100], [70, 70, 70], [150, 120, 90], [220, 20, 60], 31 | [255, 0, 0], [255, 0, 100], [255, 0, 200], [200, 128, 128], 32 | [255, 255, 255], [64, 170, 64], [230, 160, 50], [70, 130, 180], 33 | [190, 255, 255], [152, 251, 152], [107, 142, 35], [0, 170, 30], 34 | [255, 255, 128], [250, 0, 30], [100, 140, 180], [220, 220, 220], 35 | [220, 128, 128], [222, 40, 40], [100, 170, 30], [40, 40, 40], 36 | [33, 33, 33], [100, 128, 160], [142, 0, 0], [70, 100, 150], 37 | [210, 170, 100], [153, 153, 153], [128, 128, 128], [0, 0, 80], 38 | [250, 170, 30], [192, 192, 192], [220, 220, 0], [140, 140, 20], 39 | [119, 11, 32], [150, 0, 255], [0, 60, 100], [0, 0, 142], [0, 0, 90], 40 | [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], [0, 0, 70], 41 | [0, 0, 192], [32, 32, 32], [120, 10, 10], [0, 0, 0]] 42 | 43 | def __init__(self, **kwargs): 44 | super(MapillaryDataset, self).__init__( 45 | img_suffix='.jpg', 46 | seg_map_suffix='.png', 47 | reduce_zero_label=False, 48 | **kwargs) -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/nyu_depth_v2.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under 
The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from mmseg.datasets.builder import DATASETS 7 | from mmseg.datasets.custom import CustomDataset 8 | 9 | 10 | @DATASETS.register_module() 11 | class NYUDepthV2Dataset(CustomDataset): 12 | """NYU Depth V2 dataset. 13 | """ 14 | 15 | CLASSES = ('wall', 'floor', 'cabinet', 'bed', 'chair', 16 | 'sofa', 'table', 'door', 'window', 'bookshelf', 17 | 'picture', 'counter', 'blinds', 'desk', 'shelves', 18 | 'curtain', 'dresser', 'pillow', 'mirror', 'floor mat', 19 | 'clothes', 'ceiling', 'books', 'refridgerator', 'television', 20 | 'paper', 'towel', 'shower curtain', 'box', 'whiteboard', 21 | 'person', 'night stand', 'toilet', 'sink', 'lamp', 22 | 'bathtub', 'bag', 'otherstructure', 'otherfurniture', 'otherprop') 23 | 24 | 25 | PALETTE = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], 26 | [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], 27 | [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], 28 | [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], 29 | [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], 30 | [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], 31 | [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], 32 | [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], 33 | [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], 34 | [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],] 35 | 36 | def __init__(self, split, **kwargs): 37 | super(NYUDepthV2Dataset, self).__init__( 38 | img_suffix='.png', 39 | seg_map_suffix='.png', 40 | split=split, 41 | reduce_zero_label=True, 42 | **kwargs) 43 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .formatting import DefaultFormatBundle, ToMask 3 | from .transform import MapillaryHack, PadShortSide, SETR_Resize 4 | 5 | __all__ = [ 6 | 'DefaultFormatBundle', 'ToMask', 'SETR_Resize', 7 | 'PadShortSide', 'MapillaryHack' 8 | ] 9 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/pipelines/formatting.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import numpy as np 3 | from mmcv.parallel import DataContainer as DC 4 | from mmseg.datasets.builder import PIPELINES 5 | from mmseg.datasets.pipelines.formatting import to_tensor 6 | 7 | 8 | @PIPELINES.register_module(force=True) 9 | class DefaultFormatBundle(object): 10 | """Default formatting bundle. 11 | 12 | It simplifies the pipeline of formatting common fields, including "img" 13 | and "gt_semantic_seg". These fields are formatted as follows. 14 | 15 | - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) 16 | - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, 17 | (3)to DataContainer (stack=True) 18 | """ 19 | def __call__(self, results): 20 | """Call function to transform and format common fields in results. 21 | 22 | Args: 23 | results (dict): Result dict contains the data to convert. 24 | 25 | Returns: 26 | dict: The result dict contains the data that is formatted with 27 | default bundle. 
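            For example, an (H, W, 3) uint8 image comes back as a (3, H, W)
            tensor wrapped in a DataContainer with stack=True, and an (H, W)
            'gt_semantic_seg' map as a (1, H, W) int64 tensor.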
28 | """ 29 | 30 | if 'img' in results: 31 | img = results['img'] 32 | if len(img.shape) < 3: 33 | img = np.expand_dims(img, -1) 34 | img = np.ascontiguousarray(img.transpose(2, 0, 1)) 35 | results['img'] = DC(to_tensor(img), stack=True) 36 | if 'gt_semantic_seg' in results: 37 | # convert to long 38 | results['gt_semantic_seg'] = DC(to_tensor( 39 | results['gt_semantic_seg'][None, ...].astype(np.int64)), 40 | stack=True) 41 | if 'gt_masks' in results: 42 | results['gt_masks'] = DC(to_tensor(results['gt_masks'])) 43 | if 'gt_labels' in results: 44 | results['gt_labels'] = DC(to_tensor(results['gt_labels'])) 45 | 46 | return results 47 | 48 | def __repr__(self): 49 | return self.__class__.__name__ 50 | 51 | 52 | @PIPELINES.register_module() 53 | class ToMask(object): 54 | """Transfer gt_semantic_seg to binary mask and generate gt_labels.""" 55 | def __init__(self, ignore_index=255): 56 | self.ignore_index = ignore_index 57 | 58 | def __call__(self, results): 59 | gt_semantic_seg = results['gt_semantic_seg'] 60 | gt_labels = np.unique(gt_semantic_seg) 61 | # remove ignored region 62 | gt_labels = gt_labels[gt_labels != self.ignore_index] 63 | 64 | gt_masks = [] 65 | for class_id in gt_labels: 66 | gt_masks.append(gt_semantic_seg == class_id) 67 | 68 | if len(gt_masks) == 0: 69 | # Some image does not have annotation (all ignored) 70 | gt_masks = np.empty((0, ) + results['pad_shape'][:-1], dtype=np.int64) 71 | gt_labels = np.empty((0, ), dtype=np.int64) 72 | else: 73 | gt_masks = np.asarray(gt_masks, dtype=np.int64) 74 | gt_labels = np.asarray(gt_labels, dtype=np.int64) 75 | 76 | results['gt_labels'] = gt_labels 77 | results['gt_masks'] = gt_masks 78 | return results 79 | 80 | def __repr__(self): 81 | return self.__class__.__name__ + \ 82 | f'(ignore_index={self.ignore_index})' 83 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .backbones import * # noqa: F401,F403 8 | from .decode_heads import * # noqa: F401,F403 9 | from .losses import * # noqa: F401,F403 10 | from .plugins import * # noqa: F401,F403 11 | from .segmentors import * # noqa: F401,F403 12 | from .utils import * # noqa: F401,F403 -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # FlashInternImage 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .flash_intern_image import FlashInternImage 8 | 9 | __all__ = ['FlashInternImage'] 10 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import warnings # noqa: F401,F403 3 | 4 | from mmcv.utils import Registry 5 | 6 | TRANSFORMER = Registry('Transformer') 7 | MASK_ASSIGNERS = Registry('mask_assigner') 8 | MATCH_COST = Registry('match_cost') 9 | 10 | 11 | def build_match_cost(cfg): 12 | """Build Match Cost.""" 13 | return MATCH_COST.build(cfg) 14 | 15 | 16 | def build_assigner(cfg): 17 | """Build Assigner.""" 18 | return MASK_ASSIGNERS.build(cfg) 19 | 20 | 21 | def build_transformer(cfg): 22 | """Build Transformer.""" 23 | return TRANSFORMER.build(cfg) 24 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .mask2former_head import Mask2FormerHead 3 | from .maskformer_head import MaskFormerHead 4 | from .msda import CustomMultiScaleDeformableAttention 5 | __all__ = [ 6 | 'MaskFormerHead', 7 | 'Mask2FormerHead', 8 | ] 9 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy, 3 | cross_entropy, mask_cross_entropy) 4 | from .dice_loss import DiceLoss 5 | from .focal_loss import FocalLoss 6 | from .match_costs import (ClassificationCost, CrossEntropyLossCost, DiceCost, 7 | MaskFocalLossCost) 8 | 9 | __all__ = [ 10 | 'cross_entropy', 'binary_cross_entropy', 'mask_cross_entropy', 11 | 'CrossEntropyLoss', 'DiceLoss', 'FocalLoss', 'ClassificationCost', 12 | 'MaskFocalLossCost', 'DiceCost', 'CrossEntropyLossCost' 13 | ] 14 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from .msdeformattn_pixel_decoder import MSDeformAttnPixelDecoder 3 | from .pixel_decoder import PixelDecoder, TransformerEncoderPixelDecoder 4 | 5 | __all__ = [ 6 | 'PixelDecoder', 'TransformerEncoderPixelDecoder', 7 | 'MSDeformAttnPixelDecoder' 8 | ] 9 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/segmentors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .encoder_decoder_mask2former import EncoderDecoderMask2Former 3 | from .encoder_decoder_mask2former_aug import EncoderDecoderMask2FormerAug 4 | 5 | __all__ = ['EncoderDecoderMask2Former', 'EncoderDecoderMask2FormerAug'] 6 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 
2 | from .assigner import MaskHungarianAssigner 3 | from .point_sample import get_uncertain_point_coords_with_randomness 4 | from .positional_encoding import (LearnedPositionalEncoding, 5 | SinePositionalEncoding) 6 | from .transformer import (DetrTransformerDecoder, DetrTransformerDecoderLayer, 7 | DynamicConv, Transformer) 8 | 9 | __all__ = [ 10 | 'DetrTransformerDecoderLayer', 'DetrTransformerDecoder', 'DynamicConv', 11 | 'Transformer', 'LearnedPositionalEncoding', 'SinePositionalEncoding', 12 | 'MaskHungarianAssigner', 'get_uncertain_point_coords_with_randomness' 13 | ] 14 | -------------------------------------------------------------------------------- /segmentation/ops_dcnv3/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch 8 | -------------------------------------------------------------------------------- /segmentation/ops_dcnv3/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # -------------------------------------------------------- 3 | # DCNv4 4 | # Copyright (c) 2024 OpenGVLab 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | python setup.py build install 9 | -------------------------------------------------------------------------------- /segmentation/ops_dcnv3/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .dcnv3 import DCNv3, DCNv3_pytorch -------------------------------------------------------------------------------- /segmentation/ops_dcnv3/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import os 8 | import glob 9 | 10 | import torch 11 | 12 | from torch.utils.cpp_extension import CUDA_HOME 13 | from torch.utils.cpp_extension import CppExtension 14 | from torch.utils.cpp_extension import CUDAExtension 15 | 16 | from setuptools import find_packages 17 | from setuptools import setup 18 | 19 | requirements = ["torch", "torchvision"] 20 | 21 | 22 | def get_extensions(): 23 | this_dir = os.path.dirname(os.path.abspath(__file__)) 24 | extensions_dir = os.path.join(this_dir, "src") 25 | 26 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 27 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 28 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 29 | 30 | sources = main_file + source_cpu 31 | extension = CppExtension 32 | extra_compile_args = {"cxx": []} 33 | define_macros = [] 34 | 35 | if torch.cuda.is_available() and CUDA_HOME is not None: 36 | extension = CUDAExtension 37 | sources += source_cuda 38 | define_macros += [("WITH_CUDA", None)] 39 
| extra_compile_args["nvcc"] = [
40 |             # "-DCUDA_HAS_FP16=1",
41 |             # "-D__CUDA_NO_HALF_OPERATORS__",
42 |             # "-D__CUDA_NO_HALF_CONVERSIONS__",
43 |             # "-D__CUDA_NO_HALF2_OPERATORS__",
44 |         ]
45 |     else:
46 |         raise NotImplementedError('CUDA is not available')
47 | 
48 |     sources = [os.path.join(extensions_dir, s) for s in sources]
49 |     include_dirs = [extensions_dir]
50 |     ext_modules = [
51 |         extension(
52 |             "DCNv3",
53 |             sources,
54 |             include_dirs=include_dirs,
55 |             define_macros=define_macros,
56 |             extra_compile_args=extra_compile_args,
57 |         )
58 |     ]
59 |     return ext_modules
60 | 
61 | 
62 | setup(
63 |     name="DCNv3",
64 |     version="1.0",
65 |     author="InternImage",
66 |     url="https://github.com/OpenGVLab/InternImage",
67 |     description=
68 |     "PyTorch Wrapper for CUDA Functions of DCNv3",
69 |     packages=find_packages(exclude=(
70 |         "configs",
71 |         "tests",
72 |     )),
73 |     ext_modules=get_extensions(),
74 |     cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
75 | )
76 | 
--------------------------------------------------------------------------------
/segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * InternImage
4 | * Copyright (c) 2022 OpenGVLab
5 | * Licensed under The MIT License [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from
8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
9 | **************************************************************************************************
10 | */
11 | 
12 | #include <vector>
13 | 
14 | #include <ATen/ATen.h>
15 | #include <ATen/cuda/CUDAContext.h>
16 | 
17 | at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
18 |                              const at::Tensor &mask, const int kernel_h,
19 |                              const int kernel_w, const int stride_h,
20 |                              const int stride_w, const int pad_h,
21 |                              const int pad_w, const int dilation_h,
22 |                              const int dilation_w, const int group,
23 |                              const int group_channels, const float offset_scale,
24 |                              const int im2col_step) {
25 |     AT_ERROR("Not implemented on the CPU");
26 | }
27 | 
28 | std::vector<at::Tensor>
29 | dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
30 |                    const at::Tensor &mask, const int kernel_h,
31 |                    const int kernel_w, const int stride_h, const int stride_w,
32 |                    const int pad_h, const int pad_w, const int dilation_h,
33 |                    const int dilation_w, const int group,
34 |                    const int group_channels, const float offset_scale,
35 |                    const at::Tensor &grad_output, const int im2col_step) {
36 |     AT_ERROR("Not implemented on the CPU");
37 | }
38 | 
--------------------------------------------------------------------------------
/segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * InternImage
4 | * Copyright (c) 2022 OpenGVLab
5 | * Licensed under The MIT License [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from
8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
9 | **************************************************************************************************
10 | */
11 | 
12 | #pragma once
13 | #include <torch/extension.h>
14 | 
15 | at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
16 |                              const at::Tensor &mask, const int kernel_h,
17 |                              const int kernel_w, const int stride_h,
18 |                              const int stride_w, const int pad_h,
19 |                              const int pad_w, const int dilation_h,
20 |                              const int dilation_w, const int group,
21 |                              const int group_channels, const float offset_scale,
22 |                              const int im2col_step);
23 | 
24 | std::vector<at::Tensor>
25 | dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
26 |                    const at::Tensor &mask, const int kernel_h,
27 |                    const int kernel_w, const int stride_h, const int stride_w,
28 |                    const int pad_h, const int pad_w, const int dilation_h,
29 |                    const int dilation_w, const int group,
30 |                    const int group_channels, const float offset_scale,
31 |                    const at::Tensor &grad_output, const int im2col_step);
32 | 
--------------------------------------------------------------------------------
/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * InternImage
4 | * Copyright (c) 2022 OpenGVLab
5 | * Licensed under The MIT License [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from
8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
9 | **************************************************************************************************
10 | */
11 | 
12 | #pragma once
13 | #include <torch/extension.h>
14 | 
15 | at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
16 |                               const at::Tensor &mask, const int kernel_h,
17 |                               const int kernel_w, const int stride_h,
18 |                               const int stride_w, const int pad_h,
19 |                               const int pad_w, const int dilation_h,
20 |                               const int dilation_w, const int group,
21 |                               const int group_channels,
22 |                               const float offset_scale, const int im2col_step);
23 | 
24 | std::vector<at::Tensor>
25 | dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
26 |                     const at::Tensor &mask, const int kernel_h,
27 |                     const int kernel_w, const int stride_h, const int stride_w,
28 |                     const int pad_h, const int pad_w, const int dilation_h,
29 |                     const int dilation_w, const int group,
30 |                     const int group_channels, const float offset_scale,
31 |                     const at::Tensor &grad_output, const int im2col_step);
32 | 
--------------------------------------------------------------------------------
/segmentation/ops_dcnv3/src/dcnv3.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * InternImage
4 | * Copyright (c) 2022 OpenGVLab
5 | * Licensed under The MIT License [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from
8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
9 | **************************************************************************************************
10 | */
11 | 
12 | #pragma once
13 | 
14 | #include "cpu/dcnv3_cpu.h"
15 | 
16 | #ifdef WITH_CUDA
17 | #include "cuda/dcnv3_cuda.h"
18 | #endif
19 | 
20 | at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset,
21 |                          const at::Tensor &mask, const int kernel_h,
22 |                          const int kernel_w, const int stride_h,
23 |                          const int stride_w, const int pad_h, const int pad_w,
24 |                          const int dilation_h, const int dilation_w,
25 |                          const int group, const int group_channels,
26 |                          const float offset_scale, const int im2col_step) {
27 |     if (input.type().is_cuda()) {
28 | #ifdef WITH_CUDA
29 |         return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w,
30 |                                   stride_h, stride_w, pad_h, pad_w, dilation_h,
31 |                                   dilation_w, group, group_channels,
32 |                                   offset_scale, im2col_step);
33 | #else
34 |         AT_ERROR("Not compiled with GPU support");
35 | #endif
36 |     }
37 |     AT_ERROR("Not implemented on the CPU");
38 | }
39 | 
40 | std::vector<at::Tensor>
41 | dcnv3_backward(const at::Tensor &input, const at::Tensor &offset,
42 |                const at::Tensor &mask, const int kernel_h, const int kernel_w,
43 |                const int stride_h, const int stride_w, const int pad_h,
44 |                const int pad_w, const int dilation_h, const int dilation_w,
45 |                const int group, const int group_channels,
46 |                const float offset_scale, const at::Tensor &grad_output,
47 |                const int im2col_step) {
48 |     if (input.type().is_cuda()) {
49 | #ifdef WITH_CUDA
50 |         return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w,
51 |                                    stride_h, stride_w, pad_h, pad_w, dilation_h,
52 |                                    dilation_w, group, group_channels,
53 |                                    offset_scale, grad_output, im2col_step);
54 | #else
55 |         AT_ERROR("Not compiled with GPU support");
56 | #endif
57 |     }
58 |     AT_ERROR("Not implemented on the CPU");
59 | }
60 | 
--------------------------------------------------------------------------------
/segmentation/ops_dcnv3/src/vision.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #include "dcnv3.h" 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 15 | m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward"); 16 | m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward"); 17 | } 18 | -------------------------------------------------------------------------------- /segmentation/slurm_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | CHECKPOINT=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | PY_ARGS=${@:5} 13 | SRUN_ARGS=${SRUN_ARGS:-""} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | --quotatype=auto \ 24 | ${SRUN_ARGS} \ 25 | python -u test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} 26 | -------------------------------------------------------------------------------- /segmentation/slurm_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | GPUS=${GPUS:-8} 9 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 10 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 11 | SRUN_ARGS=${SRUN_ARGS:-""} 12 | PY_ARGS=${@:4} 13 | 14 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 15 | srun -p ${PARTITION} \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --quotatype=reserved \ 22 | --kill-on-bad-exit=1 \ 23 | ${SRUN_ARGS} \ 24 | python -u train.py ${CONFIG} --launcher="slurm" ${PY_ARGS} 25 | --------------------------------------------------------------------------------