├── .gitignore ├── DCNv4_op ├── DCNv4 │ ├── __init__.py │ ├── functions │ │ ├── __init__.py │ │ ├── dcnv4_func.py │ │ ├── flash_deform_attn_func.py │ │ └── table.py │ └── modules │ │ ├── __init__.py │ │ ├── dcnv4.py │ │ └── flash_deform_attn.py ├── MANIFEST.in ├── __init__.py ├── make.sh ├── scripts │ ├── find_best.py │ ├── search_bwd.sh │ ├── search_dcnv4.py │ ├── search_dcnv4_bwd.py │ ├── search_dcnv4_bwd_engine.py │ ├── search_dcnv4_engine.py │ ├── search_fwd.sh │ ├── test_dcnv4.py │ ├── test_dcnv4_bwd.py │ ├── test_flash_deform_attn.py │ └── test_flash_deform_attn_backward.py ├── setup.py └── src │ ├── cuda │ ├── common.h │ ├── dcnv4_col2im_cuda.cuh │ ├── dcnv4_cuda.cu │ ├── dcnv4_cuda.h │ ├── dcnv4_im2col_cuda.cuh │ ├── flash_deform_attn_cuda.cu │ ├── flash_deform_attn_cuda.h │ ├── flash_deform_col2im_cuda.cuh │ └── flash_deform_im2col_cuda.cuh │ ├── dcnv4.h │ └── vision.cpp ├── LICENSE ├── README.md ├── classification ├── README.md ├── config.py ├── configs │ ├── flash_intern_image_b_1k_224.yaml │ ├── flash_intern_image_l_22kto1k_384.yaml │ ├── flash_intern_image_s_1k_224.yaml │ └── flash_intern_image_t_1k_224.yaml ├── dataset │ ├── __init__.py │ ├── build.py │ ├── cached_image_folder.py │ ├── samplers.py │ └── zipreader.py ├── ddp_hooks.py ├── ema_deepspeed.py ├── eval.sh ├── export.py ├── extract_feature.py ├── logger.py ├── lr_scheduler.py ├── main.py ├── main_accelerate.py ├── main_deepspeed.py ├── meta_data │ ├── 22k_class_to_idx.json │ ├── map22kto1k.txt │ └── meta ├── models │ ├── __init__.py │ ├── build.py │ ├── flash_intern_image.py │ └── intern_image.py ├── ops_dcnv3 │ ├── functions │ │ ├── __init__.py │ │ └── dcnv3_func.py │ ├── make.sh │ ├── modules │ │ ├── __init__.py │ │ └── dcnv3.py │ ├── setup.py │ ├── src │ │ ├── cpu │ │ │ ├── dcnv3_cpu.cpp │ │ │ └── dcnv3_cpu.h │ │ ├── cuda │ │ │ ├── dcnv3_cuda.cu │ │ │ ├── dcnv3_cuda.h │ │ │ └── dcnv3_im2col_cuda.cuh │ │ ├── dcnv3.h │ │ └── vision.cpp │ └── test.py ├── optimizer.py ├── train_in1k.sh ├── train_in1k_deepspeed.sh └── utils.py ├── detection ├── README.md ├── configs │ ├── _base_ │ │ ├── datasets │ │ │ ├── coco_detection.py │ │ │ ├── coco_instance.py │ │ │ └── crowd_human.py │ │ ├── default_runtime.py │ │ ├── models │ │ │ ├── cascade_mask_rcnn_convnext_fpn.py │ │ │ ├── cascade_mask_rcnn_r50_fpn.py │ │ │ ├── cascade_mask_rcnn_r50_fpn_crowdhuman.py │ │ │ ├── cascade_rcnn_r50_fpn.py │ │ │ ├── fast_rcnn_r50_fpn.py │ │ │ ├── faster_rcnn_r50_caffe_c4.py │ │ │ ├── faster_rcnn_r50_caffe_dc5.py │ │ │ ├── faster_rcnn_r50_fpn.py │ │ │ ├── mask_rcnn_convnext_fpn.py │ │ │ ├── mask_rcnn_r50_caffe_c4.py │ │ │ ├── mask_rcnn_r50_fpn.py │ │ │ ├── retinanet_r50_fpn.py │ │ │ ├── rpn_r50_caffe_c4.py │ │ │ ├── rpn_r50_fpn.py │ │ │ └── ssd300.py │ │ └── schedules │ │ │ ├── schedule_1x.py │ │ │ └── schedule_3x.py │ └── coco │ │ ├── README.md │ │ ├── cascade_flash_intern_image_l_fpn_1x_coco.py │ │ ├── cascade_flash_intern_image_l_fpn_3x_coco.py │ │ ├── dino_4scale_flash_internimage_b_1x_coco.py │ │ ├── dino_4scale_flash_internimage_l_1x_coco.py │ │ ├── dino_4scale_flash_internimage_s_1x_coco.py │ │ ├── dino_4scale_flash_internimage_t_1x_coco.py │ │ ├── mask_rcnn_flash_intern_image_b_fpn_1x_coco.py │ │ ├── mask_rcnn_flash_intern_image_b_fpn_3x_coco.py │ │ ├── mask_rcnn_flash_intern_image_s_fpn_1x_coco.py │ │ ├── mask_rcnn_flash_intern_image_s_fpn_3x_coco.py │ │ ├── mask_rcnn_flash_intern_image_t_fpn_1x_coco.py │ │ └── mask_rcnn_flash_intern_image_t_fpn_3x_coco.py ├── dist_test.sh ├── dist_train.sh ├── get_flops.py ├── image_demo.py ├── 
mmcv_custom │ ├── __init__.py │ └── custom_layer_decay_optimizer_constructor.py ├── mmdet_custom │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ └── crowd_human.py │ └── models │ │ ├── __init__.py │ │ ├── backbones │ │ ├── __init__.py │ │ └── flash_intern_image.py │ │ ├── dense_heads │ │ ├── __init__.py │ │ ├── bbox_head.py │ │ ├── deformable_detr_head.py │ │ ├── detr_head.py │ │ ├── dino_head.py │ │ ├── mask_rcnn.py │ │ ├── msda.py │ │ └── two_stage.py │ │ ├── detectors │ │ ├── __init__.py │ │ └── dino.py │ │ ├── necks │ │ └── fpn.py │ │ └── utils │ │ ├── __init__.py │ │ ├── convModule_norm.py │ │ ├── query_denoising.py │ │ └── transformer.py ├── ops_dcnv3 │ ├── functions │ │ ├── __init__.py │ │ └── dcnv3_func.py │ ├── make.sh │ ├── modules │ │ ├── __init__.py │ │ └── dcnv3.py │ ├── setup.py │ ├── src │ │ ├── cpu │ │ │ ├── dcnv3_cpu.cpp │ │ │ └── dcnv3_cpu.h │ │ ├── cuda │ │ │ ├── dcnv3_cuda.cu │ │ │ ├── dcnv3_cuda.h │ │ │ └── dcnv3_im2col_cuda.cuh │ │ ├── dcnv3.h │ │ └── vision.cpp │ └── test.py ├── slurm_test.sh ├── slurm_train.sh ├── test.py ├── tools │ ├── create_crowd_anno.py │ └── evaluate │ │ └── __init__.py └── train.py └── segmentation ├── README.md ├── configs ├── _base_ │ ├── datasets │ │ ├── ade20k.py │ │ ├── ade20k_640x640.py │ │ ├── chase_db1.py │ │ ├── cityscapes.py │ │ ├── cityscapes_1024x1024.py │ │ ├── cityscapes_extra.py │ │ ├── coco-stuff10k.py │ │ ├── coco-stuff164k.py │ │ ├── drive.py │ │ ├── hrf.py │ │ ├── loveda.py │ │ ├── mapillary.py │ │ ├── mapillary_1024x1024.py │ │ ├── nyu_depth_v2.py │ │ ├── pascal_context.py │ │ ├── pascal_context_59.py │ │ ├── pascal_voc12.py │ │ ├── pascal_voc12_aug.py │ │ ├── potsdam.py │ │ └── stare.py │ ├── default_runtime.py │ ├── models │ │ ├── mask2former_beit.py │ │ ├── segformer_mit-b0.py │ │ ├── upernet_convnext.py │ │ ├── upernet_r50.py │ │ └── upernet_swin.py │ └── schedules │ │ ├── schedule_160k.py │ │ ├── schedule_20k.py │ │ ├── schedule_320k.py │ │ ├── schedule_40k.py │ │ └── schedule_80k.py └── ade20k │ ├── README.md │ ├── mask2former_flash_internimage_b_640_160k_ade20k_ss.py │ ├── mask2former_flash_internimage_l_640_160k_ade20k_ss.py │ ├── mask2former_flash_internimage_s_640_160k_ade20k_ss.py │ ├── mask2former_flash_internimage_s_640_160k_ade20k_ss_nsmx.py │ ├── mask2former_flash_internimage_t_512_160k_ade20k_ss.py │ ├── upernet_flash_internimage_b_512_160k_ade20k.py │ ├── upernet_flash_internimage_l_640_160k_ade20k.py │ ├── upernet_flash_internimage_s_512_160k_ade20k.py │ └── upernet_flash_internimage_t_512_160k_ade20k.py ├── deploy ├── configs │ ├── _base_ │ │ ├── backends │ │ │ └── tensorrt.py │ │ └── onnx_config.py │ └── mmseg │ │ ├── segmentation_static.py │ │ └── segmentation_tensorrt_static-512x512.py └── demo.png ├── dist_test.sh ├── dist_train.sh ├── get_flops.py ├── image_demo.py ├── mmcv_custom ├── __init__.py ├── custom_layer_decay_optimizer_constructor.py ├── layer_decay.py └── layer_decay_vit.py ├── mmseg_custom ├── __init__.py ├── core │ ├── __init__.py │ ├── anchor │ │ ├── __init__.py │ │ ├── builder.py │ │ └── point_generator.py │ ├── box │ │ ├── __init__.py │ │ ├── builder.py │ │ └── samplers │ │ │ ├── __init__.py │ │ │ ├── base_sampler.py │ │ │ ├── mask_pseudo_sampler.py │ │ │ ├── mask_sampling_result.py │ │ │ └── sampling_result.py │ ├── evaluation │ │ ├── __init__.py │ │ └── panoptic_utils.py │ ├── mask │ │ ├── __init__.py │ │ └── utils.py │ └── utils │ │ ├── __init__.py │ │ ├── dist_utils.py │ │ └── misc.py ├── datasets │ ├── __init__.py │ ├── dataset_wrappers.py │ ├── mapillary.py │ ├── 
nyu_depth_v2.py │ └── pipelines │ │ ├── __init__.py │ │ ├── formatting.py │ │ └── transform.py └── models │ ├── __init__.py │ ├── backbones │ ├── __init__.py │ └── flash_intern_image.py │ ├── builder.py │ ├── decode_heads │ ├── __init__.py │ ├── mask2former_head.py │ ├── maskformer_head.py │ └── msda.py │ ├── losses │ ├── __init__.py │ ├── cross_entropy_loss.py │ ├── dice_loss.py │ ├── focal_loss.py │ ├── match_costs.py │ └── match_loss.py │ ├── plugins │ ├── __init__.py │ ├── msdeformattn_pixel_decoder.py │ └── pixel_decoder.py │ ├── segmentors │ ├── __init__.py │ ├── encoder_decoder_mask2former.py │ └── encoder_decoder_mask2former_aug.py │ └── utils │ ├── __init__.py │ ├── assigner.py │ ├── point_sample.py │ ├── positional_encoding.py │ └── transformer.py ├── ops_dcnv3 ├── functions │ ├── __init__.py │ └── dcnv3_func.py ├── make.sh ├── modules │ ├── __init__.py │ └── dcnv3.py ├── setup.py ├── src │ ├── cpu │ │ ├── dcnv3_cpu.cpp │ │ └── dcnv3_cpu.h │ ├── cuda │ │ ├── dcnv3_cuda.cu │ │ ├── dcnv3_cuda.h │ │ └── dcnv3_im2col_cuda.cuh │ ├── dcnv3.h │ └── vision.cpp └── test.py ├── slurm_test.sh ├── slurm_train.sh ├── test.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .idea/ 3 | .DS_Store 4 | __pycache__/ 5 | classification/convertor/ 6 | segmentation/convertor/ 7 | checkpoint_dir/ 8 | demo/ 9 | detection/work_dirs 10 | *.pth 11 | ops_dcnv3/build 12 | ops_dcnv3/dist 13 | ops_dcnv3/DCNv3.egg-info 14 | DCNv4_op/DCNv4.egg-info 15 | build/ 16 | dist/ 17 | ckpts/ 18 | ckpts 19 | data 20 | data/ 21 | detection/data 22 | detection/ckpts 23 | segmentation/data 24 | segmentation/ckpts 25 | work_dirs/ 26 | output -------------------------------------------------------------------------------- /DCNv4_op/DCNv4/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import DCNv4Function, FlashDeformAttnFunction 2 | from .modules import DCNv4, FlashDeformAttn -------------------------------------------------------------------------------- /DCNv4_op/DCNv4/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # from .ms_flash_deform_attn_func import FlashMSDeformAttnFunction 10 | from .flash_deform_attn_func import FlashDeformAttnFunction 11 | from .dcnv4_func import DCNv4Function -------------------------------------------------------------------------------- /DCNv4_op/DCNv4/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .flash_deform_attn import FlashDeformAttn 10 | from .dcnv4 import DCNv4 11 | -------------------------------------------------------------------------------- /DCNv4_op/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/* 2 | include src/cuda/* 3 | -------------------------------------------------------------------------------- /DCNv4_op/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/DCNv4/4b848f7dd7da74ff03f7d278f902c6fd05b391b5/DCNv4_op/__init__.py -------------------------------------------------------------------------------- /DCNv4_op/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | python setup.py build install 11 | -------------------------------------------------------------------------------- /DCNv4_op/scripts/find_best.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | class LineParser: 4 | def __init__(self) -> None: 5 | self.data = {} 6 | 7 | def parse(self, line): 8 | def startswith(line, lst): 9 | for ele in lst: 10 | if line.startswith(ele): 11 | return True 12 | return False 13 | 14 | if not startswith(line, ['1', '2', '3', '4', '5', '6', '7', '8', '9']): 15 | return 16 | 17 | eles = line.strip().split() 18 | key = eles[0] 19 | if key not in self.data: 20 | self.data[key] = [] 21 | 22 | self.data[key].append([eles[1], float(eles[2])]) 23 | 24 | def sort(self): 25 | for k, v in self.data.items(): 26 | nv = sorted(v, key=lambda x: x[1]) 27 | self.data[k] = nv 28 | 29 | def display_best(self): 30 | for k, v in self.data.items(): 31 | print(f'{k} \t {v[0][0]} \t {v[0][1]:.4f} \t {v[1][0]} \t {v[1][1]:.4f}') 32 | 33 | def display_best_python(self, output): 34 | res = {} 35 | def parse(spec): 36 | d_stride = int(spec.split('/')[0]) 37 | thread = int(spec.split('/')[1].split('(')[0]) 38 | m = int(spec.split('(')[1].split(')')[0]) 39 | return d_stride, thread, m 40 | 41 | for k, v in self.data.items(): 42 | res[k] = parse(v[0][0]) 43 | 44 | with open(output, "w") as f: 45 | json.dump(res, f, indent=4) 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument('--input', type=str) 50 | parser.add_argument('--output', type=str) 51 | args = parser.parse_args() 52 | 53 | with open(args.input) as f: 54 | lines = f.readlines() 55 | 56 | lineparser = LineParser() 57 | for line in lines: 58 | 
lineparser.parse(line) 59 | lineparser.sort() 60 | lineparser.display_best() 61 | lineparser.display_best_python(args.output) -------------------------------------------------------------------------------- /DCNv4_op/scripts/search_bwd.sh: -------------------------------------------------------------------------------- 1 | python search_dcnv4_bwd_engine.py > res_bwd.txt 2 | python find_best.py --input res_bwd.txt --output table_bwd.py -------------------------------------------------------------------------------- /DCNv4_op/scripts/search_dcnv4_bwd_engine.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def factors(N): 4 | res = [] 5 | for i in range(1, N+1): 6 | if N % i == 0: 7 | res.append(i) 8 | return res 9 | 10 | if __name__ == '__main__': 11 | BATCH=64 12 | for N, Hin, Win in [(BATCH, 56, 56), (BATCH, 28, 28), (BATCH, 14, 14), (BATCH, 7, 7), 13 | (1, 200, 320), (1, 100, 160), (1, 50, 80), (1, 25, 40), (1, 64, 64)]: 14 | for group_channel in [16, 32, 64]: 15 | for group in [4, 5, 6, 7, 8]: 16 | for d_stride in [1, 2, 4]: 17 | for m in factors(N*Hin*Win): 18 | if m > 64: 19 | break 20 | block_thread = group * (group_channel//d_stride) * m 21 | if block_thread > 1024: 22 | break 23 | cmd = f"python search_dcnv4_bwd.py --n {N} --h {Hin} --w {Win} --g {group} --c {group_channel} --dstride {d_stride} --blockthread {block_thread} --multiplier {m}" 24 | os.system(cmd) -------------------------------------------------------------------------------- /DCNv4_op/scripts/search_dcnv4_engine.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def factors(N): 4 | res = [] 5 | for i in range(1, N+1): 6 | if N % i == 0: 7 | res.append(i) 8 | return res 9 | 10 | if __name__ == '__main__': 11 | BATCH=64 12 | for group_channel in [16, 32, 64]: 13 | for group in [4, 5, 6, 7, 8]: 14 | for N, Hin, Win in [(BATCH, 56, 56), (BATCH, 28, 28), (BATCH, 14, 14), (BATCH, 7, 7), 15 | (1, 200, 320), (1, 100, 160), (1, 50, 80), (1, 25, 40), (1, 64, 64)]: 16 | for d_stride in [2, 4, 8, 16]: 17 | for m in factors(N*Hin*Win): 18 | if m > 64: 19 | break 20 | block_thread = group * (group_channel//d_stride) * m 21 | if block_thread > 1024: 22 | break 23 | cmd = f"python search_dcnv4.py --n {N} --h {Hin} --w {Win} --g {group} --c {group_channel} --dstride {d_stride} --blockthread {block_thread} --multiplier {m}" 24 | os.system(cmd) -------------------------------------------------------------------------------- /DCNv4_op/scripts/search_fwd.sh: -------------------------------------------------------------------------------- 1 | python search_dcnv4_engine.py > res.txt 2 | python find_best.py --input res.txt --output table.py -------------------------------------------------------------------------------- /DCNv4_op/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable Convolution v4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 
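# The C++/CUDA sources under src/ (plus src/cuda/*.cu when CUDA is available)
# are compiled into a single extension module exposed as DCNv4.ext (see
# get_extensions() below); building requires a CUDA-enabled PyTorch with
# CUDA_HOME set, otherwise NotImplementedError is raised.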
14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | "-O3", 46 | ] 47 | else: 48 | raise NotImplementedError('Cuda is not available') 49 | 50 | sources = [os.path.join(extensions_dir, s) for s in sources] 51 | include_dirs = [extensions_dir] 52 | ext_modules = [ 53 | extension( 54 | "DCNv4.ext", 55 | sources, 56 | include_dirs=include_dirs, 57 | define_macros=define_macros, 58 | extra_compile_args=extra_compile_args, 59 | ) 60 | ] 61 | return ext_modules 62 | 63 | setup( 64 | name="DCNv4", 65 | version="1.0.0.post2", 66 | author="Yuwen Xiong, Feng Wang", 67 | url="", 68 | description="PyTorch Wrapper for CUDA Functions of DCNv4", 69 | packages=['DCNv4', 'DCNv4/functions', 'DCNv4/modules'], 70 | ext_modules=get_extensions(), 71 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 72 | ) 73 | -------------------------------------------------------------------------------- /DCNv4_op/src/cuda/dcnv4_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | 15 | at::Tensor dcnv4_cuda_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &p_offset, 18 | const int kernel_h, const int kernel_w, const int stride_h, 19 | const int stride_w, const int pad_h, const int pad_w, const int dilation_h, 20 | const int dilation_w, const int group, const int group_channels, 21 | const float offset_scale, const int im2col_step, const int remove_center, 22 | const int d_stride, const int block_thread, const bool softmax); 23 | 24 | std::vector<at::Tensor> 25 | dcnv4_cuda_backward( 26 | const at::Tensor &value, 27 | const at::Tensor &p_offset, 28 | const int kernel_h, const int kernel_w, const int stride_h, 29 | const int stride_w, const int pad_h, const int pad_w, const int dilation_h, 30 | const int dilation_w, const int group, const int group_channels, 31 | const float offset_scale, const int im2col_step, const at::Tensor &grad_output, 32 | const int remove_center, const int d_stride, const int block_thread, 33 | const bool softmax); -------------------------------------------------------------------------------- /DCNv4_op/src/cuda/flash_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | 15 | at::Tensor flash_deform_attn_cuda_forward( 16 | const at::Tensor &value, const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, const at::Tensor &sampling_loc_attn, 18 | const int im2col_step, const int K, const int d_stride, const int block_thread); 19 | 20 | std::vector<at::Tensor> 21 | flash_deform_attn_cuda_backward( 22 | const at::Tensor &value, const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, const at::Tensor &sampling_loc_attn, 24 | const at::Tensor &grad_output, const int im2col_step, const int K, 25 | const int d_stride, const int block_thread); -------------------------------------------------------------------------------- /DCNv4_op/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #include "dcnv4.h" 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 15 | m.def("flash_deform_attn_forward", &flash_deform_attn_forward, 16 | "flash_deform_attn_forward"); 17 | m.def("flash_deform_attn_backward", &flash_deform_attn_backward, 18 | "flash_deform_attn_backward"); 19 | m.def("dcnv4_forward", &dcnv4_forward, "dcnv4_forward"); 20 | m.def("dcnv4_backward", &dcnv4_backward, "dcnv4_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 OpenGVLab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
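Before the classification code, a quick smoke test of the `DCNv4` operator built from `DCNv4_op` above can catch installation problems early. The following is a hedged sketch, not repository code: it assumes the extension was built via `DCNv4_op/make.sh` (or installed with `pip install DCNv4`), and the constructor arguments `channels`/`group`, the flattened channel-last `(N, H*W, C)` input layout, and the separate `shape` argument are assumptions to verify against `DCNv4_op/DCNv4/modules/dcnv4.py`.

```python
# Minimal smoke test for the DCNv4 module (sketch; argument names and input
# layout are assumptions -- verify against DCNv4/modules/dcnv4.py).
import torch
from DCNv4 import DCNv4

N, H, W, C = 2, 56, 56, 64
x = torch.randn(N, H * W, C, device="cuda")  # flattened channel-last tokens
op = DCNv4(channels=C, group=4).cuda()       # 4 groups of C // 4 = 16 channels each
y = op(x, shape=(H, W))                      # spatial extent passed explicitly
assert y.shape == x.shape                    # the operator preserves the token shape
```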
22 | -------------------------------------------------------------------------------- /classification/configs/flash_intern_image_b_1k_224.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | MODEL: 4 | TYPE: flash_intern_image 5 | DROP_PATH_RATE: 0.5 6 | FLASH_INTERN_IMAGE: 7 | CORE_OP: 'DCNv4' 8 | DEPTHS: [4, 4, 21, 4] 9 | GROUPS: [7, 14, 28, 56] 10 | CHANNELS: 112 11 | LAYER_SCALE: 1e-5 12 | OFFSET_SCALE: 0.5 13 | MLP_RATIO: 4.0 14 | POST_NORM: True 15 | DW_KERNEL_SIZE: 3 16 | TRAIN: 17 | EMA: 18 | ENABLE: True 19 | DECAY: 0.9999 20 | BASE_LR: 5e-4 -------------------------------------------------------------------------------- /classification/configs/flash_intern_image_l_22kto1k_384.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_SIZE: 384 3 | IMG_ON_MEMORY: False 4 | AUG: 5 | MIXUP: 0.0 6 | CUTMIX: 0.0 7 | REPROB: 0.0 8 | MODEL: 9 | TYPE: flash_intern_image 10 | DROP_PATH_RATE: 0.1 11 | LABEL_SMOOTHING: 0.3 12 | FLASH_INTERN_IMAGE: 13 | CORE_OP: 'DCNv4' 14 | DEPTHS: [5, 5, 22, 5] 15 | GROUPS: [10, 20, 40, 80] 16 | CHANNELS: 160 17 | LAYER_SCALE: 1e-5 18 | OFFSET_SCALE: 2.0 19 | MLP_RATIO: 4.0 20 | POST_NORM: True 21 | DW_KERNEL_SIZE: 3 22 | DCN_OUTPUT_BIAS: True 23 | MLP_FC2_BIAS: True 24 | TRAIN: 25 | EMA: 26 | ENABLE: true 27 | DECAY: 0.9999 28 | EPOCHS: 20 29 | WARMUP_EPOCHS: 2 30 | WEIGHT_DECAY: 0.05 31 | BASE_LR: 2e-05 # 512 32 | WARMUP_LR: .0 33 | MIN_LR: .0 34 | LR_LAYER_DECAY: true 35 | LR_LAYER_DECAY_RATIO: 0.9 36 | USE_CHECKPOINT: true 37 | OPTIMIZER: 38 | DCN_LR_MUL: 0.1 39 | AMP_OPT_LEVEL: O0 40 | EVAL_FREQ: 1 -------------------------------------------------------------------------------- /classification/configs/flash_intern_image_s_1k_224.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | MODEL: 4 | TYPE: flash_intern_image 5 | DROP_PATH_RATE: 0.4 6 | FLASH_INTERN_IMAGE: 7 | CORE_OP: 'DCNv4' 8 | DEPTHS: [4, 4, 21, 4] 9 | GROUPS: [5, 10, 20, 40] 10 | CHANNELS: 80 11 | LAYER_SCALE: 1e-5 12 | OFFSET_SCALE: 1.0 13 | MLP_RATIO: 4.0 14 | POST_NORM: True 15 | DW_KERNEL_SIZE: 3 16 | TRAIN: 17 | EMA: 18 | ENABLE: True 19 | DECAY: 0.9999 20 | BASE_LR: 5e-4 21 | -------------------------------------------------------------------------------- /classification/configs/flash_intern_image_t_1k_224.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | IMG_ON_MEMORY: False 3 | MODEL: 4 | TYPE: flash_intern_image 5 | DROP_PATH_RATE: 0.1 6 | FLASH_INTERN_IMAGE: 7 | CORE_OP: 'DCNv4' 8 | DEPTHS: [4, 4, 18, 4] 9 | GROUPS: [4, 8, 16, 32] 10 | CHANNELS: 64 11 | OFFSET_SCALE: 1.0 12 | MLP_RATIO: 4.0 13 | TRAIN: 14 | EMA: 15 | ENABLE: True 16 | DECAY: 0.9999 17 | BASE_LR: 5e-4 18 | -------------------------------------------------------------------------------- /classification/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .build import build_loader, build_loader2 -------------------------------------------------------------------------------- /classification/dataset/zipreader.py: 
-------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import os 8 | import zipfile 9 | import io 10 | import numpy as np 11 | from PIL import Image 12 | from PIL import ImageFile 13 | 14 | ImageFile.LOAD_TRUNCATED_IMAGES = True 15 | 16 | 17 | def is_zip_path(img_or_path): 18 | """Judge whether this is a zip-style path""" 19 | return '.zip@' in img_or_path 20 | 21 | 22 | class ZipReader(object): 23 | """A class to read zipped files""" 24 | zip_bank = dict() 25 | 26 | def __init__(self): 27 | super(ZipReader, self).__init__() 28 | 29 | @staticmethod 30 | def get_zipfile(path): 31 | zip_bank = ZipReader.zip_bank 32 | if path not in zip_bank: 33 | zfile = zipfile.ZipFile(path, 'r') 34 | zip_bank[path] = zfile 35 | return zip_bank[path] 36 | 37 | @staticmethod 38 | def split_zip_style_path(path): 39 | pos_at = path.find('@') 40 | assert pos_at != -1, "character '@' is not found in the given path '%s'" % path 41 | 42 | zip_path = path[0:pos_at] 43 | folder_path = path[pos_at + 1:] 44 | folder_path = str.strip(folder_path, '/') 45 | return zip_path, folder_path 46 | 47 | @staticmethod 48 | def list_folder(path): 49 | zip_path, folder_path = ZipReader.split_zip_style_path(path) 50 | 51 | zfile = ZipReader.get_zipfile(zip_path) 52 | folder_list = [] 53 | for file_folder_name in zfile.namelist(): 54 | file_folder_name = str.strip(file_folder_name, '/') 55 | if file_folder_name.startswith(folder_path) and \ 56 | len(os.path.splitext(file_folder_name)[-1]) == 0 and \ 57 | file_folder_name != folder_path: 58 | if len(folder_path) == 0: 59 | folder_list.append(file_folder_name) 60 | else: 61 | folder_list.append(file_folder_name[len(folder_path) + 1:]) 62 | 63 | return folder_list 64 | 65 | @staticmethod 66 | def list_files(path, extension=None): 67 | if extension is None: 68 | extension = ['.*'] 69 | zip_path, folder_path = ZipReader.split_zip_style_path(path) 70 | 71 | zfile = ZipReader.get_zipfile(zip_path) 72 | file_lists = [] 73 | for file_folder_name in zfile.namelist(): 74 | file_folder_name = str.strip(file_folder_name, '/') 75 | if file_folder_name.startswith(folder_path) and \ 76 | str.lower(os.path.splitext(file_folder_name)[-1]) in extension: 77 | if len(folder_path) == 0: 78 | file_lists.append(file_folder_name) 79 | else: 80 | file_lists.append(file_folder_name[len(folder_path) + 1:]) 81 | 82 | return file_lists 83 | 84 | @staticmethod 85 | def read(path): 86 | zip_path, path_img = ZipReader.split_zip_style_path(path) 87 | zfile = ZipReader.get_zipfile(zip_path) 88 | data = zfile.read(path_img) 89 | return data 90 | 91 | @staticmethod 92 | def imread(path): 93 | zip_path, path_img = ZipReader.split_zip_style_path(path) 94 | zfile = ZipReader.get_zipfile(zip_path) 95 | data = zfile.read(path_img) 96 | try: 97 | im = Image.open(io.BytesIO(data)) 98 | except Exception: 99 | print("ERROR IMG LOADED: ", path_img) 100 | random_img = np.random.rand(224, 224, 3) * 255 101 | im = Image.fromarray(np.uint8(random_img)) 102 | return im 103 | -------------------------------------------------------------------------------- /classification/eval.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch --nproc_per_node 1 --master_port 12345 main.py --eval \ 2 | --cfg 
configs/flash_intern_image_l_22kto1k_384.yaml --data-path /path/to/imagenet1k 3 | -------------------------------------------------------------------------------- /classification/export.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | 8 | import os 9 | import time 10 | import argparse 11 | 12 | import torch 13 | from tqdm import tqdm 14 | 15 | from config import get_config 16 | from models import build_model 17 | 18 | def get_args(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--model_name', type=str, 21 | default='internimage_t_1k_224') 22 | parser.add_argument('--ckpt_dir', type=str, 23 | default='/mnt/petrelfs/share_data/huangzhenhang/code/internimage/checkpoint_dir/new/cls') 24 | parser.add_argument('--onnx', default=False, action='store_true') 25 | parser.add_argument('--trt', default=False, action='store_true') 26 | 27 | args = parser.parse_args() 28 | args.cfg = os.path.join('./configs', f'{args.model_name}.yaml') 29 | args.ckpt = os.path.join(args.ckpt_dir, f'{args.model_name}.pth') 30 | args.size = int(args.model_name.split('.')[0].split('_')[-1]) 31 | 32 | cfg = get_config(args) 33 | return args, cfg 34 | 35 | def get_model(args, cfg): 36 | model = build_model(cfg) 37 | ckpt = torch.load(args.ckpt, map_location='cpu')['model'] 38 | 39 | model.load_state_dict(ckpt) 40 | return model 41 | 42 | def speed_test(model, input): 43 | # warmup 44 | for _ in tqdm(range(100)): 45 | _ = model(input) 46 | 47 | # speed test 48 | torch.cuda.synchronize() 49 | start = time.time() 50 | for _ in tqdm(range(100)): 51 | _ = model(input) 52 | end = time.time() 53 | th = 100 / (end - start) 54 | print(f"using time: {end - start}, throughput {th}") 55 | 56 | def torch2onnx(args, cfg): 57 | model = get_model(args, cfg).cuda() 58 | 59 | # speed_test(model) 60 | 61 | onnx_name = f'{args.model_name}.onnx' 62 | torch.onnx.export(model, 63 | torch.rand(1, 3, args.size, args.size).cuda(), 64 | onnx_name, 65 | input_names=['input'], 66 | output_names=['output']) 67 | 68 | return model 69 | 70 | def onnx2trt(args): 71 | from mmdeploy.backend.tensorrt import from_onnx 72 | 73 | onnx_name = f'{args.model_name}.onnx' 74 | from_onnx( 75 | onnx_name, 76 | args.model_name, 77 | dict( 78 | input=dict( 79 | min_shape=[1, 3, args.size, args.size], 80 | opt_shape=[1, 3, args.size, args.size], 81 | max_shape=[1, 3, args.size, args.size], 82 | ) 83 | ), 84 | max_workspace_size=2**30, 85 | ) 86 | 87 | def check(args, cfg): 88 | from mmdeploy.backend.tensorrt.wrapper import TRTWrapper 89 | 90 | model = get_model(args, cfg).cuda() 91 | model.eval() 92 | trt_model = TRTWrapper(f'{args.model_name}.engine', 93 | ['output']) 94 | 95 | x = torch.randn(1, 3, args.size, args.size).cuda() 96 | 97 | torch_out = model(x) 98 | trt_out = trt_model(dict(input=x))['output'] 99 | 100 | print('torch out shape:', torch_out.shape) 101 | print('trt out shape:', trt_out.shape) 102 | 103 | print('max delta:', (torch_out - trt_out).abs().max()) 104 | print('mean delta:', (torch_out - trt_out).abs().mean()) 105 | 106 | speed_test(model, x) 107 | speed_test(trt_model, dict(input=x)) 108 | 109 | def main(): 110 | args, cfg = get_args() 111 | 112 | if args.onnx or args.trt: 113 | torch2onnx(args, cfg) 114 | print('torch -> onnx: success') 115 | 116 | if 
args.trt: 117 | onnx2trt(args) 118 | print('onnx -> trt: success') 119 | check(args, cfg) 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /classification/logger.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import os 8 | import sys 9 | import logging 10 | import functools 11 | from termcolor import colored 12 | 13 | 14 | @functools.lru_cache() 15 | def create_logger(output_dir, dist_rank=0, name=''): 16 | # create logger 17 | logger = logging.getLogger(name) 18 | logger.setLevel(logging.DEBUG) 19 | logger.propagate = False 20 | 21 | # create formatter 22 | fmt = '[%(asctime)s %(name)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s' 23 | color_fmt = colored('[%(asctime)s %(name)s]', 'green') + \ 24 | colored('(%(filename)s %(lineno)d)', 'yellow') + \ 25 | ': %(levelname)s %(message)s' 26 | 27 | # create console handlers for master process 28 | if dist_rank == 0: 29 | console_handler = logging.StreamHandler(sys.stdout) 30 | console_handler.setLevel(logging.DEBUG) 31 | console_handler.setFormatter( 32 | logging.Formatter(fmt=color_fmt, datefmt='%Y-%m-%d %H:%M:%S')) 33 | logger.addHandler(console_handler) 34 | 35 | # create file handlers 36 | file_handler = logging.FileHandler(os.path.join( 37 | output_dir, f'log_rank{dist_rank}.txt'), 38 | mode='a') 39 | file_handler.setLevel(logging.DEBUG) 40 | file_handler.setFormatter( 41 | logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S')) 42 | logger.addHandler(file_handler) 43 | 44 | return logger 45 | -------------------------------------------------------------------------------- /classification/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | 8 | import torch 9 | from timm.scheduler.cosine_lr import CosineLRScheduler 10 | from timm.scheduler.step_lr import StepLRScheduler 11 | from timm.scheduler.scheduler import Scheduler 12 | 13 | 14 | def build_scheduler(config, optimizer, n_iter_per_epoch): 15 | num_steps = int(config.TRAIN.EPOCHS * n_iter_per_epoch) 16 | warmup_steps = int(config.TRAIN.WARMUP_EPOCHS * n_iter_per_epoch) 17 | decay_steps = int(config.TRAIN.LR_SCHEDULER.DECAY_EPOCHS * 18 | n_iter_per_epoch) 19 | 20 | lr_scheduler = None 21 | if config.TRAIN.LR_SCHEDULER.NAME == 'cosine': 22 | lr_scheduler = CosineLRScheduler( 23 | optimizer, 24 | t_initial=num_steps, 25 | # t_mul=1., 26 | lr_min=config.TRAIN.MIN_LR, 27 | warmup_lr_init=config.TRAIN.WARMUP_LR, 28 | warmup_t=warmup_steps, 29 | cycle_limit=1, 30 | t_in_epochs=False, 31 | ) 32 | elif config.TRAIN.LR_SCHEDULER.NAME == 'linear': 33 | lr_scheduler = LinearLRScheduler( 34 | optimizer, 35 | t_initial=num_steps, 36 | lr_min_rate=0.01, 37 | warmup_lr_init=config.TRAIN.WARMUP_LR, 38 | warmup_t=warmup_steps, 39 | t_in_epochs=False, 40 | ) 41 | elif config.TRAIN.LR_SCHEDULER.NAME == 'step': 42 | lr_scheduler = StepLRScheduler( 43 | optimizer, 44 | decay_t=decay_steps, 45 | decay_rate=config.TRAIN.LR_SCHEDULER.DECAY_RATE, 46 | 
warmup_lr_init=config.TRAIN.WARMUP_LR, 47 | warmup_t=warmup_steps, 48 | t_in_epochs=False, 49 | ) 50 | 51 | return lr_scheduler 52 | 53 | 54 | class LinearLRScheduler(Scheduler): 55 | 56 | def __init__( 57 | self, 58 | optimizer: torch.optim.Optimizer, 59 | t_initial: int, 60 | lr_min_rate: float, 61 | warmup_t=0, 62 | warmup_lr_init=0., 63 | t_in_epochs=True, 64 | noise_range_t=None, 65 | noise_pct=0.67, 66 | noise_std=1.0, 67 | noise_seed=42, 68 | initialize=True, 69 | ) -> None: 70 | super().__init__(optimizer, 71 | param_group_field="lr", 72 | noise_range_t=noise_range_t, 73 | noise_pct=noise_pct, 74 | noise_std=noise_std, 75 | noise_seed=noise_seed, 76 | initialize=initialize) 77 | 78 | self.t_initial = t_initial 79 | self.lr_min_rate = lr_min_rate 80 | self.warmup_t = warmup_t 81 | self.warmup_lr_init = warmup_lr_init 82 | self.t_in_epochs = t_in_epochs 83 | if self.warmup_t: 84 | self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t 85 | for v in self.base_values] 86 | super().update_groups(self.warmup_lr_init) 87 | else: 88 | self.warmup_steps = [1 for _ in self.base_values] 89 | 90 | def _get_lr(self, t): 91 | if t < self.warmup_t: 92 | lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] 93 | else: 94 | t = t - self.warmup_t 95 | total_t = self.t_initial - self.warmup_t 96 | lrs = [ 97 | v - ((v - v * self.lr_min_rate) * (t / total_t)) 98 | for v in self.base_values 99 | ] 100 | return lrs 101 | 102 | def get_epoch_values(self, epoch: int): 103 | if self.t_in_epochs: 104 | return self._get_lr(epoch) 105 | else: 106 | return None 107 | 108 | def get_update_values(self, num_updates: int): 109 | if not self.t_in_epochs: 110 | return self._get_lr(num_updates) 111 | else: 112 | return None 113 | -------------------------------------------------------------------------------- /classification/meta_data/meta: -------------------------------------------------------------------------------- 1 | /mnt/petrelfs/share/images/meta/ -------------------------------------------------------------------------------- /classification/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .build import build_model -------------------------------------------------------------------------------- /classification/models/build.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | 8 | from .intern_image import InternImage 9 | from .flash_intern_image import FlashInternImage 10 | 11 | def build_model(config): 12 | model_type = config.MODEL.TYPE 13 | if model_type == 'intern_image': 14 | model = InternImage( 15 | core_op=config.MODEL.INTERN_IMAGE.CORE_OP, 16 | num_classes=config.MODEL.NUM_CLASSES, 17 | channels=config.MODEL.INTERN_IMAGE.CHANNELS, 18 | depths=config.MODEL.INTERN_IMAGE.DEPTHS, 19 | groups=config.MODEL.INTERN_IMAGE.GROUPS, 20 | layer_scale=config.MODEL.INTERN_IMAGE.LAYER_SCALE, 21 | offset_scale=config.MODEL.INTERN_IMAGE.OFFSET_SCALE, 22 | post_norm=config.MODEL.INTERN_IMAGE.POST_NORM, 23 | 
mlp_ratio=config.MODEL.INTERN_IMAGE.MLP_RATIO, 24 | with_cp=config.TRAIN.USE_CHECKPOINT, 25 | drop_path_rate=config.MODEL.DROP_PATH_RATE, 26 | res_post_norm=config.MODEL.INTERN_IMAGE.RES_POST_NORM, # for InternImage-H/G 27 | dw_kernel_size=config.MODEL.INTERN_IMAGE.DW_KERNEL_SIZE, # for InternImage-H/G 28 | use_clip_projector=config.MODEL.INTERN_IMAGE.USE_CLIP_PROJECTOR, # for InternImage-H/G 29 | level2_post_norm=config.MODEL.INTERN_IMAGE.LEVEL2_POST_NORM, # for InternImage-H/G 30 | level2_post_norm_block_ids=config.MODEL.INTERN_IMAGE.LEVEL2_POST_NORM_BLOCK_IDS, # for InternImage-H/G 31 | center_feature_scale=config.MODEL.INTERN_IMAGE.CENTER_FEATURE_SCALE # for InternImage-H/G 32 | ) 33 | elif model_type == 'flash_intern_image': 34 | model = FlashInternImage( 35 | core_op=config.MODEL.FLASH_INTERN_IMAGE.CORE_OP, 36 | num_classes=config.MODEL.NUM_CLASSES, 37 | channels=config.MODEL.FLASH_INTERN_IMAGE.CHANNELS, 38 | depths=config.MODEL.FLASH_INTERN_IMAGE.DEPTHS, 39 | groups=config.MODEL.FLASH_INTERN_IMAGE.GROUPS, 40 | layer_scale=config.MODEL.FLASH_INTERN_IMAGE.LAYER_SCALE, 41 | offset_scale=config.MODEL.FLASH_INTERN_IMAGE.OFFSET_SCALE, 42 | post_norm=config.MODEL.FLASH_INTERN_IMAGE.POST_NORM, 43 | mlp_ratio=config.MODEL.FLASH_INTERN_IMAGE.MLP_RATIO, 44 | with_cp=config.TRAIN.USE_CHECKPOINT, 45 | drop_path_rate=config.MODEL.DROP_PATH_RATE, 46 | mlp_fc2_bias=config.MODEL.FLASH_INTERN_IMAGE.MLP_FC2_BIAS, 47 | dcn_output_bias=config.MODEL.FLASH_INTERN_IMAGE.DCN_OUTPUT_BIAS, 48 | res_post_norm=config.MODEL.FLASH_INTERN_IMAGE.RES_POST_NORM, # for InternImage-H/G 49 | dw_kernel_size=config.MODEL.FLASH_INTERN_IMAGE.DW_KERNEL_SIZE, 50 | use_clip_projector=config.MODEL.FLASH_INTERN_IMAGE.USE_CLIP_PROJECTOR, # for InternImage-H/G 51 | level2_post_norm=config.MODEL.FLASH_INTERN_IMAGE.LEVEL2_POST_NORM, # for InternImage-H/G 52 | level2_post_norm_block_ids=config.MODEL.FLASH_INTERN_IMAGE.LEVEL2_POST_NORM_BLOCK_IDS, # for InternImage-H/G 53 | center_feature_scale=config.MODEL.FLASH_INTERN_IMAGE.CENTER_FEATURE_SCALE # for InternImage-H/G 54 | ) 55 | else: 56 | raise NotImplementedError(f"Unknown model: {model_type}") 57 | 58 | return model 59 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch 8 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # -------------------------------------------------------- 3 | # DCNv4 4 | # Copyright (c) 2024 OpenGVLab 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | python setup.py build install 9 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for 
details] 5 | # -------------------------------------------------------- 6 | 7 | from .dcnv3 import DCNv3, DCNv3_pytorch -------------------------------------------------------------------------------- /classification/ops_dcnv3/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import os 8 | import glob 9 | 10 | import torch 11 | 12 | from torch.utils.cpp_extension import CUDA_HOME 13 | from torch.utils.cpp_extension import CppExtension 14 | from torch.utils.cpp_extension import CUDAExtension 15 | 16 | from setuptools import find_packages 17 | from setuptools import setup 18 | 19 | requirements = ["torch", "torchvision"] 20 | 21 | 22 | def get_extensions(): 23 | this_dir = os.path.dirname(os.path.abspath(__file__)) 24 | extensions_dir = os.path.join(this_dir, "src") 25 | 26 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 27 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 28 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 29 | 30 | sources = main_file + source_cpu 31 | extension = CppExtension 32 | extra_compile_args = {"cxx": []} 33 | define_macros = [] 34 | 35 | if torch.cuda.is_available() and CUDA_HOME is not None: 36 | extension = CUDAExtension 37 | sources += source_cuda 38 | define_macros += [("WITH_CUDA", None)] 39 | extra_compile_args["nvcc"] = [ 40 | # "-DCUDA_HAS_FP16=1", 41 | # "-D__CUDA_NO_HALF_OPERATORS__", 42 | # "-D__CUDA_NO_HALF_CONVERSIONS__", 43 | # "-D__CUDA_NO_HALF2_OPERATORS__", 44 | ] 45 | else: 46 | raise NotImplementedError('Cuda is not available') 47 | 48 | sources = [os.path.join(extensions_dir, s) for s in sources] 49 | include_dirs = [extensions_dir] 50 | ext_modules = [ 51 | extension( 52 | "DCNv3", 53 | sources, 54 | include_dirs=include_dirs, 55 | define_macros=define_macros, 56 | extra_compile_args=extra_compile_args, 57 | ) 58 | ] 59 | return ext_modules 60 | 61 | 62 | setup( 63 | name="DCNv3", 64 | version="1.1", 65 | author="InternImage", 66 | url="https://github.com/OpenGVLab/InternImage", 67 | description= 68 | "PyTorch Wrapper for CUDA Functions of DCNv3", 69 | packages=find_packages(exclude=( 70 | "configs", 71 | "tests", 72 | )), 73 | ext_modules=get_extensions(), 74 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 75 | ) 76 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/src/cpu/dcnv3_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #include <vector> 13 | 14 | #include <ATen/ATen.h> 15 | #include <ATen/cuda/CUDAContext.h> 16 | 17 | at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset, 18 | const at::Tensor &mask, const int kernel_h, 19 | const int kernel_w, const int stride_h, 20 | const int stride_w, const int pad_h, 21 | const int pad_w, const int dilation_h, 22 | const int dilation_w, const int group, 23 | const int group_channels, const float offset_scale, 24 | const int im2col_step) { 25 | AT_ERROR("Not implemented on the CPU"); 26 | } 27 | 28 | std::vector<at::Tensor> 29 | dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset, 30 | const at::Tensor &mask, const int kernel_h, 31 | const int kernel_w, const int stride_h, const int stride_w, 32 | const int pad_h, const int pad_w, const int dilation_h, 33 | const int dilation_w, const int group, 34 | const int group_channels, const float offset_scale, 35 | const at::Tensor &grad_output, const int im2col_step) { 36 | AT_ERROR("Not implemented on the CPU"); 37 | } 38 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/src/cpu/dcnv3_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | 15 | at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset, 16 | const at::Tensor &mask, const int kernel_h, 17 | const int kernel_w, const int stride_h, 18 | const int stride_w, const int pad_h, 19 | const int pad_w, const int dilation_h, 20 | const int dilation_w, const int group, 21 | const int group_channels, const float offset_scale, 22 | const int im2col_step); 23 | 24 | std::vector<at::Tensor> 25 | dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset, 26 | const at::Tensor &mask, const int kernel_h, 27 | const int kernel_w, const int stride_h, const int stride_w, 28 | const int pad_h, const int pad_w, const int dilation_h, 29 | const int dilation_w, const int group, 30 | const int group_channels, const float offset_scale, 31 | const at::Tensor &grad_output, const int im2col_step); 32 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/src/cuda/dcnv3_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | 15 | at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset, 16 | const at::Tensor &mask, const int kernel_h, 17 | const int kernel_w, const int stride_h, 18 | const int stride_w, const int pad_h, 19 | const int pad_w, const int dilation_h, 20 | const int dilation_w, const int group, 21 | const int group_channels, 22 | const float offset_scale, const int im2col_step, const int remove_center); 23 | 24 | std::vector<at::Tensor> 25 | dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, 26 | const at::Tensor &mask, const int kernel_h, 27 | const int kernel_w, const int stride_h, const int stride_w, 28 | const int pad_h, const int pad_w, const int dilation_h, 29 | const int dilation_w, const int group, 30 | const int group_channels, const float offset_scale, 31 | const at::Tensor &grad_output, const int im2col_step, const int remove_center); 32 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/src/dcnv3.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | 14 | #include "cpu/dcnv3_cpu.h" 15 | 16 | #ifdef WITH_CUDA 17 | #include "cuda/dcnv3_cuda.h" 18 | #endif 19 | 20 | at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset, 21 | const at::Tensor &mask, const int kernel_h, 22 | const int kernel_w, const int stride_h, 23 | const int stride_w, const int pad_h, const int pad_w, 24 | const int dilation_h, const int dilation_w, 25 | const int group, const int group_channels, 26 | const float offset_scale, const int im2col_step, const int remove_center) { 27 | if (input.type().is_cuda()) { 28 | #ifdef WITH_CUDA 29 | return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w, 30 | stride_h, stride_w, pad_h, pad_w, dilation_h, 31 | dilation_w, group, group_channels, 32 | offset_scale, im2col_step, remove_center); 33 | #else 34 | AT_ERROR("Not compiled with GPU support"); 35 | #endif 36 | } 37 | AT_ERROR("Not implemented on the CPU"); 38 | } 39 | 40 | std::vector<at::Tensor> 41 | dcnv3_backward(const at::Tensor &input, const at::Tensor &offset, 42 | const at::Tensor &mask, const int kernel_h, const int kernel_w, 43 | const int stride_h, const int stride_w, const int pad_h, 44 | const int pad_w, const int dilation_h, const int dilation_w, 45 | const int group, const int group_channels, 46 | const float offset_scale, const at::Tensor 
&grad_output, 47 | const int im2col_step, const int remove_center) { 48 | if (input.type().is_cuda()) { 49 | #ifdef WITH_CUDA 50 | return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w, 51 | stride_h, stride_w, pad_h, pad_w, dilation_h, 52 | dilation_w, group, group_channels, 53 | offset_scale, grad_output, im2col_step, remove_center); 54 | #else 55 | AT_ERROR("Not compiled with GPU support"); 56 | #endif 57 | } 58 | AT_ERROR("Not implemented on the CPU"); 59 | } 60 | -------------------------------------------------------------------------------- /classification/ops_dcnv3/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #include "dcnv3.h" 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 15 | m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward"); 16 | m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward"); 17 | } 18 | -------------------------------------------------------------------------------- /classification/train_in1k.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | WORK_DIR=$4 9 | GPUS=${GPUS:-1} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-1} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 12 | SRUN_ARGS=${SRUN_ARGS:-""} 13 | PY_ARGS=${@:5} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | --quotatype=reserved \ 24 | ${SRUN_ARGS} \ 25 | python -u main.py \ 26 | --cfg ${CONFIG} \ 27 | --accumulation-steps 1 \ 28 | --local-rank 0 \ 29 | --batch-size 128 \ 30 | --data-path /mnt/petrelfs/share/images \ 31 | --output work_dirs ${@:4} --launcher="slurm" 32 | -------------------------------------------------------------------------------- /classification/train_in1k_deepspeed.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | GPUS=${GPUS:-8} 9 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 10 | CPUS_PER_TASK=${CPUS_PER_TASK:-12} 11 | SRUN_ARGS=${SRUN_ARGS:-""} 12 | 13 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 14 | srun -p ${PARTITION} \ 15 | --job-name=${JOB_NAME} \ 16 | --gres=gpu:${GPUS_PER_NODE} \ 17 | --ntasks=${GPUS} \ 18 | --ntasks-per-node=${GPUS_PER_NODE} \ 19 | --cpus-per-task=${CPUS_PER_TASK} \ 20 | --kill-on-bad-exit=1 \ 21 | --quotatype=spot \ 22 | ${SRUN_ARGS} \ 23 | python -u main_deepspeed.py \ 24 | --cfg ${CONFIG} \ 25 | --local-rank 0 \ 26 | --data-path /mnt/lustre/share/images \ 27 | --output work_dirs_deepspeed ${@:4} 28 | -------------------------------------------------------------------------------- /detection/README.md: 
-------------------------------------------------------------------------------- 1 | # FlashInternImage for Object Detection 2 | 3 | This folder contains the implementation of FlashInternImage for object detection. 4 | 5 | Our detection code is developed on top of [MMDetection v2.28.1](https://github.com/open-mmlab/mmdetection/tree/v2.28.1). 6 | 7 | 8 | ## Usage 9 | 10 | ### Install 11 | 12 | - Clone this repo: 13 | 14 | ```bash 15 | git clone https://github.com/OpenGVLab/DCNv4.git 16 | cd DCNv4 17 | ``` 18 | 19 | - Create a conda virtual environment and activate it: 20 | 21 | ```bash 22 | conda create -n dcnv4 python=3.7 -y 23 | conda activate dcnv4 24 | ``` 25 | 26 | - Install `CUDA>=10.2` with `cudnn>=7` following 27 | the [official installation instructions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) 28 | - Install `PyTorch>=1.10.0` and `torchvision>=0.9.0` with `CUDA>=10.2`: 29 | 30 | For example, to install torch==1.11 with CUDA==11.3: 31 | ```bash 32 | pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html 33 | ``` 34 | 35 | - Install `timm==0.6.11` and `mmcv-full==1.5.0`: 36 | 37 | ```bash 38 | pip install -U openmim 39 | mim install mmcv-full==1.5.0 40 | pip install timm==0.6.11 mmdet==2.28.1 41 | ``` 42 | 43 | - Install other requirements: 44 | 45 | ```bash 46 | pip install opencv-python termcolor yacs pyyaml scipy 47 | ``` 48 | 49 | - Install DCNv4 50 | ```bash 51 | pip install DCNv4 52 | ``` 53 | 54 | 55 | ### Data Preparation 56 | 57 | Prepare COCO according to the guidelines in [MMDetection v2.28.1](https://github.com/open-mmlab/mmdetection/resolve/master/docs/en/1_exist_data_model.md). 58 | 59 | 60 | ### Evaluation 61 | 62 | To evaluate our `FlashInternImage` on COCO val, run: 63 | 64 | ```bash 65 | sh dist_test.sh <config-file> <checkpoint> <gpu-num> --eval bbox segm 66 | ``` 67 | 68 | For example, to evaluate the `FlashInternImage-T` with a single GPU: 69 | 70 | ```bash 71 | python test.py configs/coco/mask_rcnn_flash_intern_image_t_fpn_1x_coco.py checkpoint_dir/det/mask_rcnn_flash_internimage_t_fpn_1x_coco.pth --eval bbox segm 72 | ``` 73 | 74 | For example, to evaluate the `FlashInternImage-B` on a single node with 8 GPUs: 75 | 76 | ```bash 77 | sh dist_test.sh configs/coco/mask_rcnn_flash_intern_image_b_fpn_1x_coco.py checkpoint_dir/det/mask_rcnn_flash_internimage_b_fpn_1x_coco.pth 8 --eval bbox segm 78 | ``` 79 | 80 | ### Training on COCO 81 | 82 | To train a `FlashInternImage` on COCO, run: 83 | 84 | ```bash 85 | sh dist_train.sh <config-file> <gpu-num> 86 | ``` 87 | 88 | For example, to train `FlashInternImage-T` with 8 GPUs on 1 node, run: 89 | 90 | ```bash 91 | sh dist_train.sh configs/coco/mask_rcnn_flash_intern_image_t_fpn_1x_coco.py 8 92 | ``` 93 | 94 | -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/coco_detection.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CocoDataset' 3 | data_root = 'data/coco/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True), 9 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img',
'gt_bboxes', 'gt_labels']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(1333, 800), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict( 35 | type=dataset_type, 36 | ann_file=data_root + 'annotations/instances_train2017.json', 37 | img_prefix=data_root + 'train2017/', 38 | pipeline=train_pipeline), 39 | val=dict( 40 | type=dataset_type, 41 | ann_file=data_root + 'annotations/instances_val2017.json', 42 | img_prefix=data_root + 'val2017/', 43 | pipeline=test_pipeline), 44 | test=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 'annotations/instances_val2017.json', 47 | img_prefix=data_root + 'val2017/', 48 | pipeline=test_pipeline)) 49 | evaluation = dict(interval=1, metric='bbox', classwise=True) -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/coco_instance.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CocoDataset' 3 | data_root = 'data/coco/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True), 9 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 10 | dict(type='RandomFlip', flip_ratio=0.5), 11 | dict(type='Normalize', **img_norm_cfg), 12 | dict(type='Pad', size_divisor=32), 13 | dict(type='DefaultFormatBundle'), 14 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), 15 | ] 16 | test_pipeline = [ 17 | dict(type='LoadImageFromFile'), 18 | dict( 19 | type='MultiScaleFlipAug', 20 | img_scale=(1333, 800), 21 | flip=False, 22 | transforms=[ 23 | dict(type='Resize', keep_ratio=True), 24 | dict(type='RandomFlip'), 25 | dict(type='Normalize', **img_norm_cfg), 26 | dict(type='Pad', size_divisor=32), 27 | dict(type='ImageToTensor', keys=['img']), 28 | dict(type='Collect', keys=['img']), 29 | ]) 30 | ] 31 | data = dict( 32 | samples_per_gpu=2, 33 | workers_per_gpu=2, 34 | train=dict( 35 | type=dataset_type, 36 | ann_file=data_root + 'annotations/instances_train2017.json', 37 | img_prefix=data_root + 'train2017/', 38 | pipeline=train_pipeline), 39 | val=dict( 40 | type=dataset_type, 41 | ann_file=data_root + 'annotations/instances_val2017.json', 42 | img_prefix=data_root + 'val2017/', 43 | pipeline=test_pipeline), 44 | test=dict( 45 | type=dataset_type, 46 | ann_file=data_root + 'annotations/instances_val2017.json', 47 | img_prefix=data_root + 'val2017/', 48 | pipeline=test_pipeline)) 49 | evaluation = dict(metric=['bbox', 'segm'], classwise=True) 50 | -------------------------------------------------------------------------------- /detection/configs/_base_/datasets/crowd_human.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CrowdHumanDataset' 3 | data_root = 'data/CrowdHuman/' 4 | classes = ('person',) 5 | img_norm_cfg = dict( 6 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 
9 | dict(type='LoadAnnotations', with_bbox=True), 10 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size_divisor=32), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict( 20 | type='MultiScaleFlipAug', 21 | img_scale=(1333, 800), 22 | flip=False, 23 | transforms=[ 24 | dict(type='Resize', keep_ratio=True), 25 | dict(type='RandomFlip'), 26 | dict(type='Normalize', **img_norm_cfg), 27 | dict(type='Pad', size_divisor=32), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | samples_per_gpu=2, 34 | workers_per_gpu=2, 35 | train=dict( 36 | type=dataset_type, 37 | classes=classes, 38 | filter_empty_gt=True, 39 | ann_file=data_root + 'annotations/annotation_train.json', 40 | img_prefix=data_root + 'Images', 41 | pipeline=train_pipeline), 42 | val=dict( 43 | type=dataset_type, 44 | classes=classes, 45 | ann_file=data_root + 'annotations/annotation_val.json', 46 | img_prefix=data_root + 'Images', 47 | pipeline=test_pipeline), 48 | test=dict( 49 | type=dataset_type, 50 | classes=classes, 51 | ann_file=data_root + 'annotations/annotation_val.json', 52 | img_prefix=data_root + 'Images', 53 | pipeline=test_pipeline)) 54 | evaluation = dict(interval=100, metric='bbox') 55 | -------------------------------------------------------------------------------- /detection/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | checkpoint_config = dict(interval=1) 2 | # yapf:disable 3 | log_config = dict( 4 | interval=50, 5 | hooks=[ 6 | dict(type='TextLoggerHook'), 7 | # dict(type='TensorboardLoggerHook') 8 | ]) 9 | # yapf:enable 10 | custom_hooks = [dict(type='NumClassCheckHook')] 11 | 12 | dist_params = dict(backend='nccl') 13 | log_level = 'INFO' 14 | load_from = None 15 | resume_from = None 16 | workflow = [('train', 1)] 17 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/fast_rcnn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='FastRCNN', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=4, 8 | out_indices=(0, 1, 2, 3), 9 | frozen_stages=1, 10 | norm_cfg=dict(type='BN', requires_grad=True), 11 | norm_eval=True, 12 | style='pytorch', 13 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | roi_head=dict( 20 | type='StandardRoIHead', 21 | bbox_roi_extractor=dict( 22 | type='SingleRoIExtractor', 23 | roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), 24 | out_channels=256, 25 | featmap_strides=[4, 8, 16, 32]), 26 | bbox_head=dict( 27 | type='Shared2FCBBoxHead', 28 | in_channels=256, 29 | fc_out_channels=1024, 30 | roi_feat_size=7, 31 | num_classes=80, 32 | bbox_coder=dict( 33 | type='DeltaXYWHBBoxCoder', 34 | target_means=[0., 0., 0., 0.], 35 | target_stds=[0.1, 0.1, 0.2, 0.2]), 36 | reg_class_agnostic=False, 37 | loss_cls=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 39 | loss_bbox=dict(type='L1Loss', loss_weight=1.0))), 40 | # model training and testing settings 41 
| train_cfg=dict( 42 | rcnn=dict( 43 | assigner=dict( 44 | type='MaxIoUAssigner', 45 | pos_iou_thr=0.5, 46 | neg_iou_thr=0.5, 47 | min_pos_iou=0.5, 48 | match_low_quality=False, 49 | ignore_iof_thr=-1), 50 | sampler=dict( 51 | type='RandomSampler', 52 | num=512, 53 | pos_fraction=0.25, 54 | neg_pos_ub=-1, 55 | add_gt_as_proposals=True), 56 | pos_weight=-1, 57 | debug=False)), 58 | test_cfg=dict( 59 | rcnn=dict( 60 | score_thr=0.05, 61 | nms=dict(type='nms', iou_threshold=0.5), 62 | max_per_img=100))) 63 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/faster_rcnn_r50_caffe_dc5.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='BN', requires_grad=False) 3 | model = dict( 4 | type='FasterRCNN', 5 | backbone=dict( 6 | type='ResNet', 7 | depth=50, 8 | num_stages=4, 9 | strides=(1, 2, 2, 1), 10 | dilations=(1, 1, 1, 2), 11 | out_indices=(3, ), 12 | frozen_stages=1, 13 | norm_cfg=norm_cfg, 14 | norm_eval=True, 15 | style='caffe', 16 | init_cfg=dict( 17 | type='Pretrained', 18 | checkpoint='open-mmlab://detectron2/resnet50_caffe')), 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=2048, 22 | feat_channels=2048, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[2, 4, 8, 16, 32], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[16]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | roi_head=dict( 36 | type='StandardRoIHead', 37 | bbox_roi_extractor=dict( 38 | type='SingleRoIExtractor', 39 | roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), 40 | out_channels=2048, 41 | featmap_strides=[16]), 42 | bbox_head=dict( 43 | type='Shared2FCBBoxHead', 44 | in_channels=2048, 45 | fc_out_channels=1024, 46 | roi_feat_size=7, 47 | num_classes=80, 48 | bbox_coder=dict( 49 | type='DeltaXYWHBBoxCoder', 50 | target_means=[0., 0., 0., 0.], 51 | target_stds=[0.1, 0.1, 0.2, 0.2]), 52 | reg_class_agnostic=False, 53 | loss_cls=dict( 54 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), 55 | loss_bbox=dict(type='L1Loss', loss_weight=1.0))), 56 | # model training and testing settings 57 | train_cfg=dict( 58 | rpn=dict( 59 | assigner=dict( 60 | type='MaxIoUAssigner', 61 | pos_iou_thr=0.7, 62 | neg_iou_thr=0.3, 63 | min_pos_iou=0.3, 64 | match_low_quality=True, 65 | ignore_iof_thr=-1), 66 | sampler=dict( 67 | type='RandomSampler', 68 | num=256, 69 | pos_fraction=0.5, 70 | neg_pos_ub=-1, 71 | add_gt_as_proposals=False), 72 | allowed_border=0, 73 | pos_weight=-1, 74 | debug=False), 75 | rpn_proposal=dict( 76 | nms_pre=12000, 77 | max_per_img=2000, 78 | nms=dict(type='nms', iou_threshold=0.7), 79 | min_bbox_size=0), 80 | rcnn=dict( 81 | assigner=dict( 82 | type='MaxIoUAssigner', 83 | pos_iou_thr=0.5, 84 | neg_iou_thr=0.5, 85 | min_pos_iou=0.5, 86 | match_low_quality=False, 87 | ignore_iof_thr=-1), 88 | sampler=dict( 89 | type='RandomSampler', 90 | num=512, 91 | pos_fraction=0.25, 92 | neg_pos_ub=-1, 93 | add_gt_as_proposals=True), 94 | pos_weight=-1, 95 | debug=False)), 96 | test_cfg=dict( 97 | rpn=dict( 98 | nms=dict(type='nms', iou_threshold=0.7), 99 | nms_pre=6000, 100 | max_per_img=1000, 101 | min_bbox_size=0), 102 | rcnn=dict( 103 | score_thr=0.05, 104 | nms=dict(type='nms', iou_threshold=0.5), 105 | 
max_per_img=100))) 106 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/retinanet_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RetinaNet', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=4, 8 | out_indices=(0, 1, 2, 3), 9 | frozen_stages=1, 10 | norm_cfg=dict(type='BN', requires_grad=True), 11 | norm_eval=True, 12 | style='pytorch', 13 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | start_level=1, 19 | add_extra_convs='on_input', 20 | num_outs=5), 21 | bbox_head=dict( 22 | type='RetinaHead', 23 | num_classes=80, 24 | in_channels=256, 25 | stacked_convs=4, 26 | feat_channels=256, 27 | anchor_generator=dict( 28 | type='AnchorGenerator', 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | ratios=[0.5, 1.0, 2.0], 32 | strides=[8, 16, 32, 64, 128]), 33 | bbox_coder=dict( 34 | type='DeltaXYWHBBoxCoder', 35 | target_means=[.0, .0, .0, .0], 36 | target_stds=[1.0, 1.0, 1.0, 1.0]), 37 | loss_cls=dict( 38 | type='FocalLoss', 39 | use_sigmoid=True, 40 | gamma=2.0, 41 | alpha=0.25, 42 | loss_weight=1.0), 43 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 44 | # model training and testing settings 45 | train_cfg=dict( 46 | assigner=dict( 47 | type='MaxIoUAssigner', 48 | pos_iou_thr=0.5, 49 | neg_iou_thr=0.4, 50 | min_pos_iou=0, 51 | ignore_iof_thr=-1), 52 | allowed_border=-1, 53 | pos_weight=-1, 54 | debug=False), 55 | test_cfg=dict( 56 | nms_pre=1000, 57 | min_bbox_size=0, 58 | score_thr=0.05, 59 | nms=dict(type='nms', iou_threshold=0.5), 60 | max_per_img=100)) 61 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/rpn_r50_caffe_c4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=3, 8 | strides=(1, 2, 2), 9 | dilations=(1, 1, 1), 10 | out_indices=(2, ), 11 | frozen_stages=1, 12 | norm_cfg=dict(type='BN', requires_grad=False), 13 | norm_eval=True, 14 | style='caffe', 15 | init_cfg=dict( 16 | type='Pretrained', 17 | checkpoint='open-mmlab://detectron2/resnet50_caffe')), 18 | neck=None, 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=1024, 22 | feat_channels=1024, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[2, 4, 8, 16, 32], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[16]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | # model training and testing settings 36 | train_cfg=dict( 37 | rpn=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.7, 41 | neg_iou_thr=0.3, 42 | min_pos_iou=0.3, 43 | ignore_iof_thr=-1), 44 | sampler=dict( 45 | type='RandomSampler', 46 | num=256, 47 | pos_fraction=0.5, 48 | neg_pos_ub=-1, 49 | add_gt_as_proposals=False), 50 | allowed_border=0, 51 | pos_weight=-1, 52 | debug=False)), 53 | test_cfg=dict( 54 | rpn=dict( 55 | nms_pre=12000, 56 | max_per_img=2000, 57 | nms=dict(type='nms', iou_threshold=0.7), 58 | min_bbox_size=0))) 59 | 
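The RPN heads above expand their `scales`/`ratios`/`strides` settings into a fixed set of base anchors per feature-map location (5 scales x 3 ratios = 15 anchors for the C4 config above). A rough, self-contained sketch of that expansion using the usual sqrt-ratio convention -- illustrative, not MMDetection's exact `AnchorGenerator` implementation:

```python
# Sketch: how scales x ratios become base anchors at one stride.
# For the rpn_r50_caffe_c4 settings above this yields 15 anchors.
import itertools
import math

stride = 16
scales = [2, 4, 8, 16, 32]
ratios = [0.5, 1.0, 2.0]

base_anchors = []
for scale, ratio in itertools.product(scales, ratios):
    base = scale * stride                 # anchor edge before aspect ratio
    w = base * math.sqrt(1 / ratio)       # wider for ratio < 1
    h = base * math.sqrt(ratio)           # taller for ratio > 1
    base_anchors.append((-w / 2, -h / 2, w / 2, h / 2))  # centered box

print(len(base_anchors))  # 15 anchors per feature-map location
```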
-------------------------------------------------------------------------------- /detection/configs/_base_/models/rpn_r50_fpn.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='RPN', 4 | backbone=dict( 5 | type='ResNet', 6 | depth=50, 7 | num_stages=4, 8 | out_indices=(0, 1, 2, 3), 9 | frozen_stages=1, 10 | norm_cfg=dict(type='BN', requires_grad=True), 11 | norm_eval=True, 12 | style='pytorch', 13 | init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), 14 | neck=dict( 15 | type='FPN', 16 | in_channels=[256, 512, 1024, 2048], 17 | out_channels=256, 18 | num_outs=5), 19 | rpn_head=dict( 20 | type='RPNHead', 21 | in_channels=256, 22 | feat_channels=256, 23 | anchor_generator=dict( 24 | type='AnchorGenerator', 25 | scales=[8], 26 | ratios=[0.5, 1.0, 2.0], 27 | strides=[4, 8, 16, 32, 64]), 28 | bbox_coder=dict( 29 | type='DeltaXYWHBBoxCoder', 30 | target_means=[.0, .0, .0, .0], 31 | target_stds=[1.0, 1.0, 1.0, 1.0]), 32 | loss_cls=dict( 33 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), 34 | loss_bbox=dict(type='L1Loss', loss_weight=1.0)), 35 | # model training and testing settings 36 | train_cfg=dict( 37 | rpn=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.7, 41 | neg_iou_thr=0.3, 42 | min_pos_iou=0.3, 43 | ignore_iof_thr=-1), 44 | sampler=dict( 45 | type='RandomSampler', 46 | num=256, 47 | pos_fraction=0.5, 48 | neg_pos_ub=-1, 49 | add_gt_as_proposals=False), 50 | allowed_border=0, 51 | pos_weight=-1, 52 | debug=False)), 53 | test_cfg=dict( 54 | rpn=dict( 55 | nms_pre=2000, 56 | max_per_img=1000, 57 | nms=dict(type='nms', iou_threshold=0.7), 58 | min_bbox_size=0))) 59 | -------------------------------------------------------------------------------- /detection/configs/_base_/models/ssd300.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | input_size = 300 3 | model = dict( 4 | type='SingleStageDetector', 5 | backbone=dict( 6 | type='SSDVGG', 7 | depth=16, 8 | with_last_pool=False, 9 | ceil_mode=True, 10 | out_indices=(3, 4), 11 | out_feature_indices=(22, 34), 12 | init_cfg=dict( 13 | type='Pretrained', checkpoint='open-mmlab://vgg16_caffe')), 14 | neck=dict( 15 | type='SSDNeck', 16 | in_channels=(512, 1024), 17 | out_channels=(512, 1024, 512, 256, 256, 256), 18 | level_strides=(2, 2, 1, 1), 19 | level_paddings=(1, 1, 0, 0), 20 | l2_norm_scale=20), 21 | bbox_head=dict( 22 | type='SSDHead', 23 | in_channels=(512, 1024, 512, 256, 256, 256), 24 | num_classes=80, 25 | anchor_generator=dict( 26 | type='SSDAnchorGenerator', 27 | scale_major=False, 28 | input_size=input_size, 29 | basesize_ratio_range=(0.15, 0.9), 30 | strides=[8, 16, 32, 64, 100, 300], 31 | ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), 32 | bbox_coder=dict( 33 | type='DeltaXYWHBBoxCoder', 34 | target_means=[.0, .0, .0, .0], 35 | target_stds=[0.1, 0.1, 0.2, 0.2])), 36 | # model training and testing settings 37 | train_cfg=dict( 38 | assigner=dict( 39 | type='MaxIoUAssigner', 40 | pos_iou_thr=0.5, 41 | neg_iou_thr=0.5, 42 | min_pos_iou=0., 43 | ignore_iof_thr=-1, 44 | gt_max_assign_all=False), 45 | smoothl1_beta=1., 46 | allowed_border=-1, 47 | pos_weight=-1, 48 | neg_pos_ratio=3, 49 | debug=False), 50 | test_cfg=dict( 51 | nms_pre=1000, 52 | nms=dict(type='nms', iou_threshold=0.45), 53 | min_bbox_size=0, 54 | score_thr=0.02, 55 | max_per_img=200)) 56 | cudnn_benchmark = True 57 | 
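The `_base_` model, dataset, and schedule fragments in this directory are never run directly; MMDetection composes them through config inheritance. A minimal sketch of how a top-level config under `configs/coco/` ties them together -- the backbone fields here are illustrative stand-ins, not values copied from the shipped FlashInternImage configs (see `configs/coco/mask_rcnn_flash_intern_image_t_fpn_1x_coco.py` for the real ones):

```python
# Hypothetical top-level config built on the _base_ fragments above.
_base_ = [
    '../_base_/models/mask_rcnn_r50_fpn.py',
    '../_base_/datasets/coco_instance.py',
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py',
]
model = dict(
    backbone=dict(
        _delete_=True,  # discard the inherited ResNet-50 settings entirely
        type='FlashInternImage',
        core_op='DCNv4',
        depths=[4, 4, 18, 4],           # illustrative stage depths
        out_indices=(0, 1, 2, 3)),
    neck=dict(in_channels=[64, 128, 256, 512]))  # match backbone widths
```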
-------------------------------------------------------------------------------- /detection/configs/_base_/schedules/schedule_1x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[8, 11]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=12) 12 | -------------------------------------------------------------------------------- /detection/configs/_base_/schedules/schedule_3x.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) 3 | optimizer_config = dict(grad_clip=None) 4 | # learning policy 5 | lr_config = dict( 6 | policy='step', 7 | warmup='linear', 8 | warmup_iters=500, 9 | warmup_ratio=0.001, 10 | step=[27, 33]) 11 | runner = dict(type='EpochBasedRunner', max_epochs=36) 12 | -------------------------------------------------------------------------------- /detection/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29511} 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 10 | -------------------------------------------------------------------------------- /detection/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | CONFIG=$1 3 | GPUS=$2 4 | PORT=${PORT:-29500} 5 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 6 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 7 | $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} -------------------------------------------------------------------------------- /detection/image_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import asyncio 3 | from argparse import ArgumentParser 4 | 5 | from mmdet.apis import (async_inference_detector, inference_detector, 6 | init_detector, show_result_pyplot) 7 | import mmcv 8 | import mmcv_custom # noqa: F401,F403 9 | import mmdet_custom # noqa: F401,F403 10 | import os.path as osp 11 | 12 | 13 | def parse_args(): 14 | parser = ArgumentParser() 15 | parser.add_argument('img', help='Image file') 16 | parser.add_argument('config', help='Config file') 17 | parser.add_argument('checkpoint', help='Checkpoint file') 18 | parser.add_argument('--out', type=str, default="demo", help='out dir') 19 | parser.add_argument( 20 | '--device', default='cuda:0', help='Device used for inference') 21 | parser.add_argument( 22 | '--palette', 23 | default='coco', 24 | choices=['coco', 'voc', 'citys', 'random'], 25 | help='Color palette used for visualization') 26 | parser.add_argument( 27 | '--score-thr', type=float, default=0.3, help='bbox score threshold') 28 | parser.add_argument( 29 | '--async-test', 30 | action='store_true', 31 | help='whether to set async options for async inference.') 32 | args = parser.parse_args() 33 | return args 34 | 35 | 36 | def main(args): 37 | # build the model from a config file and a checkpoint file 38 | model = init_detector(args.config, args.checkpoint, device=args.device) 39 | # test a single image 40 | result = inference_detector(model, args.img) 41 | 42 | mmcv.mkdir_or_exist(args.out) 43 | out_file = osp.join(args.out, osp.basename(args.img)) 44 | # show the results 45 | model.show_result( 46 | args.img, 47 | result, 48 | score_thr=args.score_thr, 49 | show=False, 50 | bbox_color=args.palette, 51 | text_color=(200, 200, 200), 52 | mask_color=args.palette, 53 | out_file=out_file 54 | ) 55 | print(f"Result is saved at {out_file}") 56 | 57 | 58 | 59 | if __name__ == '__main__': 60 | args = parse_args() 61 | main(args) -------------------------------------------------------------------------------- /detection/mmcv_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | # -*- coding: utf-8 -*- 8 | from .custom_layer_decay_optimizer_constructor import CustomLayerDecayOptimizerConstructor 9 | __all__ = ['CustomLayerDecayOptimizerConstructor'] 10 | -------------------------------------------------------------------------------- /detection/mmdet_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .models import * # noqa: F401,F403 8 | from .datasets import * -------------------------------------------------------------------------------- /detection/mmdet_custom/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .crowd_human import CrowdHumanDataset
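These wildcard imports are not cosmetic: importing `mmcv_custom`/`mmdet_custom` (as `image_demo.py` does above) is what registers the custom datasets and models with mmcv's registries, so configs can refer to them by `type` name. A toy sketch of that registry mechanism, not the repo's code:

```python
# Rough sketch of mmcv-style registration: classes are collected at
# import time by the decorator, and configs name them by 'type'.
from mmcv.utils import Registry

MODELS_SKETCH = Registry('models_sketch')


@MODELS_SKETCH.register_module()
class ToyBackbone:
    def __init__(self, depth=50):
        self.depth = depth


# A config dict names the class; the registry instantiates it.
toy = MODELS_SKETCH.build(dict(type='ToyBackbone', depth=18))
assert isinstance(toy, ToyBackbone) and toy.depth == 18
```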
-------------------------------------------------------------------------------- /detection/mmdet_custom/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .backbones import * # noqa: F401,F403 8 | from .dense_heads import * # noqa: F401,F403 9 | from .detectors import * # noqa: F401,F403 10 | from .utils import * # noqa: F401,F403 11 | from .necks.fpn import * -------------------------------------------------------------------------------- /detection/mmdet_custom/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from .flash_intern_image import FlashInternImage 7 | 8 | __all__ = ['FlashInternImage'] 9 | -------------------------------------------------------------------------------- /detection/mmdet_custom/models/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .deformable_detr_head import DeformableDETRHead 8 | from .detr_head import DETRHead 9 | from .dino_head import DINOHead 10 | from .msda import FlashMultiScaleDeformableAttention 11 | from .bbox_head import DCNv4FCBBoxHead 12 | from .mask_rcnn import MaskRCNN_ 13 | __all__ = ['DeformableDETRHead', 'DETRHead', 'DINOHead', 'FlashMultiScaleDeformableAttention', 'DCNv4FCBBoxHead', 'MaskRCNN_'] -------------------------------------------------------------------------------- /detection/mmdet_custom/models/dense_heads/mask_rcnn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from mmdet.models.builder import DETECTORS 3 | from .two_stage import TwoStageDetector 4 | 5 | 6 | @DETECTORS.register_module() 7 | class MaskRCNN_(TwoStageDetector): 8 | """Implementation of `Mask R-CNN <https://arxiv.org/abs/1703.06870>`_""" 9 | 10 | def __init__(self, 11 | backbone, 12 | rpn_head, 13 | roi_head, 14 | train_cfg, 15 | test_cfg, 16 | neck=None, 17 | pretrained=None, 18 | init_cfg=None): 19 | super(MaskRCNN_, self).__init__( 20 | backbone=backbone, 21 | neck=neck, 22 | rpn_head=rpn_head, 23 | roi_head=roi_head, 24 | train_cfg=train_cfg, 25 | test_cfg=test_cfg, 26 | pretrained=pretrained, 27 | init_cfg=init_cfg) 28 | -------------------------------------------------------------------------------- /detection/mmdet_custom/models/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .dino import DINO 8 | 9 | __all__ = ['DINO'] -------------------------------------------------------------------------------- /detection/mmdet_custom/models/detectors/dino.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmdet.models.builder import DETECTORS 3 | from mmdet.models.detectors.detr import DETR 4 | 5 | 6 | @DETECTORS.register_module() 7 | class DINO(DETR): 8 | 9 | def __init__(self, *args, **kwargs): 10 | super(DETR, self).__init__(*args, **kwargs) -------------------------------------------------------------------------------- /detection/mmdet_custom/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .query_denoising import build_dn_generator 2 | from .transformer import (DinoTransformer, DinoTransformerDecoder) 3 | from .convModule_norm import ConvModule_Norm 4 | 5 | 6 | __all__ = ['build_dn_generator', 'DinoTransformer', 'DinoTransformerDecoder', 'ConvModule_Norm'] -------------------------------------------------------------------------------- /detection/mmdet_custom/models/utils/convModule_norm.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from mmcv.cnn.bricks.conv_module import ConvModule 8 | 9 | class ConvModule_Norm(ConvModule): 10 | def __init__(self, in_channels, 11 | out_channels, 12 | kernel, **kwargs): 13 | super().__init__(in_channels, out_channels, kernel, **kwargs) 14 | 15 | self.normType = kwargs.get('norm_cfg', {'type':''}) 16 | if self.normType is not None: 17 | self.normType = self.normType['type'] 18 | 19 | def forward(self, x, activate=True, norm=True): 20 | for layer in self.order: 21 | if layer == 'conv': 22 | if self.with_explicit_padding: 23 | x = self.padding_layer(x) 24 | x = self.conv(x) 25 | elif layer == 'norm' and norm and self.with_norm: 26 | if 'LN' in self.normType: 27 | x = x.permute(0, 2, 3, 1) 28 | x = self.norm(x) 29 | x = x.permute(0, 3, 1, 2).contiguous() 30 | else: 31 | x = self.norm(x) 32 | elif layer == 'act' and activate and self.with_activation: 33 | x = self.activate(x) 34 | return x
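`ConvModule_Norm` special-cases LayerNorm because `nn.LayerNorm` normalizes over trailing dimensions while conv feature maps are channels-first; a self-contained sketch of that permute round-trip:

```python
# Standalone illustration of the NCHW <-> NHWC dance ConvModule_Norm
# performs for LayerNorm: permute so channels are last, normalize,
# permute back, and restore contiguity for subsequent conv layers.
import torch
import torch.nn as nn

x = torch.randn(2, 64, 32, 32)            # NCHW feature map
ln = nn.LayerNorm(64)                      # normalizes over the last dim
y = ln(x.permute(0, 2, 3, 1))              # NCHW -> NHWC
y = y.permute(0, 3, 1, 2).contiguous()     # NHWC -> NCHW
assert y.shape == x.shape
```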
-------------------------------------------------------------------------------- /detection/ops_dcnv3/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch 8 | -------------------------------------------------------------------------------- /detection/ops_dcnv3/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # -------------------------------------------------------- 3 | # DCNv4 4 | # Copyright (c) 2024 OpenGVLab 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | python setup.py build install 9 | -------------------------------------------------------------------------------- /detection/ops_dcnv3/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .dcnv3 import DCNv3, DCNv3_pytorch -------------------------------------------------------------------------------- /detection/ops_dcnv3/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import os 8 | import glob 9 | 10 | import torch 11 | 12 | from torch.utils.cpp_extension import CUDA_HOME 13 | from torch.utils.cpp_extension import CppExtension 14 | from torch.utils.cpp_extension import CUDAExtension 15 | 16 | from setuptools import find_packages 17 | from setuptools import setup 18 | 19 | requirements = ["torch", "torchvision"] 20 | 21 | 22 | def get_extensions(): 23 | this_dir = os.path.dirname(os.path.abspath(__file__)) 24 | extensions_dir = os.path.join(this_dir, "src") 25 | 26 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 27 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 28 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 29 | 30 | sources = main_file + source_cpu 31 | extension = CppExtension 32 | extra_compile_args = {"cxx": []} 33 | define_macros = [] 34 | 35 | if torch.cuda.is_available() and CUDA_HOME is not None: 36 | extension = CUDAExtension 37 | sources += source_cuda 38 | define_macros += [("WITH_CUDA", None)] 39 | extra_compile_args["nvcc"] = [ 40 | # "-DCUDA_HAS_FP16=1", 41 | # "-D__CUDA_NO_HALF_OPERATORS__", 42 | # "-D__CUDA_NO_HALF_CONVERSIONS__", 43 | # "-D__CUDA_NO_HALF2_OPERATORS__", 44 | ] 45 | else: 46 | raise NotImplementedError('CUDA is not available') 47 | 48 | sources = [os.path.join(extensions_dir, s) for s in sources] 49 | include_dirs = [extensions_dir] 50 | ext_modules = [ 51 | extension( 52 | "DCNv3", 53 | sources, 54 | include_dirs=include_dirs, 55 | define_macros=define_macros, 56 | extra_compile_args=extra_compile_args, 57 | ) 58 | ] 59 | return ext_modules 60 | 61 | 62 | setup(
63 | name="DCNv3", 64 | version="1.0", 65 | author="InternImage", 66 | url="https://github.com/OpenGVLab/InternImage", 67 | description= 68 | "PyTorch Wrapper for CUDA Functions of DCNv3", 69 | packages=find_packages(exclude=( 70 | "configs", 71 | "tests", 72 | )), 73 | ext_modules=get_extensions(), 74 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 75 | ) 76 | -------------------------------------------------------------------------------- /detection/ops_dcnv3/src/cpu/dcnv3_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #include <vector> 13 | 14 | #include <ATen/ATen.h> 15 | #include <ATen/cuda/CUDAContext.h> 16 | 17 | at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset, 18 | const at::Tensor &mask, const int kernel_h, 19 | const int kernel_w, const int stride_h, 20 | const int stride_w, const int pad_h, 21 | const int pad_w, const int dilation_h, 22 | const int dilation_w, const int group, 23 | const int group_channels, const float offset_scale, 24 | const int im2col_step) { 25 | AT_ERROR("Not implemented on the CPU"); 26 | } 27 | 28 | std::vector<at::Tensor> 29 | dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset, 30 | const at::Tensor &mask, const int kernel_h, 31 | const int kernel_w, const int stride_h, const int stride_w, 32 | const int pad_h, const int pad_w, const int dilation_h, 33 | const int dilation_w, const int group, 34 | const int group_channels, const float offset_scale, 35 | const at::Tensor &grad_output, const int im2col_step) { 36 | AT_ERROR("Not implemented on the CPU"); 37 | } 38 | -------------------------------------------------------------------------------- /detection/ops_dcnv3/src/cpu/dcnv3_cpu.h: -------------------------------------------------------------------------------- 1 | /*!
2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | 15 | at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset, 16 | const at::Tensor &mask, const int kernel_h, 17 | const int kernel_w, const int stride_h, 18 | const int stride_w, const int pad_h, 19 | const int pad_w, const int dilation_h, 20 | const int dilation_w, const int group, 21 | const int group_channels, const float offset_scale, 22 | const int im2col_step); 23 | 24 | std::vector<at::Tensor> 25 | dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset, 26 | const at::Tensor &mask, const int kernel_h, 27 | const int kernel_w, const int stride_h, const int stride_w, 28 | const int pad_h, const int pad_w, const int dilation_h, 29 | const int dilation_w, const int group, 30 | const int group_channels, const float offset_scale, 31 | const at::Tensor &grad_output, const int im2col_step); 32 | -------------------------------------------------------------------------------- /detection/ops_dcnv3/src/cuda/dcnv3_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | #include <torch/extension.h> 14 | 15 | at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset, 16 | const at::Tensor &mask, const int kernel_h, 17 | const int kernel_w, const int stride_h, 18 | const int stride_w, const int pad_h, 19 | const int pad_w, const int dilation_h, 20 | const int dilation_w, const int group, 21 | const int group_channels, 22 | const float offset_scale, const int im2col_step); 23 | 24 | std::vector<at::Tensor> 25 | dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, 26 | const at::Tensor &mask, const int kernel_h, 27 | const int kernel_w, const int stride_h, const int stride_w, 28 | const int pad_h, const int pad_w, const int dilation_h, 29 | const int dilation_w, const int group, 30 | const int group_channels, const float offset_scale, 31 | const at::Tensor &grad_output, const int im2col_step); 32 | -------------------------------------------------------------------------------- /detection/ops_dcnv3/src/dcnv3.h: -------------------------------------------------------------------------------- 1 | /*!
2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #pragma once 13 | 14 | #include "cpu/dcnv3_cpu.h" 15 | 16 | #ifdef WITH_CUDA 17 | #include "cuda/dcnv3_cuda.h" 18 | #endif 19 | 20 | at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset, 21 | const at::Tensor &mask, const int kernel_h, 22 | const int kernel_w, const int stride_h, 23 | const int stride_w, const int pad_h, const int pad_w, 24 | const int dilation_h, const int dilation_w, 25 | const int group, const int group_channels, 26 | const float offset_scale, const int im2col_step) { 27 | if (input.type().is_cuda()) { 28 | #ifdef WITH_CUDA 29 | return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w, 30 | stride_h, stride_w, pad_h, pad_w, dilation_h, 31 | dilation_w, group, group_channels, 32 | offset_scale, im2col_step); 33 | #else 34 | AT_ERROR("Not compiled with GPU support"); 35 | #endif 36 | } 37 | AT_ERROR("Not implemented on the CPU"); 38 | } 39 | 40 | std::vector<at::Tensor> 41 | dcnv3_backward(const at::Tensor &input, const at::Tensor &offset, 42 | const at::Tensor &mask, const int kernel_h, const int kernel_w, 43 | const int stride_h, const int stride_w, const int pad_h, 44 | const int pad_w, const int dilation_h, const int dilation_w, 45 | const int group, const int group_channels, 46 | const float offset_scale, const at::Tensor &grad_output, 47 | const int im2col_step) { 48 | if (input.type().is_cuda()) { 49 | #ifdef WITH_CUDA 50 | return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w, 51 | stride_h, stride_w, pad_h, pad_w, dilation_h, 52 | dilation_w, group, group_channels, 53 | offset_scale, grad_output, im2col_step); 54 | #else 55 | AT_ERROR("Not compiled with GPU support"); 56 | #endif 57 | } 58 | AT_ERROR("Not implemented on the CPU"); 59 | } 60 |
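On the Python side these entry points are wrapped in a `torch.autograd.Function` (the repo's real wrapper lives in `ops_dcnv3/functions/dcnv3_func.py`, not shown here). A simplified sketch of that wrapping, with arguments ordered as in the C++ declarations above and the extension module name taken from `setup.py`:

```python
# Simplified sketch, not the repo's dcnv3_func.py: forwards to the
# "DCNv3" extension built by ops_dcnv3/setup.py.
import torch
import DCNv3  # compiled extension module


class DCNv3FunctionSketch(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, offset, mask, kernel_h, kernel_w, stride_h,
                stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                group_channels, offset_scale, im2col_step):
        ctx.hparams = (kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                       dilation_h, dilation_w, group, group_channels,
                       offset_scale, im2col_step)
        ctx.save_for_backward(input, offset, mask)
        return DCNv3.dcnv3_forward(input, offset, mask, kernel_h, kernel_w,
                                   stride_h, stride_w, pad_h, pad_w,
                                   dilation_h, dilation_w, group,
                                   group_channels, offset_scale, im2col_step)

    @staticmethod
    def backward(ctx, grad_output):
        input, offset, mask = ctx.saved_tensors
        *conv_args, offset_scale, im2col_step = ctx.hparams
        grad_input, grad_offset, grad_mask = DCNv3.dcnv3_backward(
            input, offset, mask, *conv_args, offset_scale,
            grad_output.contiguous(), im2col_step)
        # No gradients flow to the 12 integer/float hyperparameters.
        return (grad_input, grad_offset, grad_mask) + (None,) * 12
```
-------------------------------------------------------------------------------- /detection/ops_dcnv3/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*!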
2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #include "dcnv3.h" 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 15 | m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward"); 16 | m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward"); 17 | } 18 | -------------------------------------------------------------------------------- /detection/slurm_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | CHECKPOINT=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | PY_ARGS=${@:5} 13 | SRUN_ARGS=${SRUN_ARGS:-""} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | --quotatype=spot \ 24 | ${SRUN_ARGS} \ 25 | python -u test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} 26 | -------------------------------------------------------------------------------- /detection/slurm_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | WORK_DIR=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 12 | SRUN_ARGS=${SRUN_ARGS:-""} 13 | PY_ARGS=${@:5} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | --quotatype=reserved \ 24 | ${SRUN_ARGS} \ 25 | python -u train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} -------------------------------------------------------------------------------- /detection/tools/create_crowd_anno.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle as pkl 4 | import numpy as np 5 | import random 6 | from PIL import Image 7 | import concurrent.futures 8 | import json 9 | import mmcv 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser(description='Generate MMDetection Annotations for Crowdhuman-like dataset') 13 | parser.add_argument('--dataset', help='dataset name', type=str) 14 | parser.add_argument('--dataset-split', help='dataset split, e.g. 
train, val', type=str) 15 | 16 | args = parser.parse_args() 17 | return args.dataset, args.dataset_split 18 | 19 | def load_func(fpath): 20 | assert os.path.exists(fpath) 21 | with open(fpath, 'r') as fid: 22 | lines = fid.readlines() 23 | records = [json.loads(line.strip('\n')) for line in lines] 24 | return records 25 | 26 | def decode_annotations(records, dataset_path): 27 | rec_ids = list(range(len(records))) 28 | img_list = [] 29 | ann_list = [] 30 | ann_id = 1 31 | for idx, rec_id in enumerate(rec_ids): 32 | img_id = records[rec_id]['ID'] 33 | img_url = dataset_path + 'Images/' + img_id + '.jpg' 34 | assert os.path.exists(img_url) 35 | im = Image.open(img_url) 36 | im_w, im_h = im.width, im.height 37 | 38 | gt_box = records[rec_id]['gtboxes'] 39 | gt_box_len = len(gt_box) 40 | img_dict = dict( 41 | file_name=img_id + '.jpg', 42 | height=im_h, 43 | width=im_w, 44 | id=idx 45 | ) 46 | img_list.append(img_dict) 47 | for ii in range(gt_box_len): 48 | each_data = gt_box[ii] 49 | x, y, w, h = each_data['fbox'] 50 | 51 | if w <= 0 or h <= 0: 52 | continue 53 | # x1 = x; y1 = y; x2 = x + w; y2 = y + h 54 | 55 | valid_bbox = [x, y, w, h] 56 | if each_data['tag'] == 'person': 57 | tag = 1 58 | else: 59 | tag = -2 60 | if 'extra' in each_data: 61 | if 'ignore' in each_data['extra']: 62 | if each_data['extra']['ignore'] != 0: 63 | tag = -2 64 | ann_dict = dict( 65 | area=w * h, 66 | iscrowd=1 if tag == -2 else 0, 67 | image_id=idx, 68 | bbox=[x, y, w, h], 69 | category_id=1, 70 | id=ann_id, 71 | # ignore=1 if tag == -2 else 1, 72 | ) 73 | ann_id += 1 74 | ann_list.append(ann_dict) 75 | cate_list = [{'supercategory': 'none', 'id': 1, 'name': 'person'}] 76 | json_dict = dict( 77 | images=img_list, 78 | annotations=ann_list, 79 | categories=cate_list 80 | ) 81 | return json_dict 82 | 83 | if __name__ == "__main__": 84 | dataset_name, dataset_type = parse_args() 85 | dataset_path = 'data/%s/' % dataset_name 86 | ch_file_path = dataset_path + 'annotations/annotation_%s.odgt' % dataset_type 87 | json_file_path = dataset_path + 'annotations/annotation_%s.json' % dataset_type 88 | 89 | records = load_func(ch_file_path) 90 | print("Loading Annotations Done") 91 | 92 | json_dict = decode_annotations(records, dataset_path) 93 | 94 | print("Parsing Bbox Number: %d" % len(json_dict['annotations'])) 95 | mmcv.dump(json_dict, json_file_path) 96 | -------------------------------------------------------------------------------- /detection/tools/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .compute_APMR import compute_APMR 2 | from .compute_JI import compute_JI_with_ignore -------------------------------------------------------------------------------- /segmentation/README.md: -------------------------------------------------------------------------------- 1 | # FlashInternImage for Semantic Segmentation 2 | 3 | This folder contains the implementation of FlashInternImage for semantic segmentation. 4 | 5 | Our segmentation code is developed on top of [MMSegmentation v0.27.0](https://github.com/open-mmlab/mmsegmentation/tree/v0.27.0).
6 | 7 | ## Usage 8 | 9 | ### Install 10 | 11 | - Clone this repo: 12 | 13 | ```bash 14 | git clone https://github.com/OpenGVLab/DCNv4.git 15 | cd DCNv4 16 | ``` 17 | 18 | - Create a conda virtual environment and activate it: 19 | 20 | ```bash 21 | conda create -n dcnv4 python=3.7 -y 22 | conda activate dcnv4 23 | ``` 24 | 25 | - Install `CUDA>=10.2` with `cudnn>=7` following 26 | the [official installation instructions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) 27 | - Install `PyTorch>=1.10.0` and `torchvision>=0.9.0` with `CUDA>=10.2`: 28 | 29 | For example, to install torch==1.11 with CUDA==11.3 and nvcc: 30 | ```bash 31 | conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch -y 32 | conda install -c conda-forge cudatoolkit-dev=11.3 -y # to install nvcc 33 | ``` 34 | 35 | - Install other requirements: 36 | 37 | Note: the conda build of OpenCV can break torchvision's GPU support, so install OpenCV via pip. 38 | 39 | ```bash 40 | conda install -c conda-forge termcolor yacs pyyaml scipy pip -y 41 | pip install opencv-python 42 | ``` 43 | 44 | - Install `timm`, `mmcv-full`, and `mmsegmentation`: 45 | 46 | ```bash 47 | pip install -U openmim 48 | mim install mmcv-full==1.5.0 49 | mim install mmsegmentation==0.27.0 50 | pip install timm==0.6.11 mmdet==2.28.1 51 | ``` 52 | 53 | - Install DCNv4 54 | ```bash 55 | pip install DCNv4 56 | ``` 57 | 58 | ### Data Preparation 59 | 60 | Prepare datasets according to the [guidelines](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#prepare-datasets) in MMSegmentation. 61 | 62 | 63 | ### Evaluation 64 | 65 | To evaluate our `FlashInternImage` on ADE20K val, run: 66 | 67 | ```bash 68 | sh dist_test.sh <config-file> <checkpoint> <gpu-num> --eval mIoU 69 | ``` 70 | You can download checkpoint files from [here](https://huggingface.co/OpenGVLab/DCNv4). Then place them under `segmentation/checkpoint_dir/seg`.
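If you prefer to script the download, a minimal sketch using `huggingface_hub` (the `repo_id` is real; the filename below is an illustrative guess, so check the model hub page for the exact names):

```python
# Hypothetical download helper -- verify the exact checkpoint filename
# at https://huggingface.co/OpenGVLab/DCNv4 before use.
from huggingface_hub import hf_hub_download

ckpt = hf_hub_download(
    repo_id='OpenGVLab/DCNv4',
    filename='upernet_flash_internimage_t_512_160k_ade20k.pth',
    local_dir='checkpoint_dir/seg')
print('checkpoint saved to', ckpt)
```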
71 | 72 | For example, to evaluate the `FlashInternImage-T` with a single GPU: 73 | 74 | ```bash 75 | python test.py configs/ade20k/upernet_flash_internimage_t_512_160k_ade20k.py checkpoint_dir/seg/upernet_flash_internimage_t_512_160k_ade20k.pth --eval mIoU 76 | ``` 77 | 78 | For example, to evaluate the `FlashInternImage-B` on a single node with 8 GPUs: 79 | 80 | ```bash 81 | sh dist_test.sh configs/ade20k/upernet_flash_internimage_b_512_160k_ade20k.py checkpoint_dir/seg/upernet_flash_internimage_b_512_160k_ade20k.pth 8 --eval mIoU 82 | ``` 83 | 84 | ### Training 85 | 86 | To train a `FlashInternImage` on ADE20K, run: 87 | 88 | ```bash 89 | sh dist_train.sh <config-file> <gpu-num> 90 | ``` 91 | 92 | For example, to train `FlashInternImage-T` with 8 GPUs on 1 node (total batch size 16), run: 93 | 94 | ```bash 95 | sh dist_train.sh configs/ade20k/upernet_flash_internimage_t_512_160k_ade20k.py 8 96 | ``` 97 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/ade20k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ADE20KDataset' 3 | data_root = 'data/ADEChallengeData2016' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', reduce_zero_label=True), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 512), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='images/training', 41 | ann_dir='annotations/training', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='images/validation', 47 | ann_dir='annotations/validation', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='images/validation', 53 | ann_dir='annotations/validation', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/ade20k_640x640.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ADE20KDataset' 3 | data_root = 'data/ADEChallengeData2016' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (640, 640) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', reduce_zero_label=True), 10 | dict(type='Resize', img_scale=(2560, 640), ratio_range=(0.5,
2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2560, 640), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='images/training', 41 | ann_dir='annotations/training', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='images/validation', 47 | ann_dir='annotations/validation', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='images/validation', 53 | ann_dir='annotations/validation', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/chase_db1.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'ChaseDB1Dataset' 3 | data_root = 'data/CHASE_DB1' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | img_scale = (960, 999) 7 | crop_size = (128, 128) 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']) 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=img_scale, 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']) 33 | ]) 34 | ] 35 | 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type='RepeatDataset', 41 | times=40000, 42 | dataset=dict( 43 | type=dataset_type, 44 | data_root=data_root, 45 | img_dir='images/training', 46 | ann_dir='annotations/training', 47 | pipeline=train_pipeline)), 48 | val=dict( 49 | type=dataset_type, 50 | data_root=data_root, 51 | img_dir='images/validation', 52 | ann_dir='annotations/validation', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='images/validation', 58 | ann_dir='annotations/validation', 59 | pipeline=test_pipeline)) 60 | 
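The CHASE_DB1 config above wraps its small training split in a `RepeatDataset` with `times=40000`, so iteration-based runners never exhaust the dataset mid-schedule. Functionally the wrapper is just modular index arithmetic; a rough sketch of mmseg's `RepeatDataset` behavior:

```python
# Rough sketch of mmseg's RepeatDataset: a fixed-length virtual dataset
# that cycles through the wrapped dataset by modular indexing.
class RepeatDatasetSketch:
    def __init__(self, dataset, times):
        self.dataset = dataset
        self.times = times

    def __getitem__(self, idx):
        return self.dataset[idx % len(self.dataset)]

    def __len__(self):
        return self.times * len(self.dataset)
```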
-------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/cityscapes.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CityscapesDataset' 3 | data_root = 'data/cityscapes/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 1024) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 1024), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=2, 36 | workers_per_gpu=2, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='leftImg8bit/train', 41 | ann_dir='gtFine/train', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='leftImg8bit/val', 47 | ann_dir='gtFine/val', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='leftImg8bit/val', 53 | ann_dir='gtFine/val', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/cityscapes_1024x1024.py: -------------------------------------------------------------------------------- 1 | _base_ = './cityscapes.py' 2 | img_norm_cfg = dict( 3 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 4 | crop_size = (1024, 1024) 5 | train_pipeline = [ 6 | dict(type='LoadImageFromFile'), 7 | dict(type='LoadAnnotations'), 8 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), 9 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 10 | dict(type='RandomFlip', prob=0.5), 11 | dict(type='PhotoMetricDistortion'), 12 | dict(type='Normalize', **img_norm_cfg), 13 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 14 | dict(type='DefaultFormatBundle'), 15 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 16 | ] 17 | test_pipeline = [ 18 | dict(type='LoadImageFromFile'), 19 | dict( 20 | type='MultiScaleFlipAug', 21 | img_scale=(2048, 1024), 22 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 23 | flip=False, 24 | transforms=[ 25 | dict(type='Resize', keep_ratio=True), 26 | dict(type='RandomFlip'), 27 | dict(type='Normalize', **img_norm_cfg), 28 | dict(type='ImageToTensor', keys=['img']), 29 | dict(type='Collect', keys=['img']), 30 | ]) 31 | ] 32 | data = dict( 33 | train=dict(pipeline=train_pipeline), 34 | val=dict(pipeline=test_pipeline), 35 | test=dict(pipeline=test_pipeline)) 36 | -------------------------------------------------------------------------------- 
/segmentation/configs/_base_/datasets/cityscapes_extra.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'CityscapesDataset' 3 | data_root = 'data/cityscapes/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 1024) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 1024), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=2, 36 | workers_per_gpu=2, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir=['leftImg8bit/train', 'leftImg8bit/train_extra'], 41 | ann_dir=['gtFine/train', 'refinement_final/train_extra'], 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='leftImg8bit/val', 47 | ann_dir='gtFine/val', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='leftImg8bit/val', 53 | ann_dir='gtFine/val', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/coco-stuff10k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'COCOStuffDataset' 3 | data_root = 'data/coco_stuff10k' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', reduce_zero_label=True), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 512), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | 
reduce_zero_label=True, 41 | img_dir='images/train2014', 42 | ann_dir='annotations/train2014', 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type=dataset_type, 46 | data_root=data_root, 47 | reduce_zero_label=True, 48 | img_dir='images/test2014', 49 | ann_dir='annotations/test2014', 50 | pipeline=test_pipeline), 51 | test=dict( 52 | type=dataset_type, 53 | data_root=data_root, 54 | reduce_zero_label=True, 55 | img_dir='images/test2014', 56 | ann_dir='annotations/test2014', 57 | pipeline=test_pipeline)) 58 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/coco-stuff164k.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'COCOStuffDataset' 3 | data_root = 'data/coco_stuff164k' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 512), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='images/train2017', 41 | ann_dir='annotations/train2017', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='images/val2017', 47 | ann_dir='annotations/val2017', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='images/val2017', 53 | ann_dir='annotations/val2017', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/drive.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'DRIVEDataset' 3 | data_root = 'data/DRIVE' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | img_scale = (584, 565) 7 | crop_size = (64, 64) 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']) 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 
| dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=img_scale, 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']) 33 | ]) 34 | ] 35 | 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type='RepeatDataset', 41 | times=40000, 42 | dataset=dict( 43 | type=dataset_type, 44 | data_root=data_root, 45 | img_dir='images/training', 46 | ann_dir='annotations/training', 47 | pipeline=train_pipeline)), 48 | val=dict( 49 | type=dataset_type, 50 | data_root=data_root, 51 | img_dir='images/validation', 52 | ann_dir='annotations/validation', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='images/validation', 58 | ann_dir='annotations/validation', 59 | pipeline=test_pipeline)) 60 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/hrf.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'HRFDataset' 3 | data_root = 'data/HRF' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | img_scale = (2336, 3504) 7 | crop_size = (256, 256) 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']) 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=img_scale, 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']) 33 | ]) 34 | ] 35 | 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type='RepeatDataset', 41 | times=40000, 42 | dataset=dict( 43 | type=dataset_type, 44 | data_root=data_root, 45 | img_dir='images/training', 46 | ann_dir='annotations/training', 47 | pipeline=train_pipeline)), 48 | val=dict( 49 | type=dataset_type, 50 | data_root=data_root, 51 | img_dir='images/validation', 52 | ann_dir='annotations/validation', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='images/validation', 58 | ann_dir='annotations/validation', 59 | pipeline=test_pipeline)) 60 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/loveda.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'LoveDADataset' 3 | data_root = 'data/loveDA' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | 
dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations', reduce_zero_label=True), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(1024, 1024), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='img_dir/train', 41 | ann_dir='ann_dir/train', 42 | pipeline=train_pipeline), 43 | val=dict( 44 | type=dataset_type, 45 | data_root=data_root, 46 | img_dir='img_dir/val', 47 | ann_dir='ann_dir/val', 48 | pipeline=test_pipeline), 49 | test=dict( 50 | type=dataset_type, 51 | data_root=data_root, 52 | img_dir='img_dir/val', 53 | ann_dir='ann_dir/val', 54 | pipeline=test_pipeline)) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/mapillary.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'MapillaryDataset' 3 | data_root = 'data/Mapillary/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 1024) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='MapillaryHack'), 11 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 1.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=(2048, 1024), 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']), 33 | ]) 34 | ] 35 | data = dict( 36 | samples_per_gpu=2, 37 | workers_per_gpu=2, 38 | train=dict( 39 | type=dataset_type, 40 | data_root='data/Mapillary/', 41 | img_dir=['training/images', 'validation/images'], 42 | ann_dir=['training/labels', 'validation/labels'], 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type='CityscapesDataset', 46 | data_root='data/cityscapes/', 47 | img_dir='leftImg8bit/val', 48 | ann_dir='gtFine/val', 49 | pipeline=test_pipeline), 50 | test=dict( 51 | type='CityscapesDataset', 52 | data_root='data/cityscapes/', 53 | img_dir='leftImg8bit/val', 54 | ann_dir='gtFine/val', 55 | 
pipeline=test_pipeline)) 56 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/mapillary_1024x1024.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'MapillaryDataset' 3 | data_root = 'data/Mapillary/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (1024, 1024) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='MapillaryHack'), 11 | dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 1.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=(2048, 1024), 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']), 33 | ]) 34 | ] 35 | data = dict( 36 | samples_per_gpu=2, 37 | workers_per_gpu=2, 38 | train=dict( 39 | type=dataset_type, 40 | data_root='data/Mapillary/', 41 | img_dir=['training/images', 'validation/images'], 42 | ann_dir=['training/labels', 'validation/labels'], 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type='CityscapesDataset', 46 | data_root='data/cityscapes/', 47 | img_dir='leftImg8bit/val', 48 | ann_dir='gtFine/val', 49 | pipeline=test_pipeline), 50 | test=dict( 51 | type='CityscapesDataset', 52 | data_root='data/cityscapes/', 53 | img_dir='leftImg8bit/val', 54 | ann_dir='gtFine/val', 55 | pipeline=test_pipeline)) 56 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/nyu_depth_v2.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'NYUDepthV2Dataset' 3 | data_root = 'data/nyu_depth_v2/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | 7 | crop_size = (480, 480) 8 | 9 | train_pipeline = [ 10 | dict(type='LoadImageFromFile'), 11 | dict(type='LoadAnnotations', reduce_zero_label=True), 12 | dict(type='Resize', img_scale=(640, 480), ratio_range=(0.5, 2.0)), 13 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 14 | dict(type='RandomFlip', prob=0.5), 15 | dict(type='PhotoMetricDistortion'), 16 | dict(type='Normalize', **img_norm_cfg), 17 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 18 | dict(type='DefaultFormatBundle'), 19 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 20 | ] 21 | test_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict( 24 | type='MultiScaleFlipAug', 25 | img_scale=(640, 480), 26 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 27 | flip=False, 28 | transforms=[ 29 | dict(type='Resize', keep_ratio=True), 30 | dict(type='RandomFlip'), 31 | dict(type='Normalize', **img_norm_cfg), 32 | dict(type='ImageToTensor', keys=['img']), 33 | dict(type='Collect', 
keys=['img']), 34 | ]) 35 | ] 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type=dataset_type, 41 | data_root=data_root, 42 | img_dir='image', 43 | ann_dir='label40', 44 | split='train.txt', 45 | pipeline=train_pipeline), 46 | val=dict( 47 | type=dataset_type, 48 | data_root=data_root, 49 | img_dir='image', 50 | ann_dir='label40', 51 | split='test.txt', 52 | pipeline=test_pipeline), 53 | test=dict( 54 | type=dataset_type, 55 | data_root=data_root, 56 | img_dir='image', 57 | ann_dir='label40', 58 | split='test.txt', 59 | pipeline=test_pipeline)) 60 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_context.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalContextDataset' 3 | data_root = 'data/VOCdevkit/VOC2010/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | 7 | img_scale = (520, 520) 8 | crop_size = (480, 480) 9 | 10 | train_pipeline = [ 11 | dict(type='LoadImageFromFile'), 12 | dict(type='LoadAnnotations'), 13 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 14 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 15 | dict(type='RandomFlip', prob=0.5), 16 | dict(type='PhotoMetricDistortion'), 17 | dict(type='Normalize', **img_norm_cfg), 18 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 19 | dict(type='DefaultFormatBundle'), 20 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 21 | ] 22 | test_pipeline = [ 23 | dict(type='LoadImageFromFile'), 24 | dict( 25 | type='MultiScaleFlipAug', 26 | img_scale=img_scale, 27 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 28 | flip=False, 29 | transforms=[ 30 | dict(type='Resize', keep_ratio=True), 31 | dict(type='RandomFlip'), 32 | dict(type='Normalize', **img_norm_cfg), 33 | dict(type='ImageToTensor', keys=['img']), 34 | dict(type='Collect', keys=['img']), 35 | ]) 36 | ] 37 | data = dict( 38 | samples_per_gpu=4, 39 | workers_per_gpu=4, 40 | train=dict( 41 | type=dataset_type, 42 | data_root=data_root, 43 | img_dir='JPEGImages', 44 | ann_dir='SegmentationClassContext', 45 | split='ImageSets/SegmentationContext/train.txt', 46 | pipeline=train_pipeline), 47 | val=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | img_dir='JPEGImages', 51 | ann_dir='SegmentationClassContext', 52 | split='ImageSets/SegmentationContext/val.txt', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='JPEGImages', 58 | ann_dir='SegmentationClassContext', 59 | split='ImageSets/SegmentationContext/val.txt', 60 | pipeline=test_pipeline)) 61 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_context_59.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalContextDataset59' 3 | data_root = 'data/VOCdevkit/VOC2010/' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | 7 | img_scale = (520, 520) 8 | crop_size = (480, 480) 9 | 10 | train_pipeline = [ 11 | dict(type='LoadImageFromFile'), 12 | dict(type='LoadAnnotations', reduce_zero_label=True), 13 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 14 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 15 | 
dict(type='RandomFlip', prob=0.5), 16 | dict(type='PhotoMetricDistortion'), 17 | dict(type='Normalize', **img_norm_cfg), 18 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 19 | dict(type='DefaultFormatBundle'), 20 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 21 | ] 22 | test_pipeline = [ 23 | dict(type='LoadImageFromFile'), 24 | dict( 25 | type='MultiScaleFlipAug', 26 | img_scale=img_scale, 27 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 28 | flip=False, 29 | transforms=[ 30 | dict(type='Resize', keep_ratio=True), 31 | dict(type='RandomFlip'), 32 | dict(type='Normalize', **img_norm_cfg), 33 | dict(type='ImageToTensor', keys=['img']), 34 | dict(type='Collect', keys=['img']), 35 | ]) 36 | ] 37 | data = dict( 38 | samples_per_gpu=4, 39 | workers_per_gpu=4, 40 | train=dict( 41 | type=dataset_type, 42 | data_root=data_root, 43 | img_dir='JPEGImages', 44 | ann_dir='SegmentationClassContext', 45 | split='ImageSets/SegmentationContext/train.txt', 46 | pipeline=train_pipeline), 47 | val=dict( 48 | type=dataset_type, 49 | data_root=data_root, 50 | img_dir='JPEGImages', 51 | ann_dir='SegmentationClassContext', 52 | split='ImageSets/SegmentationContext/val.txt', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='JPEGImages', 58 | ann_dir='SegmentationClassContext', 59 | split='ImageSets/SegmentationContext/val.txt', 60 | pipeline=test_pipeline)) 61 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_voc12.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'PascalVOCDataset' 3 | data_root = 'data/VOCdevkit/VOC2012' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | crop_size = (512, 512) 7 | train_pipeline = [ 8 | dict(type='LoadImageFromFile'), 9 | dict(type='LoadAnnotations'), 10 | dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), 11 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 12 | dict(type='RandomFlip', prob=0.5), 13 | dict(type='PhotoMetricDistortion'), 14 | dict(type='Normalize', **img_norm_cfg), 15 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 16 | dict(type='DefaultFormatBundle'), 17 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 18 | ] 19 | test_pipeline = [ 20 | dict(type='LoadImageFromFile'), 21 | dict( 22 | type='MultiScaleFlipAug', 23 | img_scale=(2048, 512), 24 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 25 | flip=False, 26 | transforms=[ 27 | dict(type='Resize', keep_ratio=True), 28 | dict(type='RandomFlip'), 29 | dict(type='Normalize', **img_norm_cfg), 30 | dict(type='ImageToTensor', keys=['img']), 31 | dict(type='Collect', keys=['img']), 32 | ]) 33 | ] 34 | data = dict( 35 | samples_per_gpu=4, 36 | workers_per_gpu=4, 37 | train=dict( 38 | type=dataset_type, 39 | data_root=data_root, 40 | img_dir='JPEGImages', 41 | ann_dir='SegmentationClass', 42 | split='ImageSets/Segmentation/train.txt', 43 | pipeline=train_pipeline), 44 | val=dict( 45 | type=dataset_type, 46 | data_root=data_root, 47 | img_dir='JPEGImages', 48 | ann_dir='SegmentationClass', 49 | split='ImageSets/Segmentation/val.txt', 50 | pipeline=test_pipeline), 51 | test=dict( 52 | type=dataset_type, 53 | data_root=data_root, 54 | img_dir='JPEGImages', 55 | ann_dir='SegmentationClass', 56 | split='ImageSets/Segmentation/val.txt', 57 | pipeline=test_pipeline)) 58 
| -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/pascal_voc12_aug.py: -------------------------------------------------------------------------------- 1 | _base_ = './pascal_voc12.py' 2 | # dataset settings 3 | data = dict( 4 | train=dict( 5 | ann_dir=['SegmentationClass', 'SegmentationClassAug'], 6 | split=[ 7 | 'ImageSets/Segmentation/train.txt', 8 | 'ImageSets/Segmentation/aug.txt' 9 | ])) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/potsdam.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/DCNv4/4b848f7dd7da74ff03f7d278f902c6fd05b391b5/segmentation/configs/_base_/datasets/potsdam.py -------------------------------------------------------------------------------- /segmentation/configs/_base_/datasets/stare.py: -------------------------------------------------------------------------------- 1 | # dataset settings 2 | dataset_type = 'STAREDataset' 3 | data_root = 'data/STARE' 4 | img_norm_cfg = dict( 5 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 6 | img_scale = (605, 700) 7 | crop_size = (128, 128) 8 | train_pipeline = [ 9 | dict(type='LoadImageFromFile'), 10 | dict(type='LoadAnnotations'), 11 | dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), 12 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 13 | dict(type='RandomFlip', prob=0.5), 14 | dict(type='PhotoMetricDistortion'), 15 | dict(type='Normalize', **img_norm_cfg), 16 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 17 | dict(type='DefaultFormatBundle'), 18 | dict(type='Collect', keys=['img', 'gt_semantic_seg']) 19 | ] 20 | test_pipeline = [ 21 | dict(type='LoadImageFromFile'), 22 | dict( 23 | type='MultiScaleFlipAug', 24 | img_scale=img_scale, 25 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], 26 | flip=False, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict(type='Normalize', **img_norm_cfg), 31 | dict(type='ImageToTensor', keys=['img']), 32 | dict(type='Collect', keys=['img']) 33 | ]) 34 | ] 35 | 36 | data = dict( 37 | samples_per_gpu=4, 38 | workers_per_gpu=4, 39 | train=dict( 40 | type='RepeatDataset', 41 | times=40000, 42 | dataset=dict( 43 | type=dataset_type, 44 | data_root=data_root, 45 | img_dir='images/training', 46 | ann_dir='annotations/training', 47 | pipeline=train_pipeline)), 48 | val=dict( 49 | type=dataset_type, 50 | data_root=data_root, 51 | img_dir='images/validation', 52 | ann_dir='annotations/validation', 53 | pipeline=test_pipeline), 54 | test=dict( 55 | type=dataset_type, 56 | data_root=data_root, 57 | img_dir='images/validation', 58 | ann_dir='annotations/validation', 59 | pipeline=test_pipeline)) 60 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # yapf:disable 2 | log_config = dict( 3 | interval=50, 4 | hooks=[ 5 | dict(type='TextLoggerHook', by_epoch=False), 6 | # dict(type='TensorboardLoggerHook') 7 | ]) 8 | # yapf:enable 9 | dist_params = dict(backend='nccl') 10 | log_level = 'INFO' 11 | load_from = None 12 | resume_from = None 13 | workflow = [('train', 1)] 14 | cudnn_benchmark = True 15 | -------------------------------------------------------------------------------- 
/segmentation/configs/_base_/models/segformer_mit-b0.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained=None, 6 | backbone=dict( 7 | type='MixVisionTransformer', 8 | in_channels=3, 9 | embed_dims=32, 10 | num_stages=4, 11 | num_layers=[2, 2, 2, 2], 12 | num_heads=[1, 2, 5, 8], 13 | patch_sizes=[7, 3, 3, 3], 14 | sr_ratios=[8, 4, 2, 1], 15 | out_indices=(0, 1, 2, 3), 16 | mlp_ratio=4, 17 | qkv_bias=True, 18 | drop_rate=0.0, 19 | attn_drop_rate=0.0, 20 | drop_path_rate=0.1), 21 | decode_head=dict( 22 | type='SegformerHead', 23 | in_channels=[32, 64, 160, 256], 24 | in_index=[0, 1, 2, 3], 25 | channels=256, 26 | dropout_ratio=0.1, 27 | num_classes=19, 28 | norm_cfg=norm_cfg, 29 | align_corners=False, 30 | loss_decode=dict( 31 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 32 | # model training and testing settings 33 | train_cfg=dict(), 34 | test_cfg=dict(mode='whole')) -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/upernet_convnext.py: -------------------------------------------------------------------------------- 1 | norm_cfg = dict(type='SyncBN', requires_grad=True) 2 | custom_imports = dict(imports='mmcls.models', allow_failed_imports=False) 3 | # checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-base_3rdparty_32xb128-noema_in1k_20220301-2a0ee547.pth' # noqa 4 | model = dict( 5 | type='EncoderDecoder', 6 | pretrained=None, 7 | backbone=dict( 8 | type='mmcls.ConvNeXt', 9 | arch='base', 10 | norm_cfg=dict(type='LN2dv2', eps=1e-6), 11 | out_indices=[0, 1, 2, 3], 12 | drop_path_rate=0.4, 13 | layer_scale_init_value=1.0, 14 | gap_before_final_norm=False, 15 | # init_cfg=dict( 16 | # type='Pretrained', checkpoint=checkpoint_file, 17 | # prefix='backbone.') 18 | ), 19 | decode_head=dict( 20 | type='UPerHead', 21 | in_channels=[128, 256, 512, 1024], 22 | in_index=[0, 1, 2, 3], 23 | pool_scales=(1, 2, 3, 6), 24 | channels=512, 25 | dropout_ratio=0.1, 26 | num_classes=19, 27 | norm_cfg=norm_cfg, 28 | align_corners=False, 29 | loss_decode=dict( 30 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 31 | auxiliary_head=dict( 32 | type='FCNHead', 33 | in_channels=384, 34 | in_index=2, 35 | channels=256, 36 | num_convs=1, 37 | concat_input=False, 38 | dropout_ratio=0.1, 39 | num_classes=19, 40 | norm_cfg=norm_cfg, 41 | align_corners=False, 42 | loss_decode=dict( 43 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 44 | # model training and testing settings 45 | train_cfg=dict(), 46 | test_cfg=dict(mode='whole')) 47 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/upernet_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 1, 1), 12 | strides=(1, 2, 2, 2), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | decode_head=dict( 18 | type='UPerHead', 19 | in_channels=[256, 512, 1024, 2048], 20 | in_index=[0, 1, 2, 3], 21 | 
pool_scales=(1, 2, 3, 6), 22 | channels=512, 23 | dropout_ratio=0.1, 24 | num_classes=19, 25 | norm_cfg=norm_cfg, 26 | align_corners=False, 27 | loss_decode=dict( 28 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 29 | auxiliary_head=dict( 30 | type='FCNHead', 31 | in_channels=1024, 32 | in_index=2, 33 | channels=256, 34 | num_convs=1, 35 | concat_input=False, 36 | dropout_ratio=0.1, 37 | num_classes=19, 38 | norm_cfg=norm_cfg, 39 | align_corners=False, 40 | loss_decode=dict( 41 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 42 | # model training and testing settings 43 | train_cfg=dict(), 44 | test_cfg=dict(mode='whole')) 45 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/models/upernet_swin.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | backbone_norm_cfg = dict(type='LN', requires_grad=True) 4 | model = dict( 5 | type='EncoderDecoder', 6 | pretrained=None, 7 | backbone=dict( 8 | type='SwinTransformer', 9 | pretrain_img_size=224, 10 | embed_dims=96, 11 | patch_size=4, 12 | window_size=7, 13 | mlp_ratio=4, 14 | depths=[2, 2, 6, 2], 15 | num_heads=[3, 6, 12, 24], 16 | strides=(4, 2, 2, 2), 17 | out_indices=(0, 1, 2, 3), 18 | qkv_bias=True, 19 | qk_scale=None, 20 | patch_norm=True, 21 | drop_rate=0., 22 | attn_drop_rate=0., 23 | drop_path_rate=0.3, 24 | use_abs_pos_embed=False, 25 | act_cfg=dict(type='GELU'), 26 | norm_cfg=backbone_norm_cfg), 27 | decode_head=dict( 28 | type='UPerHead', 29 | in_channels=[96, 192, 384, 768], 30 | in_index=[0, 1, 2, 3], 31 | pool_scales=(1, 2, 3, 6), 32 | channels=512, 33 | dropout_ratio=0.1, 34 | num_classes=19, 35 | norm_cfg=norm_cfg, 36 | align_corners=False, 37 | loss_decode=dict( 38 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 39 | auxiliary_head=dict( 40 | type='FCNHead', 41 | in_channels=384, 42 | in_index=2, 43 | channels=256, 44 | num_convs=1, 45 | concat_input=False, 46 | dropout_ratio=0.1, 47 | num_classes=19, 48 | norm_cfg=norm_cfg, 49 | align_corners=False, 50 | loss_decode=dict( 51 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), 52 | # model training and testing settings 53 | train_cfg=dict(), 54 | test_cfg=dict(mode='whole')) 55 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_160k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=160000) 8 | checkpoint_config = dict(by_epoch=False, interval=16000) 9 | evaluation = dict(interval=16000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_20k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=20000) 8 | 
checkpoint_config = dict(by_epoch=False, interval=2000) 9 | evaluation = dict(interval=2000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_320k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=320000) 8 | checkpoint_config = dict(by_epoch=False, interval=32000) 9 | evaluation = dict(interval=32000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_40k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=40000) 8 | checkpoint_config = dict(by_epoch=False, interval=4000) 9 | evaluation = dict(interval=4000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/_base_/schedules/schedule_80k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=80000) 8 | checkpoint_config = dict(by_epoch=False, interval=8000) 9 | evaluation = dict(interval=8000, metric='mIoU', pre_eval=True) 10 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/README.md: -------------------------------------------------------------------------------- 1 | # ADE20K 2 | 3 | Introduced by Zhou et al. in [Scene Parsing Through ADE20K Dataset](https://paperswithcode.com/paper/scene-parsing-through-ade20k-dataset). 4 | 5 | The ADE20K semantic segmentation dataset contains more than 20K scene-centric images exhaustively annotated with pixel-level object and object-part labels. There are 150 semantic categories in total, covering stuff classes such as sky, road, and grass, as well as discrete objects such as person, car, and bed.
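Based on the `data_root`, `img_dir`, and `ann_dir` values in `configs/_base_/datasets/ade20k.py`, the dataset is expected to be unpacked under `data/` as follows (a layout sketch inferred from that config, not an official listing):

```
data/ADEChallengeData2016/
├── images/
│   ├── training/
│   └── validation/
└── annotations/
    ├── training/
    └── validation/
```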
6 | 7 | 8 | ## Model Zoo 9 | 10 | ### UperNet + FlashInternImage 11 | 12 | 13 | | backbone | resolution | mIoU (ss/ms) | Config | Download | 14 | |:--------------:|:----------:|:-----------:|:-----------:|:----------:| 15 | | FlashInternImage-T | 512x512 | 49.3 / 50.3 | [config](./upernet_flash_internimage_t_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_t_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_t_512_160k_ade20k.log) | 16 | | FlashInternImage-S | 512x512 | 50.6 / 51.6 | [config](./upernet_flash_internimage_s_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_s_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_s_512_160k_ade20k.log) | 17 | | FlashInternImage-B | 512x512 | 52.0 / 52.6 | [config](./upernet_flash_internimage_b_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_b_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_s_512_160k_ade20k.log) | 18 | | FlashInternImage-L | 640x640 | 55.6 / 56.0 | [config](./upernet_flash_internimage_l_640_160k_ade20k.py)| [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_l_640_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_l_640_160k_ade20k.log) | 19 | 20 | - Training speed is measured on A100 GPUs. 21 | - Please set `with_cp=True` to save memory if you encounter out-of-memory issues (see the override sketch below). 22 | - The logs are from our recent retraining runs; the results may differ slightly from those reported in the paper.
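As a concrete example for the `with_cp` note above, any of these configs can be extended with a small override file, following the `_base_`-inheritance pattern used by the other configs in this repo (the file name below is hypothetical):

```python
# upernet_flash_internimage_t_512_160k_ade20k_cp.py (hypothetical override)
# Enable activation checkpointing in the backbone to reduce GPU memory usage,
# at the cost of recomputing activations during the backward pass.
_base_ = ['./upernet_flash_internimage_t_512_160k_ade20k.py']
model = dict(backbone=dict(with_cp=True))
```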
23 | 24 | 25 | ### Mask2Former + FlashInternImage 26 | 27 | | backbone | resolution | mIoU (ss) | Config | Download | 28 | |:--------------:|:----------:|:-----------:|:-----------:|:----------:| 29 | | FlashInternImage-T | 512x512 | 51.2 | [config](./mask2former_flash_internimage_t_512_160k_ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_t_512_160k_ade20k_ss.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_t_512_160k_ade20k_ss.log) | 30 | | FlashInternImage-S | 640x640 | 52.2 | [config](./mask2former_flash_internimage_s_640_160k_ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_s_640_160k_ade20k_ss.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_s_640_160k_ade20k_ss.log) | 31 | | FlashInternImage-B | 640x640 | 53.4 | [config](./mask2former_flash_internimage_b_640_160k_ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_b_640_160k_ade20k_ss.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_b_640_160k_ade20k_ss.log) | 32 | | FlashInternImage-L | 640x640 | 56.7 | [config](./mask2former_flash_internimage_l_640_160k_ade20k_ss.py)| [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_l_640_160k_ade20k_ss.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_l_640_160k_ade20k_ss.log) | 33 | 34 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/upernet_flash_internimage_b_512_160k_ade20k.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | _base_ = [ 7 | '../_base_/models/upernet_r50.py', '../_base_/datasets/ade20k.py', 8 | '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' 9 | ] 10 | pretrained = 'https://huggingface.co/OpenGVLab/DCNv4/resolve/main/flash_intern_image_b_1k_224.pth' 11 | model = dict( 12 | backbone=dict( 13 | _delete_=True, 14 | type='FlashInternImage', 15 | core_op='DCNv4', 16 | channels=112, 17 | depths=[4, 4, 21, 4], 18 | groups=[7, 14, 28, 56], 19 | mlp_ratio=4., 20 | drop_path_rate=0.3, 21 | norm_layer='LN', 22 | layer_scale=1.0, 23 | offset_scale=0.5, 24 | post_norm=True, 25 | with_cp=False, 26 | dw_kernel_size=3, 27 | out_indices=(0, 1, 2, 3), 28 | init_cfg=dict(type='Pretrained', checkpoint=pretrained)), 29 | decode_head=dict(num_classes=150, in_channels=[112, 224, 448, 896]), 30 | auxiliary_head=dict(num_classes=150, in_channels=448), 31 | test_cfg=dict(mode='whole') 32 | ) 33 | img_norm_cfg = dict( 34 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 35 | test_pipeline = [ 36 | dict(type='LoadImageFromFile'), 37 | dict( 38 | type='MultiScaleFlipAug', 39 | img_scale=(2048, 512), 40 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 41 | flip=False, 42 | transforms=[ 43 | dict(type='Resize', keep_ratio=True), 44 | dict(type='ResizeToMultiple', size_divisor=32), 45 | dict(type='RandomFlip'), 46 | dict(type='Normalize', **img_norm_cfg), 47 | dict(type='ImageToTensor', keys=['img']), 48 | dict(type='Collect', keys=['img']), 49 | ]) 50
| ] 51 | optimizer = dict( 52 | _delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05, 53 | constructor='CustomLayerDecayOptimizerConstructor', 54 | paramwise_cfg=dict(num_layers=33, layer_decay_rate=1.0, 55 | depths=[4, 4, 21, 4])) 56 | lr_config = dict(_delete_=True, policy='poly', 57 | warmup='linear', 58 | warmup_iters=1500, 59 | warmup_ratio=1e-6, 60 | power=1.0, min_lr=0.0, by_epoch=False) 61 | # By default, models are trained on 8 GPUs with 2 images per GPU 62 | data=dict(samples_per_gpu=2, 63 | val=dict(pipeline=test_pipeline), 64 | test=dict(pipeline=test_pipeline)) 65 | runner = dict(type='IterBasedRunner') 66 | checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1) 67 | evaluation = dict(interval=16000, metric='mIoU', save_best='mIoU') 68 | # fp16 = dict(loss_scale=dict(init_scale=512)) 69 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/upernet_flash_internimage_l_640_160k_ade20k.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | _base_ = [ 7 | '../_base_/models/upernet_r50.py', '../_base_/datasets/ade20k.py', 8 | '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' 9 | ] 10 | pretrained = 'https://huggingface.co/OpenGVLab/DCNv4/resolve/main/flash_intern_image_l_22k_384.pth' 11 | model = dict( 12 | backbone=dict( 13 | _delete_=True, 14 | type='FlashInternImage', 15 | core_op='DCNv4', 16 | channels=160, 17 | depths=[5, 5, 22, 5], 18 | groups=[10, 20, 40, 80], 19 | mlp_ratio=4., 20 | drop_path_rate=0.4, 21 | norm_layer='LN', 22 | layer_scale=1.0, 23 | offset_scale=2.0, 24 | post_norm=True, 25 | with_cp=False, 26 | dcn_output_bias=True, 27 | mlp_fc2_bias=True, 28 | dw_kernel_size=3, 29 | out_indices=(0, 1, 2, 3), 30 | init_cfg=dict(type='Pretrained', checkpoint=pretrained)), 31 | decode_head=dict(num_classes=150, in_channels=[160, 320, 640, 1280]), 32 | auxiliary_head=dict(num_classes=150, in_channels=640), 33 | test_cfg=dict(mode='whole')) 34 | img_norm_cfg = dict( 35 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 36 | crop_size = (640, 640) 37 | train_pipeline = [ 38 | dict(type='LoadImageFromFile'), 39 | dict(type='LoadAnnotations', reduce_zero_label=True), 40 | dict(type='Resize', img_scale=(2560, 640), ratio_range=(0.5, 2.0)), 41 | dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), 42 | dict(type='RandomFlip', prob=0.5), 43 | dict(type='PhotoMetricDistortion'), 44 | dict(type='Normalize', **img_norm_cfg), 45 | dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), 46 | dict(type='DefaultFormatBundle'), 47 | dict(type='Collect', keys=['img', 'gt_semantic_seg']), 48 | ] 49 | test_pipeline = [ 50 | dict(type='LoadImageFromFile'), 51 | dict( 52 | type='MultiScaleFlipAug', 53 | img_scale=(2560, 640), 54 | # img_ratios=[0.75, 1.0, 1.25], 55 | flip=False, 56 | transforms=[ 57 | dict(type='Resize', keep_ratio=True), 58 | dict(type='ResizeToMultiple', size_divisor=32), 59 | dict(type='RandomFlip'), 60 | dict(type='Normalize', **img_norm_cfg), 61 | dict(type='ImageToTensor', keys=['img']), 62 | dict(type='Collect', keys=['img']), 63 | ]) 64 | ] 65 | optimizer = dict( 66 | _delete_=True, type='AdamW', lr=0.00002, betas=(0.9, 0.999), 
weight_decay=0.05, 67 | constructor='CustomLayerDecayOptimizerConstructor', 68 | paramwise_cfg=dict(num_layers=37, layer_decay_rate=0.94, 69 | depths=[5, 5, 22, 5], offset_lr_scale=1.0)) 70 | lr_config = dict(_delete_=True, policy='poly', 71 | warmup='linear', 72 | warmup_iters=1500, 73 | warmup_ratio=1e-6, 74 | power=1.0, min_lr=0.0, by_epoch=False) 75 | # By default, models are trained on 8 GPUs with 2 images per GPU 76 | data = dict(samples_per_gpu=2, 77 | train=dict(pipeline=train_pipeline), 78 | val=dict(pipeline=test_pipeline), 79 | test=dict(pipeline=test_pipeline)) 80 | runner = dict(type='IterBasedRunner') 81 | optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2)) 82 | checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1) 83 | evaluation = dict(interval=16000, metric='mIoU', save_best='mIoU') 84 | # fp16 = dict(loss_scale=dict(init_scale=512)) 85 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/upernet_flash_internimage_s_512_160k_ade20k.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | _base_ = [ 7 | '../_base_/models/upernet_r50.py', '../_base_/datasets/ade20k.py', 8 | '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' 9 | ] 10 | pretrained = 'https://huggingface.co/OpenGVLab/DCNv4/resolve/main/flash_intern_image_s_1k_224.pth' 11 | model = dict( 12 | backbone=dict( 13 | _delete_=True, 14 | type='FlashInternImage', 15 | core_op='DCNv4', 16 | channels=80, 17 | depths=[4, 4, 21, 4], 18 | groups=[5, 10, 20, 40], 19 | mlp_ratio=4., 20 | drop_path_rate=0.3, 21 | norm_layer='LN', 22 | layer_scale=1.0, 23 | offset_scale=1.0, 24 | post_norm=True, 25 | with_cp=True, 26 | dw_kernel_size=3, 27 | out_indices=(0, 1, 2, 3), 28 | init_cfg=dict(type='Pretrained', checkpoint=pretrained)), 29 | decode_head=dict(num_classes=150, in_channels=[80, 160, 320, 640]), 30 | auxiliary_head=dict(num_classes=150, in_channels=320), 31 | test_cfg=dict(mode='whole') 32 | ) 33 | img_norm_cfg = dict( 34 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 35 | test_pipeline = [ 36 | dict(type='LoadImageFromFile'), 37 | dict( 38 | type='MultiScaleFlipAug', 39 | img_scale=(2048, 512), 40 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 41 | flip=False, 42 | transforms=[ 43 | dict(type='Resize', keep_ratio=True), 44 | dict(type='ResizeToMultiple', size_divisor=32), 45 | dict(type='RandomFlip'), 46 | dict(type='Normalize', **img_norm_cfg), 47 | dict(type='ImageToTensor', keys=['img']), 48 | dict(type='Collect', keys=['img']), 49 | ]) 50 | ] 51 | optimizer = dict( 52 | _delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05, 53 | constructor='CustomLayerDecayOptimizerConstructor', 54 | paramwise_cfg=dict(num_layers=33, layer_decay_rate=1.0, 55 | depths=[4, 4, 21, 4])) 56 | lr_config = dict(_delete_=True, policy='poly', 57 | warmup='linear', 58 | warmup_iters=1500, 59 | warmup_ratio=1e-6, 60 | power=1.0, min_lr=0.0, by_epoch=False) 61 | # By default, models are trained on 8 GPUs with 2 images per GPU 62 | data=dict(samples_per_gpu=2, 63 | val=dict(pipeline=test_pipeline), 64 | test=dict(pipeline=test_pipeline)) 65 | runner = dict(type='IterBasedRunner') 66 | 
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1) 67 | evaluation = dict(interval=16000, metric='mIoU', save_best='mIoU') 68 | # fp16 = dict(loss_scale=dict(init_scale=512)) 69 | -------------------------------------------------------------------------------- /segmentation/configs/ade20k/upernet_flash_internimage_t_512_160k_ade20k.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | _base_ = [ 7 | '../_base_/models/upernet_r50.py', '../_base_/datasets/ade20k.py', 8 | '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' 9 | ] 10 | pretrained = 'https://huggingface.co/OpenGVLab/DCNv4/resolve/main/flash_intern_image_t_1k_224.pth' 11 | model = dict( 12 | backbone=dict( 13 | _delete_=True, 14 | type='FlashInternImage', 15 | core_op='DCNv4', 16 | channels=64, 17 | depths=[4, 4, 18, 4], 18 | groups=[4, 8, 16, 32], 19 | mlp_ratio=4., 20 | drop_path_rate=0.2, 21 | norm_layer='LN', 22 | layer_scale=1.0, 23 | offset_scale=1.0, 24 | post_norm=False, 25 | with_cp=True, 26 | out_indices=(0, 1, 2, 3), 27 | init_cfg=dict(type='Pretrained', checkpoint=pretrained)), 28 | decode_head=dict(num_classes=150, in_channels=[64, 128, 256, 512]), 29 | auxiliary_head=dict(num_classes=150, in_channels=256), 30 | test_cfg=dict(mode='whole') 31 | ) 32 | img_norm_cfg = dict( 33 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 34 | test_pipeline = [ 35 | dict(type='LoadImageFromFile'), 36 | dict( 37 | type='MultiScaleFlipAug', 38 | img_scale=(2048, 512), 39 | # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], 40 | flip=False, 41 | transforms=[ 42 | dict(type='Resize', keep_ratio=True), 43 | dict(type='ResizeToMultiple', size_divisor=32), 44 | dict(type='RandomFlip'), 45 | dict(type='Normalize', **img_norm_cfg), 46 | dict(type='ImageToTensor', keys=['img']), 47 | dict(type='Collect', keys=['img']), 48 | ]) 49 | ] 50 | optimizer = dict( 51 | _delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05, 52 | constructor='CustomLayerDecayOptimizerConstructor', 53 | paramwise_cfg=dict(num_layers=30, layer_decay_rate=1.0, 54 | depths=[4, 4, 18, 4])) 55 | lr_config = dict(_delete_=True, policy='poly', 56 | warmup='linear', 57 | warmup_iters=1500, 58 | warmup_ratio=1e-6, 59 | power=1.0, min_lr=0.0, by_epoch=False) 60 | # By default, models are trained on 8 GPUs with 2 images per GPU 61 | data=dict(samples_per_gpu=2, 62 | # val=dict(pipeline=test_pipeline), 63 | # test=dict(pipeline=test_pipeline) 64 | ) 65 | runner = dict(type='IterBasedRunner') 66 | checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1) 67 | evaluation = dict(interval=16000, metric='mIoU', save_best='mIoU') 68 | # fp16 = dict(loss_scale=dict(init_scale=512)) 69 | -------------------------------------------------------------------------------- /segmentation/deploy/configs/_base_/backends/tensorrt.py: -------------------------------------------------------------------------------- 1 | backend_config = dict( 2 | type='tensorrt', common_config=dict(fp16_mode=False, max_workspace_size=0)) 3 | -------------------------------------------------------------------------------- /segmentation/deploy/configs/_base_/onnx_config.py: 
-------------------------------------------------------------------------------- 1 | onnx_config = dict( 2 | type='onnx', 3 | export_params=True, 4 | keep_initializers_as_inputs=False, 5 | opset_version=11, 6 | save_file='end2end.onnx', 7 | input_names=['input'], 8 | output_names=['output'], 9 | input_shape=None, 10 | optimize=True) 11 | -------------------------------------------------------------------------------- /segmentation/deploy/configs/mmseg/segmentation_static.py: -------------------------------------------------------------------------------- 1 | _base_ = ['../_base_/onnx_config.py'] 2 | codebase_config = dict(type='mmseg', task='Segmentation', with_argmax=True) 3 | -------------------------------------------------------------------------------- /segmentation/deploy/configs/mmseg/segmentation_tensorrt_static-512x512.py: -------------------------------------------------------------------------------- 1 | _base_ = ['./segmentation_static.py', '../_base_/backends/tensorrt.py'] 2 | 3 | onnx_config = dict(input_shape=[512, 512]) 4 | backend_config = dict( 5 | common_config=dict(max_workspace_size=1 << 30), 6 | model_inputs=[ 7 | dict( 8 | input_shapes=dict( 9 | input=dict( 10 | min_shape=[1, 3, 512, 512], 11 | opt_shape=[1, 3, 512, 512], 12 | max_shape=[1, 3, 512, 512]))) 13 | ]) 14 | -------------------------------------------------------------------------------- /segmentation/deploy/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/DCNv4/4b848f7dd7da74ff03f7d278f902c6fd05b391b5/segmentation/deploy/demo.png -------------------------------------------------------------------------------- /segmentation/dist_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | CHECKPOINT=$2 5 | GPUS=$3 6 | PORT=${PORT:-29510} 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 10 | -------------------------------------------------------------------------------- /segmentation/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONFIG=$1 4 | GPUS=$2 5 | PORT=${PORT:-29300} 6 | 7 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 8 | python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ 9 | $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} 10 | -------------------------------------------------------------------------------- /segmentation/image_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
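# Illustrative usage (the checkpoint filename is a placeholder; `img` may be
# a single image file or a directory of images, see main() below):
#   python image_demo.py demo.jpg \
#       configs/ade20k/upernet_flash_internimage_s_512_160k_ade20k.py \
#       flash_intern_image_s.pth --out demo --palette ade20k --opacity 0.5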
2 | from argparse import ArgumentParser
3 | 
4 | import mmcv
5 | 
6 | import mmcv_custom  # noqa: F401,F403
7 | import mmseg_custom  # noqa: F401,F403
8 | from mmseg.apis import inference_segmentor, init_segmentor, show_result_pyplot
9 | from mmseg.core.evaluation import get_palette
10 | from mmcv.runner import load_checkpoint
11 | from mmseg.core import get_classes
12 | import cv2
13 | import os.path as osp
14 | import os
15 | 
16 | 
17 | def test_single_image(model, img_name, out_dir, color_palette, opacity):
18 |     result = inference_segmentor(model, img_name)
19 | 
20 |     # show the results
21 |     if hasattr(model, 'module'):
22 |         model = model.module
23 |     img = model.show_result(img_name, result,
24 |                             palette=color_palette,
25 |                             show=False, opacity=opacity)
26 | 
27 |     # save the results
28 |     mmcv.mkdir_or_exist(out_dir)
29 |     out_path = osp.join(out_dir, osp.basename(img_name))
30 |     cv2.imwrite(out_path, img)
31 |     print(f"Result is saved at {out_path}")
32 | 
33 | 
34 | def main():
35 |     parser = ArgumentParser()
36 |     parser.add_argument('img', help='Image file or a directory containing images')
37 |     parser.add_argument('config', help='Config file')
38 |     parser.add_argument('checkpoint', help='Checkpoint file')
39 |     parser.add_argument('--out', type=str, default="demo", help='output directory')
40 |     parser.add_argument(
41 |         '--device', default='cuda:0', help='Device used for inference')
42 |     parser.add_argument(
43 |         '--palette',
44 |         default='ade20k',
45 |         choices=['ade20k', 'cityscapes', 'cocostuff'],
46 |         help='Color palette used for segmentation map')
47 |     parser.add_argument(
48 |         '--opacity',
49 |         type=float,
50 |         default=0.5,
51 |         help='Opacity of painted segmentation map. In (0, 1] range.')
52 |     args = parser.parse_args()
53 | 
54 |     # build the model from a config file and a checkpoint file
55 |     model = init_segmentor(args.config, checkpoint=None, device=args.device)
56 |     checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
57 |     if 'CLASSES' in checkpoint.get('meta', {}):
58 |         model.CLASSES = checkpoint['meta']['CLASSES']
59 |     else:
60 |         model.CLASSES = get_classes(args.palette)
61 | 
62 |     # check whether args.img is a directory or a single image
63 |     if osp.isdir(args.img):
64 |         for img in os.listdir(args.img):
65 |             test_single_image(model, osp.join(args.img, img), args.out, get_palette(args.palette), args.opacity)
66 |     else:
67 |         test_single_image(model, args.img, args.out, get_palette(args.palette), args.opacity)
68 | 
69 | if __name__ == '__main__':
70 |     main()
--------------------------------------------------------------------------------
/segmentation/mmcv_custom/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DCNv4
3 | # Copyright (c) 2024 OpenGVLab
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # --------------------------------------------------------
6 | 
7 | # -*- coding: utf-8 -*-
8 | from .custom_layer_decay_optimizer_constructor import CustomLayerDecayOptimizerConstructor
9 | from .layer_decay import LearningRateDecayOptimizerConstructor
10 | from .layer_decay_vit import LayerDecayOptimizerConstructor_vit
11 | __all__ = ['CustomLayerDecayOptimizerConstructor',
12 |            'LearningRateDecayOptimizerConstructor',
13 |            'LayerDecayOptimizerConstructor_vit']
14 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DCNv4
3 | # Copyright (c) 2024 OpenGVLab
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # --------------------------------------------------------
6 | 
7 | from .models import *  # noqa: F401,F403
8 | from .datasets import *  # noqa: F401,F403
9 | from .core import *  # noqa: F401,F403
10 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Shanghai AI Lab. All rights reserved.
2 | from mmseg.core.evaluation import *  # noqa: F401, F403
3 | from mmseg.core.seg import *  # noqa: F401, F403
4 | 
5 | from .anchor import *  # noqa: F401,F403
6 | from .box import *  # noqa: F401,F403
7 | from .evaluation import *  # noqa: F401,F403
8 | from .mask import *  # noqa: F401,F403
9 | from .utils import *  # noqa: F401, F403
10 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/anchor/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Shanghai AI Lab. All rights reserved.
2 | from .point_generator import MlvlPointGenerator  # noqa: F401,F403
3 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/anchor/builder.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
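# Illustrative note: PRIOR_GENERATORS and its alias ANCHOR_GENERATORS (both
# defined below) share one registry, so a hypothetical config such as
#   prior_cfg = dict(type='MlvlPointGenerator', strides=[4, 8, 16, 32])
# can be instantiated with build_prior_generator(prior_cfg);
# build_anchor_generator(prior_cfg) still works but emits a deprecation warning.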
2 | import warnings
3 | 
4 | from mmcv.utils import Registry, build_from_cfg
5 | 
6 | PRIOR_GENERATORS = Registry('Generator for anchors and points')
7 | 
8 | ANCHOR_GENERATORS = PRIOR_GENERATORS
9 | 
10 | 
11 | def build_prior_generator(cfg, default_args=None):
12 |     return build_from_cfg(cfg, PRIOR_GENERATORS, default_args)
13 | 
14 | 
15 | def build_anchor_generator(cfg, default_args=None):
16 |     warnings.warn(
17 |         '``build_anchor_generator`` is deprecated, please use '
18 |         '``build_prior_generator`` instead')
19 |     return build_prior_generator(cfg, default_args=default_args)
20 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/box/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Shanghai AI Lab. All rights reserved.
2 | from .builder import *  # noqa: F401,F403
3 | from .samplers import MaskPseudoSampler  # noqa: F401,F403
4 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/box/builder.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from mmcv.utils import Registry, build_from_cfg
3 | 
4 | BBOX_SAMPLERS = Registry('bbox_sampler')
5 | BBOX_CODERS = Registry('bbox_coder')
6 | 
7 | 
8 | def build_sampler(cfg, **default_args):
9 |     """Builder of box sampler."""
10 |     return build_from_cfg(cfg, BBOX_SAMPLERS, default_args)
11 | 
12 | 
13 | def build_bbox_coder(cfg, **default_args):
14 |     """Builder of box coder."""
15 |     return build_from_cfg(cfg, BBOX_CODERS, default_args)
16 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/box/samplers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Shanghai AI Lab. All rights reserved.
2 | from .mask_pseudo_sampler import MaskPseudoSampler  # noqa: F401,F403
3 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/box/samplers/mask_pseudo_sampler.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | """copy from
3 | https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py."""
4 | 
5 | import torch
6 | 
7 | from ..builder import BBOX_SAMPLERS
8 | from .base_sampler import BaseSampler
9 | from .mask_sampling_result import MaskSamplingResult
10 | 
11 | 
12 | @BBOX_SAMPLERS.register_module()
13 | class MaskPseudoSampler(BaseSampler):
14 |     """A pseudo sampler that does not actually do any sampling."""
15 |     def __init__(self, **kwargs):
16 |         pass
17 | 
18 |     def _sample_pos(self, **kwargs):
19 |         """Sample positive samples."""
20 |         raise NotImplementedError
21 | 
22 |     def _sample_neg(self, **kwargs):
23 |         """Sample negative samples."""
24 |         raise NotImplementedError
25 | 
26 |     def sample(self, assign_result, masks, gt_masks, **kwargs):
27 |         """Directly returns the positive and negative indices of samples.
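        Positive samples are the proposals matched to a ground-truth instance
        (``assign_result.gt_inds > 0``); negatives are those assigned to
        background (``assign_result.gt_inds == 0``), as computed below.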
28 | 
29 |         Args:
30 |             assign_result (:obj:`AssignResult`): Assigned results
31 |             masks (torch.Tensor): Predicted masks
32 |             gt_masks (torch.Tensor): Ground truth masks
33 |         Returns:
34 |             :obj:`SamplingResult`: sampler results
35 |         """
36 |         pos_inds = torch.nonzero(assign_result.gt_inds > 0,
37 |                                  as_tuple=False).squeeze(-1).unique()
38 |         neg_inds = torch.nonzero(assign_result.gt_inds == 0,
39 |                                  as_tuple=False).squeeze(-1).unique()
40 |         gt_flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8)
41 |         sampling_result = MaskSamplingResult(pos_inds, neg_inds, masks,
42 |                                              gt_masks, assign_result, gt_flags)
43 |         return sampling_result
44 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/box/samplers/mask_sampling_result.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | """copy from
3 | https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py."""
4 | 
5 | import torch
6 | 
7 | from .sampling_result import SamplingResult
8 | 
9 | 
10 | class MaskSamplingResult(SamplingResult):
11 |     """Mask sampling result."""
12 |     def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result,
13 |                  gt_flags):
14 |         self.pos_inds = pos_inds
15 |         self.neg_inds = neg_inds
16 |         self.pos_masks = masks[pos_inds]
17 |         self.neg_masks = masks[neg_inds]
18 |         self.pos_is_gt = gt_flags[pos_inds]
19 | 
20 |         self.num_gts = gt_masks.shape[0]
21 |         self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
22 | 
23 |         if gt_masks.numel() == 0:
24 |             # hack for index error case
25 |             assert self.pos_assigned_gt_inds.numel() == 0
26 |             self.pos_gt_masks = torch.empty_like(gt_masks)
27 |         else:
28 |             self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :]
29 | 
30 |         if assign_result.labels is not None:
31 |             self.pos_gt_labels = assign_result.labels[pos_inds]
32 |         else:
33 |             self.pos_gt_labels = None
34 | 
35 |     @property
36 |     def masks(self):
37 |         """torch.Tensor: concatenated positive and negative masks"""
38 |         return torch.cat([self.pos_masks, self.neg_masks])
39 | 
40 |     def __nice__(self):
41 |         data = self.info.copy()
42 |         data['pos_masks'] = data.pop('pos_masks').shape
43 |         data['neg_masks'] = data.pop('neg_masks').shape
44 |         parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())]
45 |         body = '    ' + ',\n    '.join(parts)
46 |         return '{\n' + body + '\n}'
47 | 
48 |     @property
49 |     def info(self):
50 |         """Returns a dictionary of info about the object."""
51 |         return {
52 |             'pos_inds': self.pos_inds,
53 |             'neg_inds': self.neg_inds,
54 |             'pos_masks': self.pos_masks,
55 |             'neg_masks': self.neg_masks,
56 |             'pos_is_gt': self.pos_is_gt,
57 |             'num_gts': self.num_gts,
58 |             'pos_assigned_gt_inds': self.pos_assigned_gt_inds,
59 |         }
60 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Shanghai AI Lab. All rights reserved.
2 | from .panoptic_utils import INSTANCE_OFFSET  # noqa: F401,F403
3 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/evaluation/panoptic_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | # A custom value to distinguish instance ID and category ID; it needs to
3 | # be greater than the number of categories.
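# For example, with INSTANCE_OFFSET = 1000 as set below, the third instance
# (ins_id = 2) of category 17 encodes to pan_id = 2 * 1000 + 17 = 2017; a
# pixel decodes back via cat_id = pan_id % INSTANCE_OFFSET and
# ins_id = pan_id // INSTANCE_OFFSET.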
4 | # For a pixel in the panoptic result map:
5 | # pan_id = ins_id * INSTANCE_OFFSET + cat_id
6 | INSTANCE_OFFSET = 1000
7 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/mask/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Shanghai AI Lab. All rights reserved.
2 | from .utils import mask2bbox  # noqa: F401,F403
3 | 
--------------------------------------------------------------------------------
/segmentation/mmseg_custom/core/mask/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import mmcv
3 | import numpy as np
4 | import pycocotools.mask as mask_util
5 | import torch
6 | 
7 | 
8 | def split_combined_polys(polys, poly_lens, polys_per_mask):
9 |     """Split the combined 1-D polys into masks.
10 | 
11 |     A mask is represented as a list of polys, and a poly is represented as
12 |     a 1-D array. In dataset, all masks are concatenated into a single 1-D
13 |     tensor. Here we need to split the tensor into original representations.
14 | 
15 |     Args:
16 |         polys (list): a list (length = image num) of 1-D tensors
17 |         poly_lens (list): a list (length = image num) of poly length
18 |         polys_per_mask (list): a list (length = image num) of poly number
19 |             of each mask
20 | 
21 |     Returns:
22 |         list: a list (length = image num) of list (length = mask num) of \
23 |             list (length = poly num) of numpy array.
24 |     """
25 |     mask_polys_list = []
26 |     for img_id in range(len(polys)):
27 |         polys_single = polys[img_id]
28 |         polys_lens_single = poly_lens[img_id].tolist()
29 |         polys_per_mask_single = polys_per_mask[img_id].tolist()
30 | 
31 |         split_polys = mmcv.slice_list(polys_single, polys_lens_single)
32 |         mask_polys = mmcv.slice_list(split_polys, polys_per_mask_single)
33 |         mask_polys_list.append(mask_polys)
34 |     return mask_polys_list
35 | 
36 | 
37 | # TODO: move this function to a more proper place
38 | def encode_mask_results(mask_results):
39 |     """Encode bitmap mask to RLE code.
40 | 
41 |     Args:
42 |         mask_results (list | tuple[list]): bitmap mask results.
43 |             In mask scoring rcnn, mask_results is a tuple of (segm_results,
44 |             segm_cls_score).
45 | 
46 |     Returns:
47 |         list | tuple: RLE encoded mask.
48 |     """
49 |     if isinstance(mask_results, tuple):  # mask scoring
50 |         cls_segms, cls_mask_scores = mask_results
51 |     else:
52 |         cls_segms = mask_results
53 |     num_classes = len(cls_segms)
54 |     encoded_mask_results = [[] for _ in range(num_classes)]
55 |     for i in range(len(cls_segms)):
56 |         for cls_segm in cls_segms[i]:
57 |             encoded_mask_results[i].append(
58 |                 mask_util.encode(
59 |                     np.array(
60 |                         cls_segm[:, :, np.newaxis], order='F',
61 |                         dtype='uint8'))[0])  # encoded with RLE
62 |     if isinstance(mask_results, tuple):
63 |         return encoded_mask_results, cls_mask_scores
64 |     else:
65 |         return encoded_mask_results
66 | 
67 | 
68 | def mask2bbox(masks):
69 |     """Obtain tight bounding boxes of binary masks.
70 | 
71 |     Args:
72 |         masks (Tensor): Binary mask of shape (n, h, w).
73 | 
74 |     Returns:
75 |         Tensor: Bbox with shape (n, 4) of \
76 |             positive region in binary mask.
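        Example (illustrative): for a single 4x4 mask whose ones occupy rows
        1-2 and columns 1-2, ``x`` and ``y`` below are both [1, 2], so the
        returned box is [1., 1., 3., 3.] in (x1, y1, x2, y2) form with
        exclusive end coordinates (hence the ``+ 1``).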
77 | """ 78 | N = masks.shape[0] 79 | bboxes = masks.new_zeros((N, 4), dtype=torch.float32) 80 | x_any = torch.any(masks, dim=1) 81 | y_any = torch.any(masks, dim=2) 82 | for i in range(N): 83 | x = torch.where(x_any[i, :])[0] 84 | y = torch.where(y_any[i, :])[0] 85 | if len(x) > 0 and len(y) > 0: 86 | bboxes[i, :] = bboxes.new_tensor( 87 | [x[0], y[0], x[-1] + 1, y[-1] + 1]) 88 | 89 | return bboxes 90 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dist_utils import (DistOptimizerHook, all_reduce_dict, allreduce_grads, 3 | reduce_mean) 4 | from .misc import add_prefix, multi_apply 5 | 6 | __all__ = [ 7 | 'add_prefix', 'multi_apply', 'DistOptimizerHook', 'allreduce_grads', 8 | 'all_reduce_dict', 'reduce_mean' 9 | ] 10 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/core/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | def multi_apply(func, *args, **kwargs): 3 | """Apply function to a list of arguments. 4 | 5 | Note: 6 | This function applies the ``func`` to multiple inputs and 7 | map the multiple outputs of the ``func`` into different 8 | list. Each list contains the same type of outputs corresponding 9 | to different inputs. 10 | 11 | Args: 12 | func (Function): A function that will be applied to a list of 13 | arguments 14 | 15 | Returns: 16 | tuple(list): A tuple containing multiple list, each list contains \ 17 | a kind of returned results by the function 18 | """ 19 | pfunc = partial(func, **kwargs) if kwargs else func 20 | map_results = map(pfunc, *args) 21 | return tuple(map(list, zip(*map_results))) 22 | 23 | 24 | def add_prefix(inputs, prefix): 25 | """Add prefix for dict. 26 | 27 | Args: 28 | inputs (dict): The input dict with str keys. 29 | prefix (str): The prefix to add. 30 | 31 | Returns: 32 | 33 | dict: The dict with keys updated with ``prefix``. 34 | """ 35 | 36 | outputs = dict() 37 | for name, value in inputs.items(): 38 | outputs[f'{prefix}.{name}'] = value 39 | 40 | return outputs 41 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .mapillary import MapillaryDataset # noqa: F401,F403 3 | from .nyu_depth_v2 import NYUDepthV2Dataset # noqa: F401,F403 4 | from .pipelines import * # noqa: F401,F403 5 | from .dataset_wrappers import ConcatDataset 6 | 7 | 8 | __all__ = [ 9 | 'MapillaryDataset', 'NYUDepthV2Dataset', 'ConcatDataset' 10 | ] -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/mapillary.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from mmseg.datasets.builder import DATASETS 7 | from mmseg.datasets.custom import CustomDataset 8 | 9 | 10 | @DATASETS.register_module() 11 | class MapillaryDataset(CustomDataset): 12 | """Mapillary dataset. 13 | """ 14 | CLASSES = ('Bird', 'Ground Animal', 'Curb', 'Fence', 'Guard Rail', 'Barrier', 15 | 'Wall', 'Bike Lane', 'Crosswalk - Plain', 'Curb Cut', 'Parking', 'Pedestrian Area', 16 | 'Rail Track', 'Road', 'Service Lane', 'Sidewalk', 'Bridge', 'Building', 'Tunnel', 17 | 'Person', 'Bicyclist', 'Motorcyclist', 'Other Rider', 'Lane Marking - Crosswalk', 18 | 'Lane Marking - General', 'Mountain', 'Sand', 'Sky', 'Snow', 'Terrain', 'Vegetation', 19 | 'Water', 'Banner', 'Bench', 'Bike Rack', 'Billboard', 'Catch Basin', 'CCTV Camera', 20 | 'Fire Hydrant', 'Junction Box', 'Mailbox', 'Manhole', 'Phone Booth', 'Pothole', 21 | 'Street Light', 'Pole', 'Traffic Sign Frame', 'Utility Pole', 'Traffic Light', 22 | 'Traffic Sign (Back)', 'Traffic Sign (Front)', 'Trash Can', 'Bicycle', 'Boat', 23 | 'Bus', 'Car', 'Caravan', 'Motorcycle', 'On Rails', 'Other Vehicle', 'Trailer', 24 | 'Truck', 'Wheeled Slow', 'Car Mount', 'Ego Vehicle', 'Unlabeled') 25 | 26 | PALETTE = [[165, 42, 42], [0, 192, 0], [196, 196, 196], [190, 153, 153], 27 | [180, 165, 180], [90, 120, 150], [102, 102, 156], [128, 64, 255], 28 | [140, 140, 200], [170, 170, 170], [250, 170, 160], [96, 96, 96], 29 | [230, 150, 140], [128, 64, 128], [110, 110, 110], [244, 35, 232], 30 | [150, 100, 100], [70, 70, 70], [150, 120, 90], [220, 20, 60], 31 | [255, 0, 0], [255, 0, 100], [255, 0, 200], [200, 128, 128], 32 | [255, 255, 255], [64, 170, 64], [230, 160, 50], [70, 130, 180], 33 | [190, 255, 255], [152, 251, 152], [107, 142, 35], [0, 170, 30], 34 | [255, 255, 128], [250, 0, 30], [100, 140, 180], [220, 220, 220], 35 | [220, 128, 128], [222, 40, 40], [100, 170, 30], [40, 40, 40], 36 | [33, 33, 33], [100, 128, 160], [142, 0, 0], [70, 100, 150], 37 | [210, 170, 100], [153, 153, 153], [128, 128, 128], [0, 0, 80], 38 | [250, 170, 30], [192, 192, 192], [220, 220, 0], [140, 140, 20], 39 | [119, 11, 32], [150, 0, 255], [0, 60, 100], [0, 0, 142], [0, 0, 90], 40 | [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], [0, 0, 70], 41 | [0, 0, 192], [32, 32, 32], [120, 10, 10], [0, 0, 0]] 42 | 43 | def __init__(self, **kwargs): 44 | super(MapillaryDataset, self).__init__( 45 | img_suffix='.jpg', 46 | seg_map_suffix='.png', 47 | reduce_zero_label=False, 48 | **kwargs) -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/nyu_depth_v2.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under 
The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | from mmseg.datasets.builder import DATASETS 7 | from mmseg.datasets.custom import CustomDataset 8 | 9 | 10 | @DATASETS.register_module() 11 | class NYUDepthV2Dataset(CustomDataset): 12 | """NYU Depth V2 dataset. 13 | """ 14 | 15 | CLASSES = ('wall', 'floor', 'cabinet', 'bed', 'chair', 16 | 'sofa', 'table', 'door', 'window', 'bookshelf', 17 | 'picture', 'counter', 'blinds', 'desk', 'shelves', 18 | 'curtain', 'dresser', 'pillow', 'mirror', 'floor mat', 19 | 'clothes', 'ceiling', 'books', 'refridgerator', 'television', 20 | 'paper', 'towel', 'shower curtain', 'box', 'whiteboard', 21 | 'person', 'night stand', 'toilet', 'sink', 'lamp', 22 | 'bathtub', 'bag', 'otherstructure', 'otherfurniture', 'otherprop') 23 | 24 | 25 | PALETTE = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], 26 | [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], 27 | [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], 28 | [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], 29 | [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], 30 | [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], 31 | [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], 32 | [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], 33 | [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], 34 | [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],] 35 | 36 | def __init__(self, split, **kwargs): 37 | super(NYUDepthV2Dataset, self).__init__( 38 | img_suffix='.png', 39 | seg_map_suffix='.png', 40 | split=split, 41 | reduce_zero_label=True, 42 | **kwargs) 43 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .formatting import DefaultFormatBundle, ToMask 3 | from .transform import MapillaryHack, PadShortSide, SETR_Resize 4 | 5 | __all__ = [ 6 | 'DefaultFormatBundle', 'ToMask', 'SETR_Resize', 7 | 'PadShortSide', 'MapillaryHack' 8 | ] 9 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/datasets/pipelines/formatting.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import numpy as np 3 | from mmcv.parallel import DataContainer as DC 4 | from mmseg.datasets.builder import PIPELINES 5 | from mmseg.datasets.pipelines.formatting import to_tensor 6 | 7 | 8 | @PIPELINES.register_module(force=True) 9 | class DefaultFormatBundle(object): 10 | """Default formatting bundle. 11 | 12 | It simplifies the pipeline of formatting common fields, including "img" 13 | and "gt_semantic_seg". These fields are formatted as follows. 14 | 15 | - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) 16 | - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, 17 | (3)to DataContainer (stack=True) 18 | """ 19 | def __call__(self, results): 20 | """Call function to transform and format common fields in results. 21 | 22 | Args: 23 | results (dict): Result dict contains the data to convert. 24 | 25 | Returns: 26 | dict: The result dict contains the data that is formatted with 27 | default bundle. 
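            For example, an (H, W, 3) uint8 image comes back as a (3, H, W)
            tensor wrapped in a DataContainer with stack=True, and an (H, W)
            'gt_semantic_seg' map as a (1, H, W) int64 tensor.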
28 | """ 29 | 30 | if 'img' in results: 31 | img = results['img'] 32 | if len(img.shape) < 3: 33 | img = np.expand_dims(img, -1) 34 | img = np.ascontiguousarray(img.transpose(2, 0, 1)) 35 | results['img'] = DC(to_tensor(img), stack=True) 36 | if 'gt_semantic_seg' in results: 37 | # convert to long 38 | results['gt_semantic_seg'] = DC(to_tensor( 39 | results['gt_semantic_seg'][None, ...].astype(np.int64)), 40 | stack=True) 41 | if 'gt_masks' in results: 42 | results['gt_masks'] = DC(to_tensor(results['gt_masks'])) 43 | if 'gt_labels' in results: 44 | results['gt_labels'] = DC(to_tensor(results['gt_labels'])) 45 | 46 | return results 47 | 48 | def __repr__(self): 49 | return self.__class__.__name__ 50 | 51 | 52 | @PIPELINES.register_module() 53 | class ToMask(object): 54 | """Transfer gt_semantic_seg to binary mask and generate gt_labels.""" 55 | def __init__(self, ignore_index=255): 56 | self.ignore_index = ignore_index 57 | 58 | def __call__(self, results): 59 | gt_semantic_seg = results['gt_semantic_seg'] 60 | gt_labels = np.unique(gt_semantic_seg) 61 | # remove ignored region 62 | gt_labels = gt_labels[gt_labels != self.ignore_index] 63 | 64 | gt_masks = [] 65 | for class_id in gt_labels: 66 | gt_masks.append(gt_semantic_seg == class_id) 67 | 68 | if len(gt_masks) == 0: 69 | # Some image does not have annotation (all ignored) 70 | gt_masks = np.empty((0, ) + results['pad_shape'][:-1], dtype=np.int64) 71 | gt_labels = np.empty((0, ), dtype=np.int64) 72 | else: 73 | gt_masks = np.asarray(gt_masks, dtype=np.int64) 74 | gt_labels = np.asarray(gt_labels, dtype=np.int64) 75 | 76 | results['gt_labels'] = gt_labels 77 | results['gt_masks'] = gt_masks 78 | return results 79 | 80 | def __repr__(self): 81 | return self.__class__.__name__ + \ 82 | f'(ignore_index={self.ignore_index})' 83 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .backbones import * # noqa: F401,F403 8 | from .decode_heads import * # noqa: F401,F403 9 | from .losses import * # noqa: F401,F403 10 | from .plugins import * # noqa: F401,F403 11 | from .segmentors import * # noqa: F401,F403 12 | from .utils import * # noqa: F401,F403 -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # FlashInternImage 3 | # Copyright (c) 2023 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .flash_intern_image import FlashInternImage 8 | 9 | __all__ = ['FlashInternImage'] 10 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import warnings # noqa: F401,F403 3 | 4 | from mmcv.utils import Registry 5 | 6 | TRANSFORMER = Registry('Transformer') 7 | MASK_ASSIGNERS = Registry('mask_assigner') 8 | MATCH_COST = Registry('match_cost') 9 | 10 | 11 | def build_match_cost(cfg): 12 | """Build Match Cost.""" 13 | return MATCH_COST.build(cfg) 14 | 15 | 16 | def build_assigner(cfg): 17 | """Build Assigner.""" 18 | return MASK_ASSIGNERS.build(cfg) 19 | 20 | 21 | def build_transformer(cfg): 22 | """Build Transformer.""" 23 | return TRANSFORMER.build(cfg) 24 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .mask2former_head import Mask2FormerHead 3 | from .maskformer_head import MaskFormerHead 4 | from .msda import CustomMultiScaleDeformableAttention 5 | __all__ = [ 6 | 'MaskFormerHead', 7 | 'Mask2FormerHead', 8 | ] 9 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy, 3 | cross_entropy, mask_cross_entropy) 4 | from .dice_loss import DiceLoss 5 | from .focal_loss import FocalLoss 6 | from .match_costs import (ClassificationCost, CrossEntropyLossCost, DiceCost, 7 | MaskFocalLossCost) 8 | 9 | __all__ = [ 10 | 'cross_entropy', 'binary_cross_entropy', 'mask_cross_entropy', 11 | 'CrossEntropyLoss', 'DiceLoss', 'FocalLoss', 'ClassificationCost', 12 | 'MaskFocalLossCost', 'DiceCost', 'CrossEntropyLossCost' 13 | ] 14 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 2 | from .msdeformattn_pixel_decoder import MSDeformAttnPixelDecoder 3 | from .pixel_decoder import PixelDecoder, TransformerEncoderPixelDecoder 4 | 5 | __all__ = [ 6 | 'PixelDecoder', 'TransformerEncoderPixelDecoder', 7 | 'MSDeformAttnPixelDecoder' 8 | ] 9 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/segmentors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .encoder_decoder_mask2former import EncoderDecoderMask2Former 3 | from .encoder_decoder_mask2former_aug import EncoderDecoderMask2FormerAug 4 | 5 | __all__ = ['EncoderDecoderMask2Former', 'EncoderDecoderMask2FormerAug'] 6 | -------------------------------------------------------------------------------- /segmentation/mmseg_custom/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Shanghai AI Lab. All rights reserved. 
2 | from .assigner import MaskHungarianAssigner 3 | from .point_sample import get_uncertain_point_coords_with_randomness 4 | from .positional_encoding import (LearnedPositionalEncoding, 5 | SinePositionalEncoding) 6 | from .transformer import (DetrTransformerDecoder, DetrTransformerDecoderLayer, 7 | DynamicConv, Transformer) 8 | 9 | __all__ = [ 10 | 'DetrTransformerDecoderLayer', 'DetrTransformerDecoder', 'DynamicConv', 11 | 'Transformer', 'LearnedPositionalEncoding', 'SinePositionalEncoding', 12 | 'MaskHungarianAssigner', 'get_uncertain_point_coords_with_randomness' 13 | ] 14 | -------------------------------------------------------------------------------- /segmentation/ops_dcnv3/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch 8 | -------------------------------------------------------------------------------- /segmentation/ops_dcnv3/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # -------------------------------------------------------- 3 | # DCNv4 4 | # Copyright (c) 2024 OpenGVLab 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | python setup.py build install 9 | -------------------------------------------------------------------------------- /segmentation/ops_dcnv3/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .dcnv3 import DCNv3, DCNv3_pytorch -------------------------------------------------------------------------------- /segmentation/ops_dcnv3/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DCNv4 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import os 8 | import glob 9 | 10 | import torch 11 | 12 | from torch.utils.cpp_extension import CUDA_HOME 13 | from torch.utils.cpp_extension import CppExtension 14 | from torch.utils.cpp_extension import CUDAExtension 15 | 16 | from setuptools import find_packages 17 | from setuptools import setup 18 | 19 | requirements = ["torch", "torchvision"] 20 | 21 | 22 | def get_extensions(): 23 | this_dir = os.path.dirname(os.path.abspath(__file__)) 24 | extensions_dir = os.path.join(this_dir, "src") 25 | 26 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 27 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 28 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 29 | 30 | sources = main_file + source_cpu 31 | extension = CppExtension 32 | extra_compile_args = {"cxx": []} 33 | define_macros = [] 34 | 35 | if torch.cuda.is_available() and CUDA_HOME is not None: 36 | extension = CUDAExtension 37 | sources += source_cuda 38 | define_macros += [("WITH_CUDA", None)] 39 
| extra_compile_args["nvcc"] = [
40 |             # "-DCUDA_HAS_FP16=1",
41 |             # "-D__CUDA_NO_HALF_OPERATORS__",
42 |             # "-D__CUDA_NO_HALF_CONVERSIONS__",
43 |             # "-D__CUDA_NO_HALF2_OPERATORS__",
44 |         ]
45 |     else:
46 |         raise NotImplementedError('CUDA is not available')
47 | 
48 |     sources = [os.path.join(extensions_dir, s) for s in sources]
49 |     include_dirs = [extensions_dir]
50 |     ext_modules = [
51 |         extension(
52 |             "DCNv3",
53 |             sources,
54 |             include_dirs=include_dirs,
55 |             define_macros=define_macros,
56 |             extra_compile_args=extra_compile_args,
57 |         )
58 |     ]
59 |     return ext_modules
60 | 
61 | 
62 | setup(
63 |     name="DCNv3",
64 |     version="1.0",
65 |     author="InternImage",
66 |     url="https://github.com/OpenGVLab/InternImage",
67 |     description=
68 |     "PyTorch Wrapper for CUDA Functions of DCNv3",
69 |     packages=find_packages(exclude=(
70 |         "configs",
71 |         "tests",
72 |     )),
73 |     ext_modules=get_extensions(),
74 |     cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
75 | )
76 | 
--------------------------------------------------------------------------------
/segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * InternImage
4 | * Copyright (c) 2022 OpenGVLab
5 | * Licensed under The MIT License [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from
8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
9 | **************************************************************************************************
10 | */
11 | 
12 | #include <vector>
13 | 
14 | #include <ATen/ATen.h>
15 | #include <ATen/cuda/CUDAContext.h>
16 | 
17 | at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
18 |                              const at::Tensor &mask, const int kernel_h,
19 |                              const int kernel_w, const int stride_h,
20 |                              const int stride_w, const int pad_h,
21 |                              const int pad_w, const int dilation_h,
22 |                              const int dilation_w, const int group,
23 |                              const int group_channels, const float offset_scale,
24 |                              const int im2col_step) {
25 |     AT_ERROR("Not implemented on the CPU");
26 | }
27 | 
28 | std::vector<at::Tensor>
29 | dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
30 |                    const at::Tensor &mask, const int kernel_h,
31 |                    const int kernel_w, const int stride_h, const int stride_w,
32 |                    const int pad_h, const int pad_w, const int dilation_h,
33 |                    const int dilation_w, const int group,
34 |                    const int group_channels, const float offset_scale,
35 |                    const at::Tensor &grad_output, const int im2col_step) {
36 |     AT_ERROR("Not implemented on the CPU");
37 | }
38 | 
--------------------------------------------------------------------------------
/segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * InternImage
4 | * Copyright (c) 2022 OpenGVLab
5 | * Licensed under The MIT License [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from
8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
9 | **************************************************************************************************
10 | */
11 | 
12 | #pragma once
13 | #include <torch/extension.h>
14 | 
15 | at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
16 |                              const at::Tensor &mask, const int kernel_h,
17 |                              const int kernel_w, const int stride_h,
18 |                              const int stride_w, const int pad_h,
19 |                              const int pad_w, const int dilation_h,
20 |                              const int dilation_w, const int group,
21 |                              const int group_channels, const float offset_scale,
22 |                              const int im2col_step);
23 | 
24 | std::vector<at::Tensor>
25 | dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
26 |                    const at::Tensor &mask, const int kernel_h,
27 |                    const int kernel_w, const int stride_h, const int stride_w,
28 |                    const int pad_h, const int pad_w, const int dilation_h,
29 |                    const int dilation_w, const int group,
30 |                    const int group_channels, const float offset_scale,
31 |                    const at::Tensor &grad_output, const int im2col_step);
32 | 
--------------------------------------------------------------------------------
/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * InternImage
4 | * Copyright (c) 2022 OpenGVLab
5 | * Licensed under The MIT License [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from
8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
9 | **************************************************************************************************
10 | */
11 | 
12 | #pragma once
13 | #include <torch/extension.h>
14 | 
15 | at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
16 |                               const at::Tensor &mask, const int kernel_h,
17 |                               const int kernel_w, const int stride_h,
18 |                               const int stride_w, const int pad_h,
19 |                               const int pad_w, const int dilation_h,
20 |                               const int dilation_w, const int group,
21 |                               const int group_channels,
22 |                               const float offset_scale, const int im2col_step);
23 | 
24 | std::vector<at::Tensor>
25 | dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
26 |                     const at::Tensor &mask, const int kernel_h,
27 |                     const int kernel_w, const int stride_h, const int stride_w,
28 |                     const int pad_h, const int pad_w, const int dilation_h,
29 |                     const int dilation_w, const int group,
30 |                     const int group_channels, const float offset_scale,
31 |                     const at::Tensor &grad_output, const int im2col_step);
32 | 
--------------------------------------------------------------------------------
/segmentation/ops_dcnv3/src/dcnv3.h:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * InternImage
4 | * Copyright (c) 2022 OpenGVLab
5 | * Licensed under The MIT License [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from
8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
9 | **************************************************************************************************
10 | */
11 | 
12 | #pragma once
13 | 
14 | #include "cpu/dcnv3_cpu.h"
15 | 
16 | #ifdef WITH_CUDA
17 | #include "cuda/dcnv3_cuda.h"
18 | #endif
19 | 
20 | at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset,
21 |                          const at::Tensor &mask, const int kernel_h,
22 |                          const int kernel_w, const int stride_h,
23 |                          const int stride_w, const int pad_h, const int pad_w,
24 |                          const int dilation_h, const int dilation_w,
25 |                          const int group, const int group_channels,
26 |                          const float offset_scale, const int im2col_step) {
27 |     if (input.type().is_cuda()) {
28 | #ifdef WITH_CUDA
29 |         return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w,
30 |                                   stride_h, stride_w, pad_h, pad_w, dilation_h,
31 |                                   dilation_w, group, group_channels,
32 |                                   offset_scale, im2col_step);
33 | #else
34 |         AT_ERROR("Not compiled with GPU support");
35 | #endif
36 |     }
37 |     AT_ERROR("Not implemented on the CPU");
38 | }
39 | 
40 | std::vector<at::Tensor>
41 | dcnv3_backward(const at::Tensor &input, const at::Tensor &offset,
42 |                const at::Tensor &mask, const int kernel_h, const int kernel_w,
43 |                const int stride_h, const int stride_w, const int pad_h,
44 |                const int pad_w, const int dilation_h, const int dilation_w,
45 |                const int group, const int group_channels,
46 |                const float offset_scale, const at::Tensor &grad_output,
47 |                const int im2col_step) {
48 |     if (input.type().is_cuda()) {
49 | #ifdef WITH_CUDA
50 |         return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w,
51 |                                    stride_h, stride_w, pad_h, pad_w, dilation_h,
52 |                                    dilation_w, group, group_channels,
53 |                                    offset_scale, grad_output, im2col_step);
54 | #else
55 |         AT_ERROR("Not compiled with GPU support");
56 | #endif
57 |     }
58 |     AT_ERROR("Not implemented on the CPU");
59 | }
60 | 
--------------------------------------------------------------------------------
/segmentation/ops_dcnv3/src/vision.cpp:
--------------------------------------------------------------------------------
1 | /*!
2 | ************************************************************************************************** 3 | * InternImage 4 | * Copyright (c) 2022 OpenGVLab 5 | * Licensed under The MIT License [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from 8 | *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 9 | ************************************************************************************************** 10 | */ 11 | 12 | #include "dcnv3.h" 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 15 | m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward"); 16 | m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward"); 17 | } 18 | -------------------------------------------------------------------------------- /segmentation/slurm_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | CHECKPOINT=$4 9 | GPUS=${GPUS:-8} 10 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 12 | PY_ARGS=${@:5} 13 | SRUN_ARGS=${SRUN_ARGS:-""} 14 | 15 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | --quotatype=auto \ 24 | ${SRUN_ARGS} \ 25 | python -u test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} 26 | -------------------------------------------------------------------------------- /segmentation/slurm_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | CONFIG=$3 8 | GPUS=${GPUS:-8} 9 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 10 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 11 | SRUN_ARGS=${SRUN_ARGS:-""} 12 | PY_ARGS=${@:4} 13 | 14 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 15 | srun -p ${PARTITION} \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --quotatype=reserved \ 22 | --kill-on-bad-exit=1 \ 23 | ${SRUN_ARGS} \ 24 | python -u train.py ${CONFIG} --launcher="slurm" ${PY_ARGS} 25 | --------------------------------------------------------------------------------