├── mask2former ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── utils │ ├── __init__.py │ └── misc.py ├── modeling │ ├── backbone │ │ └── __init__.py │ ├── meta_arch │ │ ├── __init__.py │ │ └── mask_former_head.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ └── ops │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── src │ │ │ ├── vision.cpp │ │ │ ├── cuda │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_attn_cuda.cu │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ └── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn.h │ │ │ ├── setup.py │ │ │ └── test.py │ ├── transformer_decoder │ │ ├── __init__.py │ │ ├── position_encoding.py │ │ └── maskformer_transformer_decoder.py │ └── __init__.py ├── data │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ ├── mask_former_instance_dataset_mapper.py │ │ ├── mask_former_semantic_dataset_mapper.py │ │ └── coco_instance_new_baseline_dataset_mapper.py │ ├── __init__.py │ └── datasets │ │ ├── __init__.py │ │ ├── register_ade20k_instance.py │ │ └── register_coco_panoptic_annos_semseg.py ├── __init__.py ├── test_time_augmentation.py └── config.py ├── requirements.txt ├── .gitignore ├── minvis ├── utils │ ├── __init__.py │ └── lr_scheduler.py ├── data_video │ ├── datasets │ │ ├── ytvis_api │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── builtin.py │ ├── __init__.py │ └── augmentation.py ├── config.py ├── __init__.py └── video_mask2former_transformer_decoder.py ├── assets ├── demo.png └── teaser.png ├── mask2former_video ├── utils │ ├── __init__.py │ └── memory.py ├── data_video │ ├── datasets │ │ ├── ytvis_api │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── builtin.py │ ├── __init__.py │ └── augmentation.py ├── modeling │ ├── transformer_decoder │ │ ├── __init__.py │ │ └── position_encoding.py │ └── __init__.py ├── __init__.py └── config.py ├── uni_dvps ├── data_video │ ├── datasets │ │ ├── __init__.py │ │ ├── builtin.py │ │ ├── cityscapes_dvps.py │ │ └── semkitti_dvps.py │ ├── __init__.py │ ├── build.py │ └── augmentation.py ├── modeling │ ├── __init__.py │ └── meta_arch │ │ └── unified_decoder_head.py ├── __init__.py └── config.py ├── configs ├── CityscapesDVPS │ ├── swin │ │ └── swinL.yaml │ ├── base.yaml │ └── R50.yaml └── SemKITTIDVPS │ ├── R50.yaml │ └── base.yaml ├── demo └── demo.py ├── README.md └── train.py /mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | timm 2 | scipy 3 | einops 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | media/.DS_Store 3 | deprecated_media/ 4 | .idea/ 5 | -------------------------------------------------------------------------------- /minvis/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import add_deeplab_config, build_lr_scheduler -------------------------------------------------------------------------------- /assets/demo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kaist-ami/Uni-DVPS/HEAD/assets/demo.png -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaist-ami/Uni-DVPS/HEAD/assets/teaser.png -------------------------------------------------------------------------------- /mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former_video/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets 3 | -------------------------------------------------------------------------------- /minvis/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /mask2former_video/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /mask2former_video/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /uni_dvps/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import builtin # ensure the builtin datasets are registered 2 | 3 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 4 | -------------------------------------------------------------------------------- /uni_dvps/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .meta_arch.unified_decoder_head import UnifiedDecoderHead 2 | from .transformer_decoder.unified_transformer_decoder import UnifiedTransformerDecoder 3 | from .unidvps_model import UniDVPS 4 | -------------------------------------------------------------------------------- /uni_dvps/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset_mapper import CityscapesDVPSDatasetMapper, SemkittiDVPSDatasetMapper 2 | from .build import * 3 | 4 | from .datasets import * 5 | from .evaluator import CityscapesDVPSEvaluator, SemkittiDVPSEvaluator -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /uni_dvps/__init__.py: -------------------------------------------------------------------------------- 1 | # config 2 | from .config import add_uni_dvps_config 3 | from . import modeling 4 | from .data_video import ( 5 | CityscapesDVPSDatasetMapper, 6 | CityscapesDVPSEvaluator, 7 | SemkittiDVPSDatasetMapper, 8 | SemkittiDVPSEvaluator, 9 | build_hooks, 10 | run_step 11 | ) -------------------------------------------------------------------------------- /minvis/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | -------------------------------------------------------------------------------- /minvis/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /mask2former_video/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | -------------------------------------------------------------------------------- /mask2former_video/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /mask2former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | ) 11 | -------------------------------------------------------------------------------- /mask2former_video/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | from .criterion import VideoSetCriterion, calculate_uncertainty, sigmoid_ce_loss_jit, dice_loss_jit 4 | from .matcher import VideoHungarianMatcher, batch_sigmoid_ce_loss_jit, batch_dice_loss 5 | -------------------------------------------------------------------------------- /mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /mask2former_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import modeling 3 | 4 | # config 5 | from .config import add_maskformer2_video_config 6 | 7 | # models 8 | from .video_maskformer_model import VideoMaskFormer 9 | 10 | # video 11 | from .data_video import ( 12 | YTVISDatasetMapper, 13 | YTVISEvaluator, 14 | build_detection_train_loader, 15 | build_detection_test_loader, 16 | get_detection_dataset_dicts, 17 | ) 18 | -------------------------------------------------------------------------------- /mask2former_video/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_video_config(cfg): 7 | # video data 8 | # DataLoader 9 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 10 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 11 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 12 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 13 | -------------------------------------------------------------------------------- /minvis/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) 2021-2022, NVIDIA Corporation & Affiliates. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License-NC. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/MinVIS/blob/main/LICENSE 7 | 8 | # Copyright (c) Facebook, Inc. and its affiliates. 9 | from detectron2.config import CfgNode as CN 10 | 11 | 12 | def add_minvis_config(cfg): 13 | cfg.INPUT.SAMPLING_FRAME_RATIO = 1.0 14 | cfg.MODEL.MASK_FORMER.TEST.WINDOW_INFERENCE = False 15 | 16 | -------------------------------------------------------------------------------- /configs/CityscapesDVPS/swin/swinL.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../base.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "/local_data2/ryeon/mask2former/pre_weight/swin_large_patch4_window12_384_22k.pth" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 -------------------------------------------------------------------------------- /minvis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA Corporation & Affiliates. All rights reserved. 2 | # 3 | # This work is made available under the Nvidia Source Code License-NC. 4 | # To view a copy of this license, visit 5 | # https://github.com/NVlabs/MinVIS/blob/main/LICENSE 6 | 7 | # Copyright (c) Facebook, Inc. and its affiliates. 8 | 9 | # config 10 | from .config import add_minvis_config 11 | 12 | # models 13 | from .video_maskformer_model import VideoMaskFormer_frame 14 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder_frame 15 | 16 | # video 17 | from .data_video import ( 18 | YTVISDatasetMapper, 19 | YTVISEvaluator, 20 | build_detection_train_loader, 21 | build_detection_test_loader, 22 | get_detection_dataset_dicts, 23 | ) 24 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . 
import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | 25 | # evaluation 26 | from .evaluation.instance_evaluation import InstanceSegEvaluator 27 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /configs/CityscapesDVPS/base.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TEST: ("cityscapes_dvps_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 10000 23 | WARMUP_FACTOR: 1.0 24 | WEIGHT_DECAY: 0.05 25 | OPTIMIZER: "ADAMW" 26 | LR_SCHEDULER_NAME: "WarmupPolyLR" 27 | BACKBONE_MULTIPLIER: 0.1 28 | CLIP_GRADIENTS: 29 | ENABLED: True 30 | CLIP_TYPE: "full_model" 31 | CLIP_VALUE: 0.01 32 | NORM_TYPE: 2.0 33 | AMP: 34 | ENABLED: True 35 | INPUT: 36 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(8, 13)]"] 37 | MAX_SIZE_TRAIN: 4096 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TEST: 2048 41 | RANDOM_FLIP: "flip_by_clip" 42 | SIZE_DIVISIBILITY: -1 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute" 46 | SIZE: (1024, 2048) 47 | RESCALE: (1.0, 2.0) 48 | WITH_PAD: True 49 | SINGLE_CATEGORY_MAX_AREA: 1.0 50 | COLOR_AUG_SSD: True 51 | FORMAT: "RGB" 52 | TEST: 53 | EVAL_PERIOD: 500 54 | LOG_PERIOD: 50 55 | DATALOADER: 56 | FILTER_EMPTY_ANNOTATIONS: False 57 | NUM_WORKERS: 4 58 | VERSION: 2 59 | -------------------------------------------------------------------------------- /configs/CityscapesDVPS/R50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: base.yaml 2 | MODEL: 3 | # WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "UniDVPS" 5 | SEM_SEG_HEAD: 6 | NAME: "UnifiedDecoderHead" 7 | IGNORE_VALUE: 32000 8 | NUM_CLASSES: 19 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | UNIFIED_FORMER: 20 | TRANSFORMER_DECODER_NAME: "UnifiedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | MATCHER: "dvps_matcher" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 2.0 26 | MASK_WEIGHT: 5.0 27 
| DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 12544 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TEST: 42 | SEMANTIC_ON: False 43 | INSTANCE_ON: True 44 | PANOPTIC_ON: False 45 | OVERLAP_THRESHOLD: 0.8 46 | OBJECT_MASK_THRESHOLD: 0.8 47 | DEPTH_DIM: 256 48 | DEPTH_MAX: 80. 49 | SILOG_WEIGHT: 3.0 50 | REL_SQR_WEIGHT: 3.0 51 | REL_ABS_WEIGHT: 3.0 52 | INPUT: 53 | SAMPLING_FRAME_NUM: 1 54 | -------------------------------------------------------------------------------- /configs/SemKITTIDVPS/R50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: base.yaml 2 | MODEL: 3 | # WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "UniDVPS" 5 | SEM_SEG_HEAD: 6 | NAME: "UnifiedDecoderHead" 7 | IGNORE_VALUE: 255000 8 | NUM_CLASSES: 19 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | UNIFIED_FORMER: 20 | TRANSFORMER_DECODER_NAME: "UnifiedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | MATCHER: "dvps_matcher" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 2.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 491520 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TEST: 42 | SEMANTIC_ON: False 43 | INSTANCE_ON: True 44 | PANOPTIC_ON: False 45 | OVERLAP_THRESHOLD: 0.8 46 | OBJECT_MASK_THRESHOLD: 0.8 47 | DEPTH_DIM: 256 48 | DEPTH_MAX: 88. 
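  # DEPTH_MAX is the maximum depth value used by the depth head: these configs use
  # 80. for Cityscapes-DVPS and 88. for SemKITTI-DVPS (defaults live in uni_dvps/config.py).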
49 | SILOG_WEIGHT: 3.0 50 | REL_SQR_WEIGHT: 3.0 51 | REL_ABS_WEIGHT: 3.0 52 | INPUT: 53 | SAMPLING_FRAME_NUM: 1 54 | -------------------------------------------------------------------------------- /configs/SemKITTIDVPS/base.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TEST: ("semkitti_dvps_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 10000 23 | WARMUP_FACTOR: 1.0 24 | WEIGHT_DECAY: 0.05 25 | OPTIMIZER: "ADAMW" 26 | LR_SCHEDULER_NAME: "WarmupPolyLR" 27 | BACKBONE_MULTIPLIER: 0.1 28 | CLIP_GRADIENTS: 29 | ENABLED: True 30 | CLIP_TYPE: "full_model" 31 | CLIP_VALUE: 0.01 32 | NORM_TYPE: 2.0 33 | AMP: 34 | ENABLED: True 35 | INPUT: 36 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(8, 13)]"] 37 | MAX_SIZE_TRAIN: 1280 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | MIN_SIZE_TEST: 784 40 | MAX_SIZE_TEST: 1280 41 | RANDOM_FLIP: "flip_by_clip" 42 | SIZE_DIVISIBILITY: -1 43 | CROP: 44 | # ENABLED: True 45 | ENABLED: False 46 | TYPE: "absolute" 47 | SIZE: (384, 1280) 48 | RESCALE: (-1.0, -2.0) 49 | WITH_PAD: True 50 | SINGLE_CATEGORY_MAX_AREA: 1.0 51 | COLOR_AUG_SSD: True 52 | FORMAT: "RGB" 53 | TEST: 54 | EVAL_PERIOD: 500 55 | LOG_PERIOD: 50 56 | DATALOADER: 57 | FILTER_EMPTY_ANNOTATIONS: False 58 | NUM_WORKERS: 4 59 | VERSION: 2 60 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /mask2former_video/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import os 5 | 6 | from .ytvis import ( 7 | register_ytvis_instances, 8 | _get_ytvis_2019_instances_meta, 9 | _get_ytvis_2021_instances_meta, 10 | ) 11 | 12 | # ==== Predefined splits for YTVIS 2019 =========== 13 | _PREDEFINED_SPLITS_YTVIS_2019 = { 14 | "ytvis_2019_train": ("YouTubeVIS_2019/train/JPEGImages", 15 | "YouTubeVIS_2019/train.json"), 16 | "ytvis_2019_val": ("YouTubeVIS_2019/valid/JPEGImages", 17 | "YouTubeVIS_2019/valid.json"), 18 | "ytvis_2019_test": ("YouTubeVIS_2019/test/JPEGImages", 19 | "YouTubeVIS_2019/test.json"), 20 | } 21 | 22 | 23 | # ==== Predefined splits for YTVIS 2021 =========== 24 | _PREDEFINED_SPLITS_YTVIS_2021 = { 25 | "ytvis_2021_train": ("YouTubeVIS_2021/train/JPEGImages", 26 | "YouTubeVIS_2021/train.json"), 27 | "ytvis_2021_val": ("YouTubeVIS_2021/valid/JPEGImages", 28 | "YouTubeVIS_2021/valid.json"), 29 | "ytvis_2021_test": ("YouTubeVIS_2021/test/JPEGImages", 30 | "YouTubeVIS_2021/test.json"), 31 | } 32 | 33 | 34 | def register_all_ytvis_2019(root): 35 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): 36 | # Assume pre-defined datasets live in `./datasets`. 37 | register_ytvis_instances( 38 | key, 39 | _get_ytvis_2019_instances_meta(), 40 | os.path.join(root, json_file) if "://" not in json_file else json_file, 41 | os.path.join(root, image_root), 42 | ) 43 | 44 | 45 | def register_all_ytvis_2021(root): 46 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): 47 | # Assume pre-defined datasets live in `./datasets`. 
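        # The dataset root itself comes from the DETECTRON2_DATASETS environment
        # variable (resolved at the bottom of this file), falling back to "./datasets".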
48 | register_ytvis_instances( 49 | key, 50 | _get_ytvis_2021_instances_meta(), 51 | os.path.join(root, json_file) if "://" not in json_file else json_file, 52 | os.path.join(root, image_root), 53 | ) 54 | 55 | 56 | if __name__.endswith(".builtin"): 57 | # Assume pre-defined datasets live in `./datasets`. 58 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 59 | register_all_ytvis_2019(_root) 60 | register_all_ytvis_2021(_root) 61 | -------------------------------------------------------------------------------- /minvis/utils/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import math 3 | from typing import List 4 | import torch 5 | 6 | from detectron2.solver.lr_scheduler import _get_warmup_factor_at_iter 7 | 8 | # NOTE: PyTorch's LR scheduler interface uses names that assume the LR changes 9 | # only on epoch boundaries. We typically use iteration based schedules instead. 10 | # As a result, "epoch" (e.g., as in self.last_epoch) should be understood to mean 11 | # "iteration" instead. 12 | 13 | # FIXME: ideally this would be achieved with a CombinedLRScheduler, separating 14 | # MultiStepLR with WarmupLR but the current LRScheduler design doesn't allow it. 15 | 16 | 17 | class WarmupPolyLR(torch.optim.lr_scheduler._LRScheduler): 18 | """ 19 | Poly learning rate schedule used to train DeepLab. 20 | Paper: DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, 21 | Atrous Convolution, and Fully Connected CRFs. 22 | Reference: https://github.com/tensorflow/models/blob/21b73d22f3ed05b650e85ac50849408dd36de32e/research/deeplab/utils/train_utils.py#L337 # noqa 23 | """ 24 | 25 | def __init__( 26 | self, 27 | optimizer: torch.optim.Optimizer, 28 | max_iters: int, 29 | warmup_factor: float = 0.001, 30 | warmup_iters: int = 1000, 31 | warmup_method: str = "linear", 32 | last_epoch: int = -1, 33 | power: float = 0.9, 34 | constant_ending: float = 0.0, 35 | ): 36 | self.max_iters = max_iters 37 | self.warmup_factor = warmup_factor 38 | self.warmup_iters = warmup_iters 39 | self.warmup_method = warmup_method 40 | self.power = power 41 | self.constant_ending = constant_ending 42 | super().__init__(optimizer, last_epoch) 43 | 44 | def get_lr(self) -> List[float]: 45 | warmup_factor = _get_warmup_factor_at_iter( 46 | self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor 47 | ) 48 | if self.constant_ending > 0 and warmup_factor == 1.0: 49 | # Constant ending lr. 50 | if ( 51 | math.pow((1.0 - self.last_epoch / self.max_iters), self.power) 52 | < self.constant_ending 53 | ): 54 | return [base_lr * self.constant_ending for base_lr in self.base_lrs] 55 | return [ 56 | base_lr * warmup_factor * math.pow((1.0 - self.last_epoch / self.max_iters), self.power) 57 | for base_lr in self.base_lrs 58 | ] 59 | 60 | def _compute_values(self) -> List[float]: 61 | # The new interface 62 | return self.get_lr() 63 | -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 
5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /uni_dvps/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .cityscapes_dvps import ( 3 | register_cityscapes_dvps, 4 | _get_cityscapes_dvps_meta 5 | ) 6 | from .semkitti_dvps import ( 7 | register_semkitti_dvps, 8 | _get_semkitti_dvps_meta 9 | ) 10 | 11 | # ==== Predefined splits for Cityscpaes-DVPS =========== 12 | _PREDEFINED_SPLITS_CITYSCAPES_DVPS = { 13 | "cityscapes_dvps_val": ( 14 | "cityscapes-dvps/video_sequence/val", 15 | "cityscapes-dvps/video_sequence/val", 16 | "cityscapes-dvps/video_sequence/dvps_cityscapes_val.json", 17 | ), 18 | } 19 | 20 | # ==== Predefined splits for SemKITTI-DVPS =========== 21 | _PREDEFINED_SPLITS_SEM_KITTI = { 22 | "semkitti_dvps_val": ( 23 | "semkitti-dvps/video_sequence/val", 24 | "semkitti-dvps/video_sequence/val", 25 | "semkitti-dvps/video_sequence/dvps_semkitti_val.json" 26 | ), 27 | } 28 | 29 | def register_all_cityscapes_dvps(root): 30 | for key, (image_dir, gt_dir, gt_json) in _PREDEFINED_SPLITS_CITYSCAPES_DVPS.items(): 31 | image_dir = os.path.join(root, image_dir) 32 | gt_dir = os.path.join(root, gt_dir) 33 | gt_json = os.path.join(root, gt_json) 34 | 35 | register_cityscapes_dvps( 36 | key, 37 | _get_cityscapes_dvps_meta(), 38 | 
os.path.join(root, gt_json) if "://" not in gt_json else gt_json, 39 | os.path.join(root, image_dir), 40 | os.path.join(root, gt_dir), 41 | ) 42 | 43 | def register_all_sem_kitti(root): 44 | for key, (image_dir, gt_dir, gt_json) in _PREDEFINED_SPLITS_SEM_KITTI.items(): 45 | image_dir = os.path.join(root, image_dir) 46 | gt_dir = os.path.join(root, gt_dir) 47 | gt_json = os.path.join(root, gt_json) 48 | 49 | if "val" in key: 50 | for eval_frames in [1, 5, 10, 20]: 51 | new_key = key+str(eval_frames) 52 | register_semkitti_dvps( 53 | new_key, 54 | _get_semkitti_dvps_meta(), 55 | os.path.join(root, gt_json) if "://" not in gt_json else gt_json, 56 | os.path.join(root, image_dir), 57 | os.path.join(root, gt_dir), 58 | ) 59 | else: 60 | register_semkitti_dvps( 61 | key, 62 | _get_semkitti_dvps_meta(), 63 | os.path.join(root, gt_json) if "://" not in gt_json else gt_json, 64 | os.path.join(root, image_dir), 65 | os.path.join(root, gt_dir), 66 | ) 67 | 68 | 69 | if __name__.endswith(".builtin"): 70 | # Assume pre-defined datasets live in `./datasets`. 71 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 72 | register_all_cityscapes_dvps(_root) 73 | register_all_sem_kitti(_root) 74 | 75 | -------------------------------------------------------------------------------- /mask2former_video/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine3D(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
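    This 3D variant adds a temporal axis: it expects clip features of shape
    (b, t, c, h, w) and returns position embeddings in the same layout.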
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | # b, t, c, h, w 31 | assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead" 32 | if mask is None: 33 | mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool) 34 | not_mask = ~mask 35 | z_embed = not_mask.cumsum(1, dtype=torch.float32) 36 | y_embed = not_mask.cumsum(2, dtype=torch.float32) 37 | x_embed = not_mask.cumsum(3, dtype=torch.float32) 38 | if self.normalize: 39 | eps = 1e-6 40 | z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale 41 | y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale 42 | x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale 43 | 44 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 45 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 46 | 47 | dim_t_z = torch.arange((self.num_pos_feats * 2), dtype=torch.float32, device=x.device) 48 | dim_t_z = self.temperature ** (2 * (dim_t_z // 2) / (self.num_pos_feats * 2)) 49 | 50 | pos_x = x_embed[:, :, :, :, None] / dim_t 51 | pos_y = y_embed[:, :, :, :, None] / dim_t 52 | pos_z = z_embed[:, :, :, :, None] / dim_t_z 53 | pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 54 | pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 55 | pos_z = torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 56 | pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) # b, t, c, h, w 57 | return pos 58 | -------------------------------------------------------------------------------- /mask2former_video/utils/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import logging 4 | from contextlib import contextmanager 5 | from functools import wraps 6 | import torch 7 | from torch.cuda.amp import autocast 8 | 9 | __all__ = ["retry_if_cuda_oom"] 10 | 11 | 12 | @contextmanager 13 | def _ignore_torch_cuda_oom(): 14 | """ 15 | A context which ignores CUDA OOM exception from pytorch. 16 | """ 17 | try: 18 | yield 19 | except RuntimeError as e: 20 | # NOTE: the string may change? 21 | if "CUDA out of memory. " in str(e): 22 | pass 23 | else: 24 | raise 25 | 26 | 27 | def retry_if_cuda_oom(func): 28 | """ 29 | Makes a function retry itself after encountering 30 | pytorch's CUDA OOM error. 31 | It will first retry after calling `torch.cuda.empty_cache()`. 32 | If that still fails, it will then retry by trying to convert inputs to CPUs. 33 | In this case, it expects the function to dispatch to CPU implementation. 34 | The return values may become CPU tensors as well and it's user's 35 | responsibility to convert it back to CUDA tensor if needed. 36 | Args: 37 | func: a stateless callable that takes tensor-like objects as arguments 38 | Returns: 39 | a callable which retries `func` if OOM is encountered. 
40 | Examples: 41 | :: 42 | output = retry_if_cuda_oom(some_torch_function)(input1, input2) 43 | # output may be on CPU even if inputs are on GPU 44 | Note: 45 | 1. When converting inputs to CPU, it will only look at each argument and check 46 | if it has `.device` and `.to` for conversion. Nested structures of tensors 47 | are not supported. 48 | 2. Since the function might be called more than once, it has to be 49 | stateless. 50 | """ 51 | 52 | def maybe_to_cpu(x): 53 | try: 54 | like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") 55 | except AttributeError: 56 | like_gpu_tensor = False 57 | if like_gpu_tensor: 58 | return x.to(device="cpu").to(torch.float32) 59 | else: 60 | return x 61 | 62 | @wraps(func) 63 | def wrapped(*args, **kwargs): 64 | with _ignore_torch_cuda_oom(): 65 | return func(*args, **kwargs) 66 | 67 | # Clear cache and retry 68 | torch.cuda.empty_cache() 69 | with _ignore_torch_cuda_oom(): 70 | return func(*args, **kwargs) 71 | 72 | # Try on CPU. This slows down the code significantly, therefore print a notice. 73 | logger = logging.getLogger(__name__) 74 | logger.info("Attempting to copy inputs to CPU due to CUDA OOM") 75 | new_args = (maybe_to_cpu(x) for x in args) 76 | new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} 77 | with autocast(enabled=False): 78 | return func(*new_args, **new_kwargs) 79 | 80 | return wrapped 81 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 
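    # In practice: on a machine where torch.cuda.is_available() is False (e.g. a
    # CPU-only build node), export FORCE_CUDA=1 and make sure CUDA_HOME points at a
    # CUDA toolkit before running make.sh / `python setup.py build install`;
    # otherwise one of the NotImplementedError branches below is raised.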
40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /uni_dvps/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_uni_dvps_config(cfg): 7 | cfg.INPUT.ROTATE_AUG = 0. 8 | cfg.INPUT.DEPTH_BOUND = True 9 | 10 | cfg.OUTPUT_DIR = './output' 11 | cfg.TEST.LOG_PERIOD = 1 12 | cfg.SEED = 42 13 | cfg.EVAL_FRAMES = 0 14 | 15 | #PanopticDepth aug 16 | cfg.INPUT.CROP.WITH_PAD = True 17 | cfg.INPUT.CROP.RESCALE = (0.8, 1.2) 18 | 19 | # DEPTH_FORMER 20 | cfg.MODEL.DEPTH_FORMER = CN() 21 | cfg.MODEL.DEPTH_FORMER.DEPTH_DIM = 256 22 | cfg.MODEL.DEPTH_FORMER.DEPTH_MAX = 88. 23 | cfg.MODEL.DEPTH_FORMER.SILOG_WEIGHT = 1.0 24 | cfg.MODEL.DEPTH_FORMER.REL_SQR_WEIGHT = 1.0 25 | cfg.MODEL.DEPTH_FORMER.REL_ABS_WEIGHT = 1.0 26 | cfg.MODEL.DEPTH_FORMER.TRANSFORMER_DECODER_NAME = "VideoMultiScaleDepthTransformerDecoder_frame" 27 | 28 | # UNIFIED_FORMER 29 | cfg.MODEL.UNIFIED_FORMER = CN() 30 | 31 | # loss 32 | cfg.MODEL.UNIFIED_FORMER.DEEP_SUPERVISION = True 33 | cfg.MODEL.UNIFIED_FORMER.NO_OBJECT_WEIGHT = 0.1 34 | cfg.MODEL.UNIFIED_FORMER.CLASS_WEIGHT = 1.0 35 | cfg.MODEL.UNIFIED_FORMER.DICE_WEIGHT = 1.0 36 | cfg.MODEL.UNIFIED_FORMER.MASK_WEIGHT = 20.0 37 | 38 | # transformer config 39 | cfg.MODEL.UNIFIED_FORMER.NHEADS = 8 40 | cfg.MODEL.UNIFIED_FORMER.DROPOUT = 0.1 41 | cfg.MODEL.UNIFIED_FORMER.DIM_FEEDFORWARD = 2048 42 | cfg.MODEL.UNIFIED_FORMER.ENC_LAYERS = 0 43 | cfg.MODEL.UNIFIED_FORMER.DEC_LAYERS = 6 44 | cfg.MODEL.UNIFIED_FORMER.PRE_NORM = False 45 | 46 | cfg.MODEL.UNIFIED_FORMER.HIDDEN_DIM = 256 47 | cfg.MODEL.UNIFIED_FORMER.NUM_OBJECT_QUERIES = 100 48 | 49 | cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_IN_FEATURE = "res5" 50 | cfg.MODEL.UNIFIED_FORMER.ENFORCE_INPUT_PROJ = False 51 | 52 | cfg.MODEL.UNIFIED_FORMER.DEPTH_DIM = 256 53 | cfg.MODEL.UNIFIED_FORMER.DEPTH_MAX = 80. 
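    # NOTE: these depth-related values are only the defaults registered on the config
    # node; the dataset YAMLs set them explicitly (e.g. configs/*/R50.yaml use
    # SILOG_WEIGHT / REL_SQR_WEIGHT / REL_ABS_WEIGHT = 3.0 and DEPTH_MAX = 80. or 88.).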
54 | cfg.MODEL.UNIFIED_FORMER.SILOG_WEIGHT = 1.0 55 | cfg.MODEL.UNIFIED_FORMER.REL_SQR_WEIGHT = 1.0 56 | cfg.MODEL.UNIFIED_FORMER.REL_ABS_WEIGHT = 1.0 57 | 58 | cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_DECODER_NAME = "VideoMultiScaleMaskedTransformerDecoder_frame_unified_decoder" 59 | cfg.MODEL.UNIFIED_FORMER.SIZE_DIVISIBILITY = 32 60 | 61 | # UNIFIED_FORMER inference config 62 | cfg.MODEL.UNIFIED_FORMER.TEST = CN() 63 | cfg.MODEL.UNIFIED_FORMER.TEST.SEMANTIC_ON = True 64 | cfg.MODEL.UNIFIED_FORMER.TEST.INSTANCE_ON = False 65 | cfg.MODEL.UNIFIED_FORMER.TEST.PANOPTIC_ON = False 66 | cfg.MODEL.UNIFIED_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 67 | cfg.MODEL.UNIFIED_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 68 | cfg.MODEL.UNIFIED_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 69 | cfg.MODEL.UNIFIED_FORMER.TEST.WINDOW_INFERENCE = False 70 | 71 | # point loss configs 72 | # Number of points sampled during training for a mask point head. 73 | cfg.MODEL.UNIFIED_FORMER.TRAIN_NUM_POINTS = 112 * 112 74 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 75 | # original paper. 76 | cfg.MODEL.UNIFIED_FORMER.OVERSAMPLE_RATIO = 3.0 77 | # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in 78 | # the original paper. 79 | cfg.MODEL.UNIFIED_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 80 | 81 | # MATCHER 82 | cfg.MODEL.UNIFIED_FORMER.MATCHER = "video_depth_matcher" -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import multiprocessing as mp 4 | import os 5 | import time 6 | from torch.cuda.amp import autocast 7 | from detectron2.config import get_cfg 8 | from detectron2.data.detection_utils import read_image 9 | from detectron2.projects.deeplab import add_deeplab_config 10 | from detectron2.utils.logger import setup_logger 11 | 12 | from mask2former import add_maskformer2_config 13 | from mask2former_video import add_maskformer2_video_config 14 | from minvis import add_minvis_config 15 | from uni_dvps import add_uni_dvps_config 16 | from predictor import VisualizationDemo_Panoptic 17 | 18 | from tqdm import tqdm 19 | import warnings 20 | warnings.filterwarnings(action='ignore') 21 | 22 | def setup_cfg(args): 23 | # load config from file and command-line arguments 24 | cfg = get_cfg() 25 | add_deeplab_config(cfg) 26 | add_maskformer2_config(cfg) 27 | add_maskformer2_video_config(cfg) 28 | add_minvis_config(cfg) 29 | add_uni_dvps_config(cfg) 30 | cfg.merge_from_file(args.config_file) 31 | cfg.merge_from_list(args.opts) 32 | cfg.freeze() 33 | return cfg 34 | 35 | def get_parser(): 36 | parser = argparse.ArgumentParser(description="unidvps demo for builtin configs") 37 | parser.add_argument( 38 | "--config-file", 39 | default="configs/CityscapesDVPS/R50.yaml", 40 | metavar="FILE", 41 | help="path to config file", 42 | ) 43 | parser.add_argument( 44 | "--input", 45 | help="directory of input video frames", 46 | required=True, 47 | ) 48 | parser.add_argument( 49 | "--output", 50 | help="directory to save output frames", 51 | required=True, 52 | ) 53 | parser.add_argument( 54 | "--confidence-threshold", 55 | type=float, 56 | default=0.5, 57 | help="Minimum score for instance predictions to be shown", 58 | ) 59 | parser.add_argument( 60 | "--opts", 61 | help="Modify config options using the command-line 'KEY VALUE' pairs", 62 | default=[], 63 | nargs=argparse.REMAINDER, 
64 | ) 65 | return parser 66 | 67 | if __name__ == "__main__": 68 | mp.set_start_method("spawn", force=True) 69 | args = get_parser().parse_args() 70 | setup_logger(name="fvcore") 71 | logger = setup_logger() 72 | logger.info("Arguments: " + str(args)) 73 | 74 | cfg = setup_cfg(args) 75 | demo = VisualizationDemo_Panoptic(cfg) 76 | 77 | assert args.input and args.output 78 | 79 | video_root = args.input 80 | output_root = args.output 81 | 82 | os.makedirs(output_root, exist_ok=True) 83 | 84 | frames_path = video_root 85 | frames_path = glob.glob(os.path.expanduser(os.path.join(frames_path, '*.png'))) 86 | frames_path.sort() 87 | 88 | vid_frames = [] 89 | for path in frames_path: 90 | img = read_image(path, format="RGB") 91 | vid_frames.append(img) 92 | 93 | start_time = time.time() 94 | with autocast(): 95 | predictions, visualized_output, visualized_depth_output = demo.run_on_video(vid_frames) 96 | 97 | # save frames 98 | for path, _vis_output in zip(frames_path, tqdm(visualized_output, initial=1)): 99 | out_filename = os.path.join(output_root, os.path.basename(path).split('.png')[0]+'_seg.png') 100 | _vis_output.save(out_filename) 101 | print("Panoptic segmentation results are saved in {}".format(output_root)) 102 | 103 | for path, _vis_output in zip(frames_path, tqdm(visualized_depth_output, initial=1)): 104 | out_filename = os.path.join(output_root, os.path.basename(path).split('.png')[0]+'_depth.png') 105 | _vis_output.save(out_filename) 106 | print("Depth estimation results are saved in {}".format(output_root)) 107 | 108 | -------------------------------------------------------------------------------- /minvis/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA Corporation & Affiliates. All rights reserved. 2 | # 3 | # This work is made available under the Nvidia Source Code License-NC. 4 | # To view a copy of this license, visit 5 | # https://github.com/NVlabs/MinVIS/blob/main/LICENSE 6 | 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
8 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 9 | 10 | import os 11 | 12 | from .ytvis import ( 13 | register_ytvis_instances, 14 | _get_ytvis_2019_instances_meta, 15 | _get_ytvis_2021_instances_meta, 16 | _get_ovis_instances_meta, 17 | ) 18 | 19 | # ==== Predefined splits for YTVIS 2019 =========== 20 | _PREDEFINED_SPLITS_YTVIS_2019 = { 21 | "ytvis_2019_train": ("YouTubeVIS_2019/train/JPEGImages", 22 | "YouTubeVIS_2019/train.json"), 23 | "ytvis_2019_val": ("YouTubeVIS_2019/valid/JPEGImages", 24 | "YouTubeVIS_2019/valid.json"), 25 | "ytvis_2019_test": ("YouTubeVIS_2019/test/JPEGImages", 26 | "YouTubeVIS_2019/test.json"), 27 | } 28 | 29 | 30 | # ==== Predefined splits for YTVIS 2021 =========== 31 | _PREDEFINED_SPLITS_YTVIS_2021 = { 32 | "ytvis_2021_train": ("YouTubeVIS_2021/train/JPEGImages", 33 | "YouTubeVIS_2021/train.json"), 34 | "ytvis_2021_val": ("YouTubeVIS_2021/valid/JPEGImages", 35 | "YouTubeVIS_2021/valid.json"), 36 | "ytvis_2021_test": ("YouTubeVIS_2021/test/JPEGImages", 37 | "YouTubeVIS_2021/test.json"), 38 | } 39 | 40 | # ==== Predefined splits for OVIS =========== 41 | _PREDEFINED_SPLITS_OVIS = { 42 | "ovis_train": ("ovis/train", 43 | "ovis/annotations/annotations_train.json"), 44 | "ovis_val": ("ovis/valid", 45 | "ovis/annotations/annotations_valid.json"), 46 | "ovis_test": ("ovis/test", 47 | "ovis/annotations/annotations_test.json"), 48 | } 49 | 50 | 51 | def register_all_ytvis_2019(root): 52 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): 53 | # Assume pre-defined datasets live in `./datasets`. 54 | register_ytvis_instances( 55 | key, 56 | _get_ytvis_2019_instances_meta(), 57 | os.path.join(root, json_file) if "://" not in json_file else json_file, 58 | os.path.join(root, image_root), 59 | ) 60 | 61 | 62 | def register_all_ytvis_2021(root): 63 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): 64 | # Assume pre-defined datasets live in `./datasets`. 65 | register_ytvis_instances( 66 | key, 67 | _get_ytvis_2021_instances_meta(), 68 | os.path.join(root, json_file) if "://" not in json_file else json_file, 69 | os.path.join(root, image_root), 70 | ) 71 | 72 | 73 | def register_all_ovis(root): 74 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_OVIS.items(): 75 | # Assume pre-defined datasets live in `./datasets`. 76 | register_ytvis_instances( 77 | key, 78 | _get_ovis_instances_meta(), 79 | os.path.join(root, json_file) if "://" not in json_file else json_file, 80 | os.path.join(root, image_root), 81 | ) 82 | 83 | 84 | if __name__.endswith(".builtin"): 85 | # Assume pre-defined datasets live in `./datasets`. 86 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 87 | # register_all_ytvis_2019(_root) 88 | # register_all_ytvis_2021(_root) 89 | register_all_ovis(_root) 90 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return 
output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /uni_dvps/data_video/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from typing import Optional 5 | import os 6 | import torch 7 | import time 8 | 9 | from detectron2.engine import hooks 10 | from detectron2.utils import comm 11 | from fvcore.nn.precise_bn import get_bn_modules 12 | 13 | from detectron2.utils.events import ( 14 | EventWriter, 15 | get_event_storage, 16 | CommonMetricPrinter, 17 | JSONWriter, 18 | TensorboardXWriter 19 | ) 20 | 21 | 22 | def build_hooks(self): 23 | cfg = self.cfg.clone() 24 | cfg.defrost() 25 | cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN 26 | 27 | ret = [ 28 | hooks.IterationTimer(), 29 | hooks.LRScheduler(), 30 | hooks.PreciseBN( 31 | # Run at the same freq as (but before) evaluation. 32 | cfg.TEST.EVAL_PERIOD, 33 | self.model, 34 | # Build a new data loader to not affect training 35 | self.build_train_loader(cfg), 36 | cfg.TEST.PRECISE_BN.NUM_ITER, 37 | ) 38 | if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) 39 | else None, 40 | ] 41 | 42 | # Do PreciseBN before checkpointer, because it updates the model and need to 43 | # be saved by checkpointer. 44 | # This is not always the best: if checkpointing has a different frequency, 45 | # some checkpoints may have more precise statistics than others. 46 | if comm.is_main_process(): 47 | ret.append(hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD)) 48 | 49 | def test_and_save_results(): 50 | self._last_eval_results = self.test(self.cfg, self.model) 51 | return self._last_eval_results 52 | 53 | # Do evaluation after checkpointer, because then if it fails, 54 | # we can use the saved checkpoint to debug. 55 | ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) 56 | 57 | if comm.is_main_process(): 58 | # Here the default print/log frequency of each writer is used. 59 | # run writers in the end, so that evaluation metrics are written 60 | if cfg.OUTPUT_DIR: 61 | ret.append(hooks.PeriodicWriter(build_writers(cfg.OUTPUT_DIR, self.max_iter), period=cfg.TEST.LOG_PERIOD)) 62 | return ret 63 | 64 | def build_writers(output_dir: str, max_iter: Optional[int] = None): 65 | return [ 66 | # It may not always print what you want to see, since it prints "common" metrics only. 67 | CommonMetricPrinter(max_iter), 68 | JSONWriter(os.path.join(output_dir, "metrics.json")), 69 | # TensorboardXWriter(output_dir), 70 | # WAndBWriter() 71 | ] 72 | 73 | def run_step(self): 74 | """ 75 | Implement the AMP training logic. 76 | """ 77 | assert self.model.training, "[AMPTrainer] model was changed to eval mode!" 78 | assert torch.cuda.is_available(), "[AMPTrainer] CUDA is required for AMP training!" 
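# Standard AMP recipe below: run the forward pass under autocast, scale the loss via the GradScaler before backward, then step the optimizer and update the scaler.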
79 | from torch.cuda.amp import autocast 80 | 81 | start = time.perf_counter() 82 | data = next(self._data_loader_iter) 83 | data_time = time.perf_counter() - start 84 | 85 | with autocast(): 86 | loss_dict, image = self.model(data) 87 | if isinstance(loss_dict, torch.Tensor): 88 | losses = loss_dict 89 | loss_dict = {"total_loss": loss_dict} 90 | else: 91 | losses = sum(loss_dict.values()) 92 | 93 | self.optimizer.zero_grad() 94 | # depth.retain_grad() 95 | self.grad_scaler.scale(losses).backward() 96 | 97 | self._write_metrics(loss_dict, data_time) 98 | 99 | if isinstance(image, torch.Tensor): 100 | _log_images(image) 101 | 102 | self.grad_scaler.step(self.optimizer) 103 | self.grad_scaler.update() 104 | 105 | def _log_images(image): 106 | image_name = "depth" 107 | if comm.is_main_process(): 108 | storage = get_event_storage() 109 | storage.put_image(image_name, image) -------------------------------------------------------------------------------- /mask2former/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 
36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /mask2former/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 
7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* 
{gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /uni_dvps/data_video/datasets/cityscapes_dvps.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import io 3 | import json 4 | import logging 5 | import numpy as np 6 | import os 7 | import tqdm 8 | import pycocotools.mask as mask_util 9 | from fvcore.common.file_io import PathManager 10 | from fvcore.common.timer import Timer 11 | 12 | from detectron2.structures import Boxes, BoxMode, PolygonMasks 13 | from detectron2.data import DatasetCatalog, MetadataCatalog 14 | from detectron2.data.datasets.builtin_meta import CITYSCAPES_CATEGORIES 15 | 16 | """ 17 | This file contains functions to parse Cityscapes_DVPS dataset of 18 | COCO-format annotations into dicts in "Detectron2 format". 19 | """ 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | __all__ = ["load_cityscapes_dvps_json", "register_cityscapes_dvps"] 24 | 25 | def _get_cityscapes_dvps_meta(): 26 | thing_ids = [k["trainId"] for k in CITYSCAPES_CATEGORIES if k["isthing"] == 1] 27 | thing_classes = [k["name"] for k in CITYSCAPES_CATEGORIES if k["isthing"] == 1] 28 | thing_colors = [k["color"] for k in CITYSCAPES_CATEGORIES if k["isthing"] == 1] 29 | stuff_ids = [k["trainId"] for k in CITYSCAPES_CATEGORIES if k["isthing"] == 0] 30 | stuff_classes = [k["name"] for k in CITYSCAPES_CATEGORIES if k["isthing"] == 0] 31 | stuff_colors = [k["color"] for k in CITYSCAPES_CATEGORIES if k["isthing"] == 0] 32 | assert len(thing_ids) == 8, len(thing_ids) 33 | assert len(stuff_ids) == 11, len(stuff_ids) 34 | # Mapping from the incontiguous Cityscapes_DVPS category id to an id in [0, 10] 35 | thing_train_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 36 | stuff_train_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} 37 | 38 | ret = { 39 | "thing_ids": thing_ids, 40 | "thing_classes": thing_classes, 41 | "thing_colors": thing_colors, 42 | "thing_train_id_to_contiguous_id": thing_train_id_to_contiguous_id, 43 | "stuff_ids": stuff_ids, 44 | "stuff_classes": stuff_classes, 45 | "stuff_colors": stuff_colors, 46 | "stuff_train_id_to_contiguous_id": stuff_train_id_to_contiguous_id 47 | } 48 | return ret 49 | 50 | def load_cityscapes_dvps_json(gt_json, image_dir, gt_dir, meta, name): 51 | assert os.path.exists(gt_json), gt_json+" not exists" 52 | with open(gt_json) as f: 53 | file_dicts = json.load(f) 54 | 55 | dataset_dicts = [] 56 | for file_dict in file_dicts: 57 | if file_dict["image"].split("_")[1] == "000000": 58 | record = {} 59 | record["height"] = file_dict["height"] 60 | record["width"] = file_dict["width"] 61 | # record["length"] = 6 62 | record["video_id"] = file_dict["image"].split("_")[0] 63 | record["file_names"] = [os.path.join(image_dir, file_dict["image"])] 64 | record["seg_file_names"] = [os.path.join(gt_dir, file_dict["seg"])] 65 | record["depth_file_names"] = [os.path.join(gt_dir, file_dict["depth"])] 66 | 67 | dataset_dicts.append(record) 68 | else: 69 | video_id = file_dict["image"].split("_")[0] 70 | image_name = os.path.join(image_dir, file_dict["image"]) 71 | seg_gt_name = os.path.join(gt_dir, file_dict["seg"]) 72 | depth_gt_name = 
os.path.join(gt_dir, file_dict["depth"]) 73 | # video_idx = [i for i, dict in enumerate(dataset_dicts) if dict["video_id"] == video_id][0] 74 | 75 | video_idx = int(video_id) 76 | dataset_dicts[video_idx]["file_names"].append(image_name) 77 | dataset_dicts[video_idx]["seg_file_names"].append(seg_gt_name) 78 | dataset_dicts[video_idx]["depth_file_names"].append(depth_gt_name) 79 | 80 | logger.info("Loaded {} images from {}".format(len(file_dicts), image_dir)) 81 | return dataset_dicts 82 | 83 | 84 | def register_cityscapes_dvps(name, meta, gt_json, image_dir, gt_dir): 85 | """ 86 | Register a dataset in Cityscapes_DVPS's json annotation format for DVPS. 87 | """ 88 | assert isinstance(name, str), name 89 | assert isinstance(gt_json, (str, os.PathLike)), gt_json 90 | assert isinstance(image_dir, (str, os.PathLike)), image_dir 91 | assert isinstance(gt_dir, (str, os.PathLike)), gt_dir 92 | 93 | DatasetCatalog.register(name, lambda: load_cityscapes_dvps_json(gt_json, image_dir, gt_dir, meta, name)) 94 | MetadataCatalog.get(name).set( 95 | panoptic_root=gt_dir, 96 | image_root=image_dir, 97 | gt_dir=gt_dir, 98 | evaluator_type="cityscapes_dvps", 99 | # ignore_label=255, 100 | ignore_label=32, 101 | label_divisor=1000, 102 | **meta, 103 | ) 104 | -------------------------------------------------------------------------------- /mask2former/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_config(cfg): 7 | """ 8 | Add config for MASK_FORMER. 9 | """ 10 | # NOTE: configs from original maskformer 11 | # data config 12 | # select the dataset mapper 13 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" 14 | # Color augmentation 15 | cfg.INPUT.COLOR_AUG_SSD = False 16 | # We retry random cropping until no single category in semantic segmentation GT occupies more 17 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 18 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 19 | # Pad image and segmentation GT in dataset mapper. 
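# (A non-positive value is expected to disable this extra padding in the dataset mappers.)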
20 | cfg.INPUT.SIZE_DIVISIBILITY = -1 21 | 22 | # solver config 23 | # weight decay on embedding 24 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 25 | # optimizer 26 | cfg.SOLVER.OPTIMIZER = "ADAMW" 27 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 28 | 29 | # mask_former model config 30 | cfg.MODEL.MASK_FORMER = CN() 31 | 32 | # loss 33 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True 34 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 35 | cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0 36 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 37 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 38 | 39 | # transformer config 40 | cfg.MODEL.MASK_FORMER.NHEADS = 8 41 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 42 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 43 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 44 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 45 | cfg.MODEL.MASK_FORMER.PRE_NORM = False 46 | 47 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 48 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 49 | 50 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" 51 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False 52 | 53 | # mask_former inference config 54 | cfg.MODEL.MASK_FORMER.TEST = CN() 55 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 56 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False 57 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False 58 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 59 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 60 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 61 | 62 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) 63 | # you can use this config to override 64 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 65 | 66 | # pixel decoder config 67 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 68 | # adding transformer in pixel decoder 69 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 70 | # pixel decoder 71 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" 72 | 73 | # swin transformer backbone 74 | cfg.MODEL.SWIN = CN() 75 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 76 | cfg.MODEL.SWIN.PATCH_SIZE = 4 77 | cfg.MODEL.SWIN.EMBED_DIM = 96 78 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 79 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 80 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 81 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 82 | cfg.MODEL.SWIN.QKV_BIAS = True 83 | cfg.MODEL.SWIN.QK_SCALE = None 84 | cfg.MODEL.SWIN.DROP_RATE = 0.0 85 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 86 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 87 | cfg.MODEL.SWIN.APE = False 88 | cfg.MODEL.SWIN.PATCH_NORM = True 89 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 90 | cfg.MODEL.SWIN.USE_CHECKPOINT = False 91 | 92 | # NOTE: maskformer2 extra configs 93 | # transformer module 94 | cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder" 95 | 96 | # LSJ aug 97 | cfg.INPUT.IMAGE_SIZE = 1024 98 | cfg.INPUT.MIN_SCALE = 0.1 99 | cfg.INPUT.MAX_SCALE = 2.0 100 | 101 | # MSDeformAttn encoder configs 102 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 103 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 104 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 105 | 106 | # point loss configs 107 | # Number of points sampled during training for a mask point head. 108 | cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 109 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 110 | # original paper. 
111 | cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0 112 | # Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in 113 | # the original paper. 114 | cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 115 | -------------------------------------------------------------------------------- /uni_dvps/modeling/meta_arch/unified_decoder_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | from typing import Dict 4 | from torch import nn 5 | 6 | from detectron2.config import configurable 7 | from detectron2.layers import ShapeSpec 8 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 9 | 10 | from mask2former.modeling.pixel_decoder.fpn import build_pixel_decoder 11 | from ..transformer_decoder.unified_transformer_decoder import build_unified_transformer_decoder 12 | 13 | 14 | @SEM_SEG_HEADS_REGISTRY.register() 15 | class UnifiedDecoderHead(nn.Module): 16 | _version = 2 17 | 18 | def _load_from_state_dict( 19 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 20 | ): 21 | # version = local_metadata.get("version", None) 22 | # if version is None or version < 2: 23 | # Do not warn if training from scratch 24 | scratch = True 25 | logger = logging.getLogger(__name__) 26 | for k in list(state_dict.keys()): 27 | newk = k 28 | if "sem_seg_head" in k and k.startswith(prefix + "predictor"): 29 | newk = k.replace("predictor", "unified_decoder") 30 | if newk != k: 31 | state_dict[newk] = state_dict[k] 32 | del state_dict[k] 33 | scratch = False 34 | 35 | if not scratch: 36 | logger.warning( 37 | f"Weight format of {self.__class__.__name__} has changed! " 38 | "Please upgrade your models. Applying automatic conversion now ..."
39 | ) 40 | 41 | @configurable 42 | def __init__( 43 | self, 44 | input_shape: Dict[str, ShapeSpec], 45 | *, 46 | num_classes: int, 47 | pixel_decoder: nn.Module, 48 | loss_weight: float = 1.0, 49 | ignore_value: int = -1, 50 | # extra parameters 51 | unified_decoder: nn.Module, 52 | transformer_in_feature: str, 53 | ): 54 | super().__init__() 55 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 56 | self.in_features = [k for k, v in input_shape] 57 | feature_strides = [v.stride for k, v in input_shape] 58 | feature_channels = [v.channels for k, v in input_shape] 59 | self.num_classes = num_classes 60 | 61 | self.ignore_value = ignore_value 62 | self.common_stride = 4 63 | self.loss_weight = loss_weight 64 | 65 | self.pixel_decoder = pixel_decoder 66 | self.unified_decoder = unified_decoder 67 | self.transformer_in_feature = transformer_in_feature 68 | 69 | @classmethod 70 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 71 | # figure out in_channels to transformer predictor 72 | if cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 73 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 74 | elif cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 75 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 76 | elif cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for unidvps 77 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 78 | else: 79 | transformer_predictor_in_channels = input_shape[cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_IN_FEATURE].channels 80 | 81 | return { 82 | "input_shape": { 83 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 84 | }, 85 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 86 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 87 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 88 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 89 | "transformer_in_feature": cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_IN_FEATURE, 90 | "unified_decoder": build_unified_transformer_decoder( 91 | cfg, 92 | transformer_predictor_in_channels, 93 | mask_classification=True, 94 | ), 95 | } 96 | 97 | def forward(self, features, mask=None): 98 | return self.layers(features, mask) 99 | 100 | def layers(self, features, mask=None): 101 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) 102 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 103 | predictions = self.unified_decoder(multi_scale_features, mask_features, mask) 104 | 105 | return predictions -------------------------------------------------------------------------------- /mask2former/data/datasets/register_ade20k_instance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import json 3 | import logging 4 | import numpy as np 5 | import os 6 | from PIL import Image 7 | 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances 10 | from detectron2.utils.file_io import PathManager 11 | 12 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] 13 | 14 | 15 | _PREDEFINED_SPLITS = { 16 | # point annotations without masks 17 | "ade20k_instance_train": ( 18 
| "ADEChallengeData2016/images/training", 19 | "ADEChallengeData2016/ade20k_instance_train.json", 20 | ), 21 | "ade20k_instance_val": ( 22 | "ADEChallengeData2016/images/validation", 23 | "ADEChallengeData2016/ade20k_instance_val.json", 24 | ), 25 | } 26 | 27 | 28 | def _get_ade_instances_meta(): 29 | thing_ids = [k["id"] for k in ADE_CATEGORIES] 30 | assert len(thing_ids) == 100, len(thing_ids) 31 | # Mapping from the incontiguous ADE category id to an id in [0, 99] 32 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 33 | thing_classes = [k["name"] for k in ADE_CATEGORIES] 34 | ret = { 35 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 36 | "thing_classes": thing_classes, 37 | } 38 | return ret 39 | 40 | 41 | def register_all_ade20k_instance(root): 42 | for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): 43 | # Assume pre-defined datasets live in `./datasets`. 44 | register_coco_instances( 45 | key, 46 | _get_ade_instances_meta(), 47 | os.path.join(root, json_file) if "://" not in json_file else json_file, 48 | os.path.join(root, image_root), 49 | ) 50 | 51 | 52 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 53 | register_all_ade20k_instance(_root) 54 | -------------------------------------------------------------------------------- /mask2former/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import contextlib 3 | import copy 4 | import io 5 | import itertools 6 | import json 7 | import logging 8 | import numpy as np 9 | import os 10 | import pickle 11 | from collections import OrderedDict 12 | import pycocotools.mask as mask_util 13 | import torch 14 | from pycocotools.coco import COCO 15 | from pycocotools.cocoeval import COCOeval 16 | from tabulate import tabulate 17 | 18 | import detectron2.utils.comm as comm 19 | from detectron2.config import CfgNode 20 | from detectron2.data import MetadataCatalog 21 | from detectron2.data.datasets.coco import convert_to_coco_json 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 25 | from detectron2.utils.file_io import PathManager 26 | from detectron2.utils.logger import create_small_table 27 | 28 | 29 | # modified from COCOEvaluator for instance segmetnat 30 | class InstanceSegEvaluator(COCOEvaluator): 31 | """ 32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 33 | for keypoint detection outputs using COCO's metrics. 34 | See http://cocodataset.org/#detection-eval and 35 | http://cocodataset.org/#keypoints-eval to understand its metrics. 36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 37 | the metric cannot be computed (e.g. due to no predictions made). 38 | 39 | In addition to COCO, this evaluator is able to support any bounding box detection, 40 | instance segmentation, or keypoint detection dataset. 41 | """ 42 | 43 | def _eval_predictions(self, predictions, img_ids=None): 44 | """ 45 | Evaluate predictions. Fill self._results with the metrics of the tasks. 
46 | """ 47 | self._logger.info("Preparing results for COCO format ...") 48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 49 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 50 | 51 | # unmap the category ids for COCO 52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 55 | # num_classes = len(all_contiguous_ids) 56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 57 | 58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 59 | for result in coco_results: 60 | category_id = result["category_id"] 61 | # assert category_id < num_classes, ( 62 | # f"A prediction has class={category_id}, " 63 | # f"but the dataset only has {num_classes} classes and " 64 | # f"predicted class id should be in [0, {num_classes - 1}]." 65 | # ) 66 | assert category_id in reverse_id_mapping, ( 67 | f"A prediction has class={category_id}, " 68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 69 | ) 70 | result["category_id"] = reverse_id_mapping[category_id] 71 | 72 | if self._output_dir: 73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 74 | self._logger.info("Saving results to {}".format(file_path)) 75 | with PathManager.open(file_path, "w") as f: 76 | f.write(json.dumps(coco_results)) 77 | f.flush() 78 | 79 | if not self._do_evaluation: 80 | self._logger.info("Annotations are not available for evaluation.") 81 | return 82 | 83 | self._logger.info( 84 | "Evaluating predictions with {} COCO API...".format( 85 | "unofficial" if self._use_fast_impl else "official" 86 | ) 87 | ) 88 | for task in sorted(tasks): 89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 90 | coco_eval = ( 91 | _evaluate_predictions_on_coco( 92 | self._coco_api, 93 | coco_results, 94 | task, 95 | kpt_oks_sigmas=self._kpt_oks_sigmas, 96 | use_fast_impl=self._use_fast_impl, 97 | img_ids=img_ids, 98 | max_dets_per_image=self._max_dets_per_image, 99 | ) 100 | if len(coco_results) > 0 101 | else None # cocoapi does not handle empty results very well 102 | ) 103 | 104 | res = self._derive_coco_results( 105 | coco_eval, task, class_names=self._metadata.get("thing_classes") 106 | ) 107 | self._results[task] = res 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Uni-DVPS (RA-L 2024) 2 | ### [Project Page](https://jiyeon-k1m.github.io/uni-dvps) | [Paper](https://ieeexplore.ieee.org/document/10517661) 3 | This repository contains the official implementation of the RA-L 2024 paper, 4 | "Uni-DVPS: Unified Model for Depth-Aware Video Panoptic Segmentation". 5 | 6 | teaser 7 | 8 | ## Installation 9 | ### Requirements 10 | - Ubuntu 18.04 with Python 3.7 11 | - PyTorch 1.9.1 12 | - CUDA 11.1 13 | - Detectron2: [Detectron2 installation instruction](https://detectron2.readthedocs.io/en/latest/tutorials/install.html) 14 | - OpenCV is optional but needed for demo and visualization 15 | 16 | ### Environment setup 17 | `CUDA_HOME` must be defined and points to the directory of the installed CUDA toolkit. 
18 | ```shell 19 | conda create --name unidvps python=3.7 20 | conda activate unidvps 21 | 22 | # pytorch installation 23 | pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html 24 | 25 | # opencv installation 26 | pip install -U opencv-python 27 | 28 | # detectron2 installation 29 | git clone --recursive git@github.com:facebookresearch/detectron2.git 30 | cd detectron2 31 | git checkout 1315c8977e867b9979b4fa712638c1975ca8c78f 32 | pip install -e . 33 | 34 | pip install git+https://github.com/cocodataset/panopticapi.git 35 | pip install git+https://github.com/mcordts/cityscapesScripts.git 36 | 37 | cd .. 38 | git clone https://github.com/postech-ami/Uni-DVPS.git 39 | cd Uni-DVPS 40 | pip install -r requirements.txt 41 | ``` 42 | ### CUDA kernel for MSDeformAttn 43 | After preparing the required environment, run the following commands to compile the CUDA kernel for MSDeformAttn. 44 | 45 | ```shell 46 | cd mask2former/modeling/pixel_decoder/ops 47 | sh make.sh 48 | ``` 49 | 50 | ## Preparing data 51 | First, download the [Cityscapes-DVPS](https://github.com/joe-siyuan-qiao/ViP-DeepLab/blob/master/cityscapes-dvps/README.md) and [SemKITTI-DVPS](https://github.com/joe-siyuan-qiao/ViP-DeepLab/tree/master/semkitti-dvps) datasets. 52 | Then download the dataset JSON files from [this link](https://drive.google.com/drive/folders/1mVnO-bnwblx9sgPPqtfQ6_zMyky9GpC5?usp=sharing). 53 | The datasets are assumed to exist in a directory specified by the environment variable `DETECTRON2_DATASETS`. 54 | 55 | You can set the location of the dataset directory with ```export DETECTRON2_DATASETS=/path/to/dataset```. 56 | If it is unset, the default will be `./datasets` relative to your current working directory. 57 | 58 | We follow the format of Detectron2 Custom Datasets. Please refer to [this page](https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html) if you want to use your own dataset. 59 | 60 | ### Expected dataset structure 61 | ``` 62 | $DETECTRON2_DATASETS 63 | ├── cityscapes-dvps 64 | │ └── video_sequence 65 | │ ├── dvps_cityscapes_val.json 66 | │ └── val 67 | │ ├── 000000_000000_munster_000168_000004_leftImg8bit.png 68 | │ ├── 000000_000000_munster_000168_000004_gtFine_instanceTrainIds.png 69 | │ ├── 000000_000000_munster_000168_000004_depth.png 70 | │ └── ... 71 | └── semkitti-dvps 72 | └── video_sequence 73 | ├── dvps_semkitti_val.json 74 | └── val 75 | ├── 000008_000000_leftImg8bit.png 76 | ├── 000008_000000_gtFine_class.png 77 | ├── 000008_000000_gtFine_instance.png 78 | ├── 000008_000000_depth_707.0911865234375.png 79 | └── ... 80 | ``` 81 | 82 | ### Pretrained models 83 | The trained models are available for download at [this link](https://drive.google.com/drive/folders/1PCIoEbvv6U3Te2M3iZrp9ys2_kDa-Xvm?usp=sharing). 84 | 85 | ## Getting Started with Uni-DVPS 86 | ### Demo 87 | Visualize the results of video panoptic segmentation and depth estimation. 88 | ```shell 89 | python demo/demo.py \ 90 | --config-file configs/CityscapesDVPS/R50.yaml \ 91 | --input /path/to/video/frames \ 92 | --output /output/folder \ 93 | --opts MODEL.WEIGHTS /path/to/checkpoint_file 94 | ``` 95 | ![demo](assets/demo.png) 96 | 97 | ### Evaluation on Cityscapes-DVPS dataset 98 | You can evaluate the model on the Cityscapes-DVPS dataset.
99 | ```shell 100 | python train.py \ 101 | --config-file configs/CityscapesDVPS/R50.yaml \ 102 | --eval-only \ 103 | OUTPUT_DIR /output/folder \ 104 | MODEL.WEIGHTS /path/to/checkpoint_file 105 | ``` 106 | 107 | ### Evaluation on SemKITTI-DVPS dataset 108 | 109 | You can evaluate the model on the SemKITTI-DVPS dataset. 110 | Please set the argument `EVAL_FRAMES` to one of {1, 5, 10, 20}. 111 | 112 | ```shell 113 | python train.py \ 114 | --config-file configs/SemKITTIDVPS/R50.yaml \ 115 | --eval-only \ 116 | EVAL_FRAMES 5 \ 117 | OUTPUT_DIR /output/folder \ 118 | MODEL.WEIGHTS /path/to/checkpoint_file 119 | ``` 120 | ## Citation 121 | If you use Uni-DVPS in your research or wish to use our code, please consider citing: 122 | ``` 123 | @article{jiyeon2024unidvps, 124 | title={Uni-DVPS: Unified Model for Depth-Aware Video Panoptic Segmentation}, 125 | author={Ji-Yeon, Kim and Hyun-Bin, Oh and Byung-Ki, Kwon and Kim, Dahun and Kwon, Yongjin and Oh, Tae-Hyun}, 126 | journal={IEEE Robotics and Automation Letters}, 127 | year={2024}, 128 | publisher={IEEE} 129 | } 130 | ``` 131 | 132 | ## Acknowledgement 133 | The implementation of Uni-DVPS is largely adapted from [Mask2Former](https://github.com/facebookresearch/Mask2Former) and [MinVIS](https://github.com/NVlabs/MinVIS). 134 | The depth-aware video panoptic segmentation datasets are from [ViP-DeepLab](https://github.com/joe-siyuan-qiao/ViP-DeepLab). 135 | We would like to sincerely thank the authors for generously sharing their code and data. 136 | 137 | > This work was supported by Institute of Information & Communications Technology Planning & Evaluation (IITP) grant funded by the Korea government (MSIT) (No. 2020-0-00004, Development of Previsional Intelligence based on Long-term Visual Memory Network) and Institute of Information & Communications Technology Planning & Evaluation (IITP) grant funded by the Korea government (MSIT) (No. 2022-0-00290, Visual Intelligence for Space-Time Understanding and Generation based on Multi-layered Visual Common Sense). -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Uni-DVPS Training Script. 3 | This script is based on Mask2Former and MinVIS.
4 | """ 5 | import os 6 | import copy 7 | import itertools 8 | import logging 9 | import torch 10 | from collections import OrderedDict 11 | from typing import Any, Dict, List, Set 12 | 13 | # detectron2 14 | from detectron2.checkpoint import DetectionCheckpointer 15 | import detectron2.utils.comm as comm 16 | from detectron2.config import get_cfg 17 | from detectron2.engine import ( 18 | DefaultTrainer, 19 | default_argument_parser, 20 | default_setup, 21 | launch, 22 | ) 23 | from detectron2.evaluation import ( 24 | DatasetEvaluator, 25 | inference_on_dataset, 26 | print_csv_format, 27 | verify_results, 28 | ) 29 | from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler 30 | from detectron2.solver.build import maybe_add_gradient_clipping 31 | from detectron2.utils.logger import setup_logger 32 | 33 | # models 34 | from mask2former import add_maskformer2_config 35 | from mask2former_video import add_maskformer2_video_config 36 | from minvis import ( 37 | add_minvis_config, 38 | build_detection_test_loader, 39 | ) 40 | from uni_dvps import ( 41 | add_uni_dvps_config, 42 | CityscapesDVPSDatasetMapper, 43 | CityscapesDVPSEvaluator, 44 | SemkittiDVPSDatasetMapper, 45 | SemkittiDVPSEvaluator, 46 | ) 47 | 48 | import warnings 49 | warnings.filterwarnings(action='ignore') 50 | 51 | 52 | class Trainer(DefaultTrainer): 53 | @classmethod 54 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 55 | if output_folder is None: 56 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 57 | os.makedirs(output_folder, exist_ok=True) 58 | 59 | if "cityscapes" in dataset_name: 60 | evaluator = CityscapesDVPSEvaluator(dataset_name, output_folder) 61 | if "kitti" in dataset_name: 62 | evaluator = SemkittiDVPSEvaluator(dataset_name, output_folder, eval_frame=int(dataset_name.split('val')[1])) 63 | 64 | return evaluator 65 | 66 | @classmethod 67 | def build_test_loader(cls, cfg, dataset_name): 68 | dataset_name = cfg.DATASETS.TEST[0] 69 | if "cityscapes" in dataset_name: 70 | mapper = CityscapesDVPSDatasetMapper(cfg, is_train=False) 71 | if "kitti" in dataset_name: 72 | mapper = SemkittiDVPSDatasetMapper(cfg, is_train= False) 73 | return build_detection_test_loader(cfg, dataset_name, mapper=mapper) 74 | 75 | @classmethod 76 | def test(cls, cfg, model, evaluators=None, eval_frames=None): 77 | from torch.cuda.amp import autocast 78 | logger = logging.getLogger(__name__) 79 | 80 | if isinstance(evaluators, DatasetEvaluator): 81 | evaluators = [evaluators] 82 | if evaluators is not None: 83 | assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format( 84 | len(cfg.DATASETS.TEST), len(evaluators) 85 | ) 86 | 87 | results = OrderedDict() 88 | for idx, dataset_name in enumerate(cfg.DATASETS.TEST): 89 | data_loader = cls.build_test_loader(cfg, dataset_name) 90 | if evaluators is not None: 91 | evaluator = evaluators[idx] 92 | else: 93 | try: 94 | evaluator = cls.build_evaluator(cfg, dataset_name) 95 | except NotImplementedError: 96 | logger.warn( 97 | "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, " 98 | "or implement its `build_evaluator` method." 99 | ) 100 | results[dataset_name] = {} 101 | continue 102 | with autocast(): 103 | results_i = inference_on_dataset(model, data_loader, evaluator) 104 | results[dataset_name] = results_i 105 | if comm.is_main_process(): 106 | assert isinstance( 107 | results_i, dict 108 | ), "Evaluator must return a dict on the main process. 
Got {} instead.".format( 109 | results_i 110 | ) 111 | logger.info("Evaluation results for {} in csv format:".format(dataset_name)) 112 | print_csv_format(results_i) 113 | 114 | if len(results) == 1: 115 | results = list(results.values())[0] 116 | return results 117 | 118 | 119 | def setup(args): 120 | """ 121 | Create configs and perform basic setups. 122 | """ 123 | cfg = get_cfg() 124 | # for poly lr schedule 125 | add_deeplab_config(cfg) 126 | add_maskformer2_config(cfg) 127 | add_maskformer2_video_config(cfg) 128 | add_minvis_config(cfg) 129 | add_uni_dvps_config(cfg) 130 | cfg.merge_from_file(args.config_file) 131 | cfg.merge_from_list(args.opts) 132 | if cfg.EVAL_FRAMES: 133 | cfg.DATASETS.TEST = (cfg.DATASETS.TEST[0]+str(cfg.EVAL_FRAMES),) 134 | cfg.freeze() 135 | default_setup(cfg, args) 136 | setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="uni_dvps") 137 | return cfg 138 | 139 | 140 | def main(args): 141 | cfg = setup(args) 142 | if args.eval_only: 143 | model = Trainer.build_model(cfg) 144 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 145 | cfg.MODEL.WEIGHTS, resume=args.resume 146 | ) 147 | res = Trainer.test(cfg, model) 148 | if cfg.TEST.AUG.ENABLED: 149 | raise NotImplementedError 150 | if comm.is_main_process(): 151 | verify_results(cfg, res) 152 | return res 153 | 154 | trainer = Trainer(cfg) 155 | trainer.resume_or_load(resume=args.resume) 156 | return trainer.train() 157 | 158 | 159 | if __name__ == "__main__": 160 | args = default_argument_parser().parse_args() 161 | print("Command Line Args:", args) 162 | launch( 163 | main, 164 | args.num_gpus, 165 | num_machines=args.num_machines, 166 | machine_rank=args.machine_rank, 167 | dist_url=args.dist_url, 168 | args=(args,), 169 | ) 170 | -------------------------------------------------------------------------------- /mask2former/modeling/meta_arch/mask_former_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder 15 | from ..pixel_decoder.fpn import build_pixel_decoder 16 | 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class MaskFormerHead(nn.Module): 20 | 21 | _version = 2 22 | 23 | def _load_from_state_dict( 24 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 25 | ): 26 | version = local_metadata.get("version", None) 27 | if version is None or version < 2: 28 | # Do not warn if train from scratch 29 | scratch = True 30 | logger = logging.getLogger(__name__) 31 | for k in list(state_dict.keys()): 32 | newk = k 33 | if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): 34 | # newk = k.replace(prefix, prefix + "pixel_decoder.") 35 | newk = k.replace(prefix, prefix) 36 | # logger.debug(f"{k} ==> {newk}") 37 | if newk != k: 38 | state_dict[newk] = state_dict[k] 39 | del state_dict[k] 40 | scratch = False 41 | 42 | if not scratch: 43 | logger.warning( 44 | f"Weight format of {self.__class__.__name__} have changed! 
" 45 | "Please upgrade your models. Applying automatic conversion now ..." 46 | ) 47 | 48 | @configurable 49 | def __init__( 50 | self, 51 | input_shape: Dict[str, ShapeSpec], 52 | *, 53 | num_classes: int, 54 | pixel_decoder: nn.Module, 55 | loss_weight: float = 1.0, 56 | ignore_value: int = -1, 57 | # extra parameters 58 | transformer_predictor: nn.Module, 59 | transformer_in_feature: str, 60 | ): 61 | """ 62 | NOTE: this interface is experimental. 63 | Args: 64 | input_shape: shapes (channels and stride) of the input features 65 | num_classes: number of classes to predict 66 | pixel_decoder: the pixel decoder module 67 | loss_weight: loss weight 68 | ignore_value: category id to be ignored during training. 69 | transformer_predictor: the transformer decoder that makes prediction 70 | transformer_in_feature: input feature name to the transformer_predictor 71 | """ 72 | super().__init__() 73 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 74 | self.in_features = [k for k, v in input_shape] 75 | feature_strides = [v.stride for k, v in input_shape] 76 | feature_channels = [v.channels for k, v in input_shape] 77 | 78 | self.ignore_value = ignore_value 79 | self.common_stride = 4 80 | self.loss_weight = loss_weight 81 | 82 | self.pixel_decoder = pixel_decoder 83 | self.predictor = transformer_predictor 84 | self.transformer_in_feature = transformer_in_feature 85 | 86 | self.num_classes = num_classes 87 | 88 | @classmethod 89 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 90 | # figure out in_channels to transformer predictor 91 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 92 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 93 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 94 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 95 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 96 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 97 | else: 98 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels 99 | 100 | return { 101 | "input_shape": { 102 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 103 | }, 104 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 105 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 106 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 107 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 108 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 109 | "transformer_predictor": build_transformer_decoder( 110 | cfg, 111 | transformer_predictor_in_channels, 112 | mask_classification=True, 113 | ), 114 | } 115 | 116 | def forward(self, features, mask=None): 117 | return self.layers(features, mask) 118 | 119 | def layers(self, features, mask=None): 120 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) 121 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 122 | predictions = self.predictor(multi_scale_features, mask_features, mask) 123 | else: 124 | if self.transformer_in_feature == "transformer_encoder": 125 | assert ( 126 | transformer_encoder_features is not None 127 | ), "Please use the TransformerEncoderPixelDecoder." 
128 | predictions = self.predictor(transformer_encoder_features, mask_features, mask) 129 | elif self.transformer_in_feature == "pixel_embedding": 130 | predictions = self.predictor(mask_features, mask_features, mask) 131 | else: 132 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) 133 | return predictions 134 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py 3 | import copy 4 | import logging 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.data.transforms import TransformGen 13 | from detectron2.structures import BitMasks, Boxes, Instances 14 | 15 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"] 16 | 17 | 18 | def build_transform_gen(cfg, is_train): 19 | """ 20 | Create a list of default :class:`Augmentation` from config. 21 | Now it includes resizing and flipping. 22 | Returns: 23 | list[Augmentation] 24 | """ 25 | assert is_train, "Only support training augmentation" 26 | image_size = cfg.INPUT.IMAGE_SIZE 27 | min_scale = cfg.INPUT.MIN_SCALE 28 | max_scale = cfg.INPUT.MAX_SCALE 29 | 30 | augmentation = [] 31 | 32 | if cfg.INPUT.RANDOM_FLIP != "none": 33 | augmentation.append( 34 | T.RandomFlip( 35 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 36 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 37 | ) 38 | ) 39 | 40 | augmentation.extend([ 41 | T.ResizeScale( 42 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 43 | ), 44 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 45 | ]) 46 | 47 | return augmentation 48 | 49 | 50 | # This is specifically designed for the COCO dataset. 51 | class COCOPanopticNewBaselineDatasetMapper: 52 | """ 53 | A callable which takes a dataset dict in Detectron2 Dataset format, 54 | and map it into a format used by MaskFormer. 55 | 56 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 57 | 58 | The callable currently does the following: 59 | 60 | 1. Read the image from "file_name" 61 | 2. Applies geometric transforms to the image and annotation 62 | 3. Find and applies suitable cropping to the image and annotation 63 | 4. Prepare image and annotation to Tensors 64 | """ 65 | 66 | @configurable 67 | def __init__( 68 | self, 69 | is_train=True, 70 | *, 71 | tfm_gens, 72 | image_format, 73 | ): 74 | """ 75 | NOTE: this interface is experimental. 76 | Args: 77 | is_train: for training or inference 78 | augmentations: a list of augmentations or deterministic transforms to apply 79 | crop_gen: crop augmentation 80 | tfm_gens: data augmentation 81 | image_format: an image format supported by :func:`detection_utils.read_image`. 
82 | """ 83 | self.tfm_gens = tfm_gens 84 | logging.getLogger(__name__).info( 85 | "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( 86 | str(self.tfm_gens) 87 | ) 88 | ) 89 | 90 | self.img_format = image_format 91 | self.is_train = is_train 92 | 93 | @classmethod 94 | def from_config(cls, cfg, is_train=True): 95 | # Build augmentation 96 | tfm_gens = build_transform_gen(cfg, is_train) 97 | 98 | ret = { 99 | "is_train": is_train, 100 | "tfm_gens": tfm_gens, 101 | "image_format": cfg.INPUT.FORMAT, 102 | } 103 | return ret 104 | 105 | def __call__(self, dataset_dict): 106 | """ 107 | Args: 108 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 109 | 110 | Returns: 111 | dict: a format that builtin models in detectron2 accept 112 | """ 113 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 114 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 115 | utils.check_image_size(dataset_dict, image) 116 | 117 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 118 | image_shape = image.shape[:2] # h, w 119 | 120 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 121 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 122 | # Therefore it's important to use torch.Tensor. 123 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 124 | 125 | if not self.is_train: 126 | # USER: Modify this if you want to keep them for some reason. 127 | dataset_dict.pop("annotations", None) 128 | return dataset_dict 129 | 130 | if "pan_seg_file_name" in dataset_dict: 131 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 132 | segments_info = dataset_dict["segments_info"] 133 | 134 | # apply the same transformation to panoptic segmentation 135 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 136 | 137 | from panopticapi.utils import rgb2id 138 | 139 | pan_seg_gt = rgb2id(pan_seg_gt) 140 | 141 | instances = Instances(image_shape) 142 | classes = [] 143 | masks = [] 144 | for segment_info in segments_info: 145 | class_id = segment_info["category_id"] 146 | if not segment_info["iscrowd"]: 147 | classes.append(class_id) 148 | masks.append(pan_seg_gt == segment_info["id"]) 149 | 150 | classes = np.array(classes) 151 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 152 | if len(masks) == 0: 153 | # Some image does not have annotation (all ignored) 154 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 155 | instances.gt_boxes = Boxes(torch.zeros((0, 4))) 156 | else: 157 | masks = BitMasks( 158 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 159 | ) 160 | instances.gt_masks = masks.tensor 161 | instances.gt_boxes = masks.get_bounding_boxes() 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /minvis/data_video/augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
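# Clip-consistent video augmentations: the ResizeShortestEdge and RandomFlip variants below
# re-sample their random parameters only once every `clip_frame_cnt` calls, so every frame of a
# sampled clip receives the same resize scale and the same flip decision.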
2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import numpy as np 5 | import logging 6 | import sys 7 | from fvcore.transforms.transform import ( 8 | HFlipTransform, 9 | NoOpTransform, 10 | VFlipTransform, 11 | ) 12 | from PIL import Image 13 | 14 | from detectron2.data import transforms as T 15 | 16 | 17 | class ResizeShortestEdge(T.Augmentation): 18 | """ 19 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 20 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 21 | """ 22 | 23 | def __init__( 24 | self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 25 | ): 26 | """ 27 | Args: 28 | short_edge_length (list[int]): If ``sample_style=="range"``, 29 | a [min, max] interval from which to sample the shortest edge length. 30 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 31 | max_size (int): maximum allowed longest edge length. 32 | sample_style (str): either "range" or "choice". 33 | """ 34 | super().__init__() 35 | assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style 36 | 37 | self.is_range = ("range" in sample_style) 38 | if isinstance(short_edge_length, int): 39 | short_edge_length = (short_edge_length, short_edge_length) 40 | if self.is_range: 41 | assert len(short_edge_length) == 2, ( 42 | "short_edge_length must be two values using 'range' sample style." 43 | f" Got {short_edge_length}!" 44 | ) 45 | self._cnt = 0 46 | self._init(locals()) 47 | 48 | def get_transform(self, image): 49 | if self._cnt % self.clip_frame_cnt == 0: 50 | if self.is_range: 51 | self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) 52 | else: 53 | self.size = np.random.choice(self.short_edge_length) 54 | if self.size == 0: 55 | return NoOpTransform() 56 | 57 | self._cnt = 0 # avoiding overflow 58 | self._cnt += 1 59 | 60 | h, w = image.shape[:2] 61 | 62 | scale = self.size * 1.0 / min(h, w) 63 | if h < w: 64 | newh, neww = self.size, scale * w 65 | else: 66 | newh, neww = scale * h, self.size 67 | if max(newh, neww) > self.max_size: 68 | scale = self.max_size * 1.0 / max(newh, neww) 69 | newh = newh * scale 70 | neww = neww * scale 71 | neww = int(neww + 0.5) 72 | newh = int(newh + 0.5) 73 | return T.ResizeTransform(h, w, newh, neww, self.interp) 74 | 75 | 76 | class RandomFlip(T.Augmentation): 77 | """ 78 | Flip the image horizontally or vertically with the given probability. 79 | """ 80 | 81 | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): 82 | """ 83 | Args: 84 | prob (float): probability of flip. 85 | horizontal (boolean): whether to apply horizontal flipping 86 | vertical (boolean): whether to apply vertical flipping 87 | """ 88 | super().__init__() 89 | 90 | if horizontal and vertical: 91 | raise ValueError("Cannot do both horiz and vert. 
Please use two Flip instead.") 92 | if not horizontal and not vertical: 93 | raise ValueError("At least one of horiz or vert has to be True!") 94 | self._cnt = 0 95 | 96 | self._init(locals()) 97 | 98 | def get_transform(self, image): 99 | if self._cnt % self.clip_frame_cnt == 0: 100 | self.do = self._rand_range() < self.prob 101 | self._cnt = 0 # avoiding overflow 102 | self._cnt += 1 103 | 104 | h, w = image.shape[:2] 105 | 106 | if self.do: 107 | if self.horizontal: 108 | return HFlipTransform(w) 109 | elif self.vertical: 110 | return VFlipTransform(h) 111 | else: 112 | return NoOpTransform() 113 | 114 | 115 | def build_augmentation(cfg, is_train): 116 | logger = logging.getLogger(__name__) 117 | aug_list = [] 118 | if is_train: 119 | # Crop 120 | if cfg.INPUT.CROP.ENABLED: 121 | aug_list.append(T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) 122 | 123 | # Resize 124 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 125 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 126 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 127 | ms_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM if "by_clip" in cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else 1 128 | aug_list.append(ResizeShortestEdge(min_size, max_size, sample_style, clip_frame_cnt=ms_clip_frame_cnt)) 129 | 130 | # Flip 131 | if cfg.INPUT.RANDOM_FLIP != "none": 132 | if cfg.INPUT.RANDOM_FLIP == "flip_by_clip": 133 | flip_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM 134 | else: 135 | flip_clip_frame_cnt = 1 136 | 137 | aug_list.append( 138 | # NOTE using RandomFlip modified for the support of flip maintenance 139 | RandomFlip( 140 | horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"), 141 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 142 | clip_frame_cnt=flip_clip_frame_cnt, 143 | ) 144 | ) 145 | 146 | # Additional augmentations : brightness, contrast, saturation, rotation 147 | augmentations = cfg.INPUT.AUGMENTATIONS 148 | if "brightness" in augmentations: 149 | aug_list.append(T.RandomBrightness(0.9, 1.1)) 150 | if "contrast" in augmentations: 151 | aug_list.append(T.RandomContrast(0.9, 1.1)) 152 | if "saturation" in augmentations: 153 | aug_list.append(T.RandomSaturation(0.9, 1.1)) 154 | if "rotation" in augmentations: 155 | aug_list.append( 156 | T.RandomRotation( 157 | [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range" 158 | ) 159 | ) 160 | else: 161 | # Resize 162 | min_size = cfg.INPUT.MIN_SIZE_TEST 163 | max_size = cfg.INPUT.MAX_SIZE_TEST 164 | sample_style = "choice" 165 | aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 166 | 167 | return aug_list 168 | -------------------------------------------------------------------------------- /mask2former_video/data_video/augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import numpy as np 5 | import logging 6 | import sys 7 | from fvcore.transforms.transform import ( 8 | HFlipTransform, 9 | NoOpTransform, 10 | VFlipTransform, 11 | ) 12 | from PIL import Image 13 | 14 | from detectron2.data import transforms as T 15 | 16 | 17 | class ResizeShortestEdge(T.Augmentation): 18 | """ 19 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 20 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 
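    When `sample_style` is "range_by_clip" or "choice_by_clip", the sampled size is reused for
    `clip_frame_cnt` consecutive frames, so every frame of a video clip is resized consistently.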
21 | """ 22 | 23 | def __init__( 24 | self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 25 | ): 26 | """ 27 | Args: 28 | short_edge_length (list[int]): If ``sample_style=="range"``, 29 | a [min, max] interval from which to sample the shortest edge length. 30 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 31 | max_size (int): maximum allowed longest edge length. 32 | sample_style (str): either "range" or "choice". 33 | """ 34 | super().__init__() 35 | assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style 36 | 37 | self.is_range = ("range" in sample_style) 38 | if isinstance(short_edge_length, int): 39 | short_edge_length = (short_edge_length, short_edge_length) 40 | if self.is_range: 41 | assert len(short_edge_length) == 2, ( 42 | "short_edge_length must be two values using 'range' sample style." 43 | f" Got {short_edge_length}!" 44 | ) 45 | self._cnt = 0 46 | self._init(locals()) 47 | 48 | def get_transform(self, image): 49 | if self._cnt % self.clip_frame_cnt == 0: 50 | if self.is_range: 51 | self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) 52 | else: 53 | self.size = np.random.choice(self.short_edge_length) 54 | if self.size == 0: 55 | return NoOpTransform() 56 | 57 | self._cnt = 0 # avoiding overflow 58 | self._cnt += 1 59 | 60 | h, w = image.shape[:2] 61 | 62 | scale = self.size * 1.0 / min(h, w) 63 | if h < w: 64 | newh, neww = self.size, scale * w 65 | else: 66 | newh, neww = scale * h, self.size 67 | if max(newh, neww) > self.max_size: 68 | scale = self.max_size * 1.0 / max(newh, neww) 69 | newh = newh * scale 70 | neww = neww * scale 71 | neww = int(neww + 0.5) 72 | newh = int(newh + 0.5) 73 | return T.ResizeTransform(h, w, newh, neww, self.interp) 74 | 75 | 76 | class RandomFlip(T.Augmentation): 77 | """ 78 | Flip the image horizontally or vertically with the given probability. 79 | """ 80 | 81 | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): 82 | """ 83 | Args: 84 | prob (float): probability of flip. 85 | horizontal (boolean): whether to apply horizontal flipping 86 | vertical (boolean): whether to apply vertical flipping 87 | """ 88 | super().__init__() 89 | 90 | if horizontal and vertical: 91 | raise ValueError("Cannot do both horiz and vert. 
Please use two Flip instead.") 92 | if not horizontal and not vertical: 93 | raise ValueError("At least one of horiz or vert has to be True!") 94 | self._cnt = 0 95 | 96 | self._init(locals()) 97 | 98 | def get_transform(self, image): 99 | if self._cnt % self.clip_frame_cnt == 0: 100 | self.do = self._rand_range() < self.prob 101 | self._cnt = 0 # avoiding overflow 102 | self._cnt += 1 103 | 104 | h, w = image.shape[:2] 105 | 106 | if self.do: 107 | if self.horizontal: 108 | return HFlipTransform(w) 109 | elif self.vertical: 110 | return VFlipTransform(h) 111 | else: 112 | return NoOpTransform() 113 | 114 | 115 | def build_augmentation(cfg, is_train): 116 | logger = logging.getLogger(__name__) 117 | aug_list = [] 118 | if is_train: 119 | # Crop 120 | if cfg.INPUT.CROP.ENABLED: 121 | aug_list.append(T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) 122 | 123 | # Resize 124 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 125 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 126 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 127 | ms_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM if "by_clip" in cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else 1 128 | aug_list.append(ResizeShortestEdge(min_size, max_size, sample_style, clip_frame_cnt=ms_clip_frame_cnt)) 129 | 130 | # Flip 131 | if cfg.INPUT.RANDOM_FLIP != "none": 132 | if cfg.INPUT.RANDOM_FLIP == "flip_by_clip": 133 | flip_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM 134 | else: 135 | flip_clip_frame_cnt = 1 136 | 137 | aug_list.append( 138 | # NOTE using RandomFlip modified for the support of flip maintenance 139 | RandomFlip( 140 | horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"), 141 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 142 | clip_frame_cnt=flip_clip_frame_cnt, 143 | ) 144 | ) 145 | 146 | # Additional augmentations : brightness, contrast, saturation, rotation 147 | augmentations = cfg.INPUT.AUGMENTATIONS 148 | if "brightness" in augmentations: 149 | aug_list.append(T.RandomBrightness(0.9, 1.1)) 150 | if "contrast" in augmentations: 151 | aug_list.append(T.RandomContrast(0.9, 1.1)) 152 | if "saturation" in augmentations: 153 | aug_list.append(T.RandomSaturation(0.9, 1.1)) 154 | if "rotation" in augmentations: 155 | aug_list.append( 156 | T.RandomRotation( 157 | [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range" 158 | ) 159 | ) 160 | else: 161 | # Resize 162 | min_size = cfg.INPUT.MIN_SIZE_TEST 163 | max_size = cfg.INPUT.MAX_SIZE_TEST 164 | sample_style = "choice" 165 | aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 166 | 167 | return aug_list 168 | -------------------------------------------------------------------------------- /uni_dvps/data_video/augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
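# Uni-DVPS augmentation module: it adds FixedSizeCenterCrop, a deterministic center crop with
# optional padding to a fixed target size, alongside the clip-consistent resize/flip transforms.
# Note that build_augmentation and build_semkitti_augmentation below currently return an empty
# list, i.e. no geometric augmentation is registered through these helpers.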
2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import numpy as np 5 | import logging 6 | import sys 7 | from fvcore.transforms.transform import ( 8 | HFlipTransform, 9 | NoOpTransform, 10 | VFlipTransform, 11 | ) 12 | from PIL import Image 13 | 14 | from detectron2.data import transforms as T 15 | from detectron2.projects.point_rend import ColorAugSSDTransform 16 | from typing import Tuple 17 | from fvcore.transforms.transform import ( 18 | CropTransform, 19 | PadTransform, 20 | TransformList, 21 | ) 22 | 23 | class FixedSizeCenterCrop(T.Augmentation): 24 | """ 25 | If `crop_size` is smaller than the input image size, then it uses a center crop of 26 | the crop size. If `crop_size` is larger than the input image size, then it pads 27 | the around of the image to the crop size. 28 | """ 29 | 30 | def __init__(self, crop_size: Tuple[int], pad_value: float = 128.0, with_pad=True): 31 | """ 32 | Args: 33 | crop_size: target image (height, width). 34 | pad_value: the padding value. 35 | """ 36 | super().__init__() 37 | self._init(locals()) 38 | 39 | def get_transform(self, image: np.ndarray) -> TransformList: 40 | # Compute the image scale and scaled size. 41 | input_size = image.shape[:2] 42 | output_size = self.crop_size 43 | 44 | # Add random crop if the image is scaled up. 45 | max_offset = np.subtract(input_size, output_size) 46 | max_offset = np.maximum(max_offset, 0) 47 | offset = np.multiply(max_offset, 0.5)#np.random.uniform(0.0, 1.0)) 48 | offset = np.round(offset).astype(int) 49 | crop_transform = CropTransform( 50 | offset[1], offset[0], output_size[1], output_size[0], input_size[1], input_size[0] 51 | ) 52 | if not self.with_pad: 53 | return TransformList([crop_transform, ]) 54 | 55 | # Add padding if the image is scaled down. 56 | pad_size = np.subtract(output_size, input_size) 57 | pad_size = np.maximum(pad_size, 0) 58 | pad_size_0 = pad_size // 2 59 | pad_size_1 = pad_size - pad_size_0 60 | original_size = np.minimum(input_size, output_size) 61 | pad_transform = PadTransform( 62 | pad_size_0[1], pad_size_0[0], pad_size_1[1], pad_size_1[0], original_size[1], original_size[0], self.pad_value 63 | ) 64 | 65 | return TransformList([crop_transform, pad_transform]) 66 | 67 | class ResizeShortestEdge(T.Augmentation): 68 | """ 69 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 70 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 71 | """ 72 | 73 | def __init__( 74 | self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 75 | ): 76 | """ 77 | Args: 78 | short_edge_length (list[int]): If ``sample_style=="range"``, 79 | a [min, max] interval from which to sample the shortest edge length. 80 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 81 | max_size (int): maximum allowed longest edge length. 82 | sample_style (str): either "range" or "choice". 83 | """ 84 | super().__init__() 85 | assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style 86 | 87 | self.is_range = ("range" in sample_style) 88 | if isinstance(short_edge_length, int): 89 | short_edge_length = (short_edge_length, short_edge_length) 90 | if self.is_range: 91 | assert len(short_edge_length) == 2, ( 92 | "short_edge_length must be two values using 'range' sample style." 93 | f" Got {short_edge_length}!" 
94 | ) 95 | self._cnt = 0 96 | self._init(locals()) 97 | 98 | def get_transform(self, image): 99 | if self._cnt % self.clip_frame_cnt == 0: 100 | if self.is_range: 101 | self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) 102 | else: 103 | self.size = np.random.choice(self.short_edge_length) 104 | if self.size == 0: 105 | return NoOpTransform() 106 | 107 | self._cnt = 0 # avoiding overflow 108 | self._cnt += 1 109 | 110 | h, w = image.shape[:2] 111 | 112 | scale = self.size * 1.0 / min(h, w) 113 | if h < w: 114 | newh, neww = self.size, scale * w 115 | else: 116 | newh, neww = scale * h, self.size 117 | if max(newh, neww) > self.max_size: 118 | scale = self.max_size * 1.0 / max(newh, neww) 119 | newh = newh * scale 120 | neww = neww * scale 121 | neww = int(neww + 0.5) 122 | newh = int(newh + 0.5) 123 | return T.ResizeTransform(h, w, newh, neww, self.interp) 124 | 125 | class RandomFlip(T.Augmentation): 126 | """ 127 | Flip the image horizontally or vertically with the given probability. 128 | """ 129 | 130 | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): 131 | """ 132 | Args: 133 | prob (float): probability of flip. 134 | horizontal (boolean): whether to apply horizontal flipping 135 | vertical (boolean): whether to apply vertical flipping 136 | """ 137 | super().__init__() 138 | 139 | if horizontal and vertical: 140 | raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.") 141 | if not horizontal and not vertical: 142 | raise ValueError("At least one of horiz or vert has to be True!") 143 | self._cnt = 0 144 | 145 | self._init(locals()) 146 | 147 | def get_transform(self, image): 148 | if self._cnt % self.clip_frame_cnt == 0: 149 | self.do = self._rand_range() < self.prob 150 | self._cnt = 0 # avoiding overflow 151 | self._cnt += 1 152 | 153 | h, w = image.shape[:2] 154 | 155 | if self.do: 156 | if self.horizontal: 157 | return HFlipTransform(w) 158 | elif self.vertical: 159 | return VFlipTransform(h) 160 | else: 161 | return NoOpTransform() 162 | 163 | def build_augmentation(cfg, is_train): 164 | logger = logging.getLogger(__name__) 165 | aug_list = [] 166 | print("aug_list: ", aug_list) 167 | return aug_list 168 | 169 | def build_semkitti_augmentation(cfg, is_train): 170 | logger = logging.getLogger(__name__) 171 | aug_list = [] 172 | print("aug_list: ", aug_list) 173 | return aug_list 174 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.structures import BitMasks, Instances 13 | 14 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper 15 | 16 | __all__ = ["MaskFormerPanopticDatasetMapper"] 17 | 18 | 19 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper): 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for panoptic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. 
Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | super().__init__( 52 | is_train, 53 | augmentations=augmentations, 54 | image_format=image_format, 55 | ignore_label=ignore_label, 56 | size_divisibility=size_divisibility, 57 | ) 58 | 59 | def __call__(self, dataset_dict): 60 | """ 61 | Args: 62 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 63 | 64 | Returns: 65 | dict: a format that builtin models in detectron2 accept 66 | """ 67 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 68 | 69 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 70 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 71 | utils.check_image_size(dataset_dict, image) 72 | 73 | # semantic segmentation 74 | if "sem_seg_file_name" in dataset_dict: 75 | # PyTorch transformation not implemented for uint16, so converting it to double first 76 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 77 | else: 78 | sem_seg_gt = None 79 | 80 | # panoptic segmentation 81 | if "pan_seg_file_name" in dataset_dict: 82 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 83 | segments_info = dataset_dict["segments_info"] 84 | else: 85 | pan_seg_gt = None 86 | segments_info = None 87 | 88 | if pan_seg_gt is None: 89 | raise ValueError( 90 | "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( 91 | dataset_dict["file_name"] 92 | ) 93 | ) 94 | 95 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 96 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 97 | image = aug_input.image 98 | if sem_seg_gt is not None: 99 | sem_seg_gt = aug_input.sem_seg 100 | 101 | # apply the same transformation to panoptic segmentation 102 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 103 | 104 | from panopticapi.utils import rgb2id 105 | 106 | pan_seg_gt = rgb2id(pan_seg_gt) 107 | 108 | # Pad image and segmentation label here! 
109 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 110 | if sem_seg_gt is not None: 111 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 112 | pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) 113 | 114 | if self.size_divisibility > 0: 115 | image_size = (image.shape[-2], image.shape[-1]) 116 | padding_size = [ 117 | 0, 118 | self.size_divisibility - image_size[1], 119 | 0, 120 | self.size_divisibility - image_size[0], 121 | ] 122 | image = F.pad(image, padding_size, value=128).contiguous() 123 | if sem_seg_gt is not None: 124 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 125 | pan_seg_gt = F.pad( 126 | pan_seg_gt, padding_size, value=0 127 | ).contiguous() # 0 is the VOID panoptic label 128 | 129 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 130 | 131 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 132 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 133 | # Therefore it's important to use torch.Tensor. 134 | dataset_dict["image"] = image 135 | if sem_seg_gt is not None: 136 | dataset_dict["sem_seg"] = sem_seg_gt.long() 137 | 138 | if "annotations" in dataset_dict: 139 | raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") 140 | 141 | # Prepare per-category binary masks 142 | pan_seg_gt = pan_seg_gt.numpy() 143 | instances = Instances(image_shape) 144 | classes = [] 145 | masks = [] 146 | for segment_info in segments_info: 147 | class_id = segment_info["category_id"] 148 | if not segment_info["iscrowd"]: 149 | classes.append(class_id) 150 | masks.append(pan_seg_gt == segment_info["id"]) 151 | 152 | classes = np.array(classes) 153 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 154 | if len(masks) == 0: 155 | # Some image does not have annotation (all ignored) 156 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 157 | else: 158 | masks = BitMasks( 159 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 160 | ) 161 | instances.gt_masks = masks.tensor 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /minvis/video_mask2former_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA Corporation & Affiliates. All rights reserved. 2 | # 3 | # This work is made available under the Nvidia Source Code License-NC. 4 | # To view a copy of this license, visit 5 | # https://github.com/NVlabs/MinVIS/blob/main/LICENSE 6 | 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
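# Frame-level variant of the video Mask2Former transformer decoder used by MinVIS: each frame of
# a clip is decoded independently with shared queries, the per-frame predictions are rearranged
# from (B*T, ...) shaped tensors into clip-shaped ones, and the normalized query embeddings
# (pred_embds) are returned so that queries can later be associated across frames.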
8 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 9 | import torch 10 | from torch import nn, Tensor 11 | from torch.nn import functional as F 12 | 13 | from detectron2.config import configurable 14 | 15 | from mask2former.modeling.transformer_decoder.maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY 16 | from mask2former.modeling.transformer_decoder.position_encoding import PositionEmbeddingSine 17 | 18 | from mask2former_video.modeling.transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 19 | import einops 20 | 21 | 22 | @TRANSFORMER_DECODER_REGISTRY.register() 23 | class VideoMultiScaleMaskedTransformerDecoder_frame(VideoMultiScaleMaskedTransformerDecoder): 24 | 25 | @configurable 26 | def __init__( 27 | self, 28 | in_channels, 29 | mask_classification=True, 30 | *, 31 | num_classes: int, 32 | hidden_dim: int, 33 | num_queries: int, 34 | nheads: int, 35 | dim_feedforward: int, 36 | dec_layers: int, 37 | pre_norm: bool, 38 | mask_dim: int, 39 | enforce_input_project: bool, 40 | # video related 41 | num_frames, 42 | ): 43 | super().__init__( 44 | in_channels=in_channels, 45 | mask_classification=mask_classification, 46 | num_classes=num_classes, 47 | hidden_dim=hidden_dim, 48 | num_queries=num_queries, 49 | nheads=nheads, 50 | dim_feedforward=dim_feedforward, 51 | dec_layers=dec_layers, 52 | pre_norm=pre_norm, 53 | mask_dim=mask_dim, 54 | enforce_input_project=enforce_input_project, 55 | num_frames=num_frames, 56 | ) 57 | 58 | # use 2D positional embedding 59 | N_steps = hidden_dim // 2 60 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 61 | 62 | def forward(self, x, mask_features, mask = None): 63 | # x is a list of multi-scale feature 64 | assert len(x) == self.num_feature_levels 65 | src = [] 66 | pos = [] 67 | size_list = [] 68 | 69 | # disable mask, it does not affect performance 70 | del mask 71 | 72 | for i in range(self.num_feature_levels): 73 | size_list.append(x[i].shape[-2:]) 74 | pos.append(self.pe_layer(x[i], None).flatten(2)) 75 | src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None]) 76 | 77 | # flatten NxCxHxW to HWxNxC 78 | pos[-1] = pos[-1].permute(2, 0, 1) 79 | src[-1] = src[-1].permute(2, 0, 1) 80 | 81 | _, bs, _ = src[0].shape 82 | 83 | # QxNxC 84 | query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1) 85 | output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1) 86 | 87 | predictions_class = [] 88 | predictions_mask = [] 89 | 90 | # prediction heads on learnable query features 91 | outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0]) 92 | predictions_class.append(outputs_class) 93 | predictions_mask.append(outputs_mask) 94 | 95 | for i in range(self.num_layers): 96 | level_index = i % self.num_feature_levels 97 | # prevent NaN output 98 | attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False 99 | # attention: cross-attention first 100 | output = self.transformer_cross_attention_layers[i]( 101 | output, src[level_index], 102 | memory_mask=attn_mask, 103 | memory_key_padding_mask=None, # here we do not apply masking on padded region 104 | pos=pos[level_index], query_pos=query_embed 105 | ) 106 | 107 | output = self.transformer_self_attention_layers[i]( 108 | output, tgt_mask=None, 109 | tgt_key_padding_mask=None, 110 | query_pos=query_embed 111 | ) 112 | 113 | # FFN 114 | output = 
self.transformer_ffn_layers[i]( 115 | output 116 | ) 117 | 118 | outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels]) 119 | predictions_class.append(outputs_class) 120 | predictions_mask.append(outputs_mask) 121 | 122 | assert len(predictions_class) == self.num_layers + 1 123 | 124 | # expand BT to B, T 125 | bt = predictions_mask[-1].shape[0] 126 | bs = bt // self.num_frames if self.training else 1 127 | t = bt // bs 128 | for i in range(len(predictions_mask)): 129 | predictions_mask[i] = einops.rearrange(predictions_mask[i], '(b t) q h w -> b q t h w', t=t) 130 | 131 | for i in range(len(predictions_class)): 132 | predictions_class[i] = einops.rearrange(predictions_class[i], '(b t) q c -> b t q c', t=t) 133 | 134 | pred_embds = self.decoder_norm(output) 135 | pred_embds = einops.rearrange(pred_embds, 'q (b t) c -> b c t q', t=t) 136 | 137 | out = { 138 | 'pred_logits': predictions_class[-1], 139 | 'pred_masks': predictions_mask[-1], 140 | 'aux_outputs': self._set_aux_loss( 141 | predictions_class if self.mask_classification else None, predictions_mask 142 | ), 143 | 'pred_embds': pred_embds, 144 | } 145 | 146 | return out 147 | 148 | def forward_prediction_heads(self, output, mask_features, attn_mask_target_size): 149 | decoder_output = self.decoder_norm(output) 150 | decoder_output = decoder_output.transpose(0, 1) 151 | outputs_class = self.class_embed(decoder_output) 152 | mask_embed = self.mask_embed(decoder_output) 153 | outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 154 | 155 | # NOTE: prediction is of higher-resolution 156 | # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW] 157 | attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False) 158 | # must use bool type 159 | # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged. 160 | attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool() 161 | attn_mask = attn_mask.detach() 162 | 163 | return outputs_class, outputs_mask, attn_mask 164 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
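# Multi-Scale Deformable Attention: each query attends to a small, learned set of sampling points
# on every feature level instead of the full feature map. The forward pass uses the CUDA kernel
# built by make.sh (via MSDeformAttnFunction) and falls back to the pure-PyTorch
# ms_deform_attn_core_pytorch implementation when the compiled kernel cannot be used.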
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch 26 | 27 | 28 | def _is_power_of_2(n): 29 | if (not isinstance(n, int)) or (n < 0): 30 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 31 | return (n & (n-1) == 0) and n != 0 32 | 33 | 34 | class MSDeformAttn(nn.Module): 35 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 36 | """ 37 | Multi-Scale Deformable Attention Module 38 | :param d_model hidden dimension 39 | :param n_levels number of feature levels 40 | :param n_heads number of attention heads 41 | :param n_points number of sampling points per attention head per feature level 42 | """ 43 | super().__init__() 44 | if d_model % n_heads != 0: 45 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 46 | _d_per_head = d_model // n_heads 47 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 48 | if not _is_power_of_2(_d_per_head): 49 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 50 | "which is more efficient in our CUDA implementation.") 51 | 52 | self.im2col_step = 128 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
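        # The sampling_offsets bias above is initialized so that each attention head starts with
        # sampling points arranged on a small grid around the reference point, with the offset
        # radius growing with the point index (the initialization scheme of Deformable DETR).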
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | # N, Len_q, n_heads, n_levels, n_points, 2 106 | if reference_points.shape[-1] == 2: 107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 108 | sampling_locations = reference_points[:, :, None, :, None, :] \ 109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 110 | elif reference_points.shape[-1] == 4: 111 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 113 | else: 114 | raise ValueError( 115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 116 | try: 117 | output = MSDeformAttnFunction.apply( 118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 119 | except: 120 | # CPU 121 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 122 | # # For FLOPs calculation only 123 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 124 | output = self.output_proj(output) 125 | return output 126 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
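# Dataset mapper for instance segmentation: it applies the configured augmentations, converts
# COCO-style segmentations (polygon lists, RLE dicts, or binary ndarray masks) into per-instance
# BitMasks, and packs them together with their class ids into the Instances structure expected by
# MaskFormer training.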
2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import pycocotools.mask as mask_util 7 | import torch 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Instances, polygons_to_bitmask 15 | 16 | __all__ = ["MaskFormerInstanceDatasetMapper"] 17 | 18 | 19 | class MaskFormerInstanceDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for instance segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | size_divisibility, 40 | ): 41 | """ 42 | NOTE: this interface is experimental. 43 | Args: 44 | is_train: for training or inference 45 | augmentations: a list of augmentations or deterministic transforms to apply 46 | image_format: an image format supported by :func:`detection_utils.read_image`. 47 | size_divisibility: pad image size to be divisible by this value 48 | """ 49 | self.is_train = is_train 50 | self.tfm_gens = augmentations 51 | self.img_format = image_format 52 | self.size_divisibility = size_divisibility 53 | 54 | logger = logging.getLogger(__name__) 55 | mode = "training" if is_train else "inference" 56 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 57 | 58 | @classmethod 59 | def from_config(cls, cfg, is_train=True): 60 | # Build augmentation 61 | augs = [ 62 | T.ResizeShortestEdge( 63 | cfg.INPUT.MIN_SIZE_TRAIN, 64 | cfg.INPUT.MAX_SIZE_TRAIN, 65 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 66 | ) 67 | ] 68 | if cfg.INPUT.CROP.ENABLED: 69 | augs.append( 70 | T.RandomCrop( 71 | cfg.INPUT.CROP.TYPE, 72 | cfg.INPUT.CROP.SIZE, 73 | ) 74 | ) 75 | if cfg.INPUT.COLOR_AUG_SSD: 76 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 77 | augs.append(T.RandomFlip()) 78 | 79 | ret = { 80 | "is_train": is_train, 81 | "augmentations": augs, 82 | "image_format": cfg.INPUT.FORMAT, 83 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 84 | } 85 | return ret 86 | 87 | def __call__(self, dataset_dict): 88 | """ 89 | Args: 90 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 91 | 92 | Returns: 93 | dict: a format that builtin models in detectron2 accept 94 | """ 95 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
96 | 97 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 98 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 99 | utils.check_image_size(dataset_dict, image) 100 | 101 | aug_input = T.AugInput(image) 102 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 103 | image = aug_input.image 104 | 105 | # transform instnace masks 106 | assert "annotations" in dataset_dict 107 | for anno in dataset_dict["annotations"]: 108 | anno.pop("keypoints", None) 109 | 110 | annos = [ 111 | utils.transform_instance_annotations(obj, transforms, image.shape[:2]) 112 | for obj in dataset_dict.pop("annotations") 113 | if obj.get("iscrowd", 0) == 0 114 | ] 115 | 116 | if len(annos): 117 | assert "segmentation" in annos[0] 118 | segms = [obj["segmentation"] for obj in annos] 119 | masks = [] 120 | for segm in segms: 121 | if isinstance(segm, list): 122 | # polygon 123 | masks.append(polygons_to_bitmask(segm, *image.shape[:2])) 124 | elif isinstance(segm, dict): 125 | # COCO RLE 126 | masks.append(mask_util.decode(segm)) 127 | elif isinstance(segm, np.ndarray): 128 | assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( 129 | segm.ndim 130 | ) 131 | # mask array 132 | masks.append(segm) 133 | else: 134 | raise ValueError( 135 | "Cannot convert segmentation of type '{}' to BitMasks!" 136 | "Supported types are: polygons as list[list[float] or ndarray]," 137 | " COCO-style RLE as a dict, or a binary segmentation mask " 138 | " in a 2D numpy array of shape HxW.".format(type(segm)) 139 | ) 140 | 141 | # Pad image and segmentation label here! 142 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 143 | masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks] 144 | 145 | classes = [int(obj["category_id"]) for obj in annos] 146 | classes = torch.tensor(classes, dtype=torch.int64) 147 | 148 | if self.size_divisibility > 0: 149 | image_size = (image.shape[-2], image.shape[-1]) 150 | padding_size = [ 151 | 0, 152 | self.size_divisibility - image_size[1], 153 | 0, 154 | self.size_divisibility - image_size[0], 155 | ] 156 | # pad image 157 | image = F.pad(image, padding_size, value=128).contiguous() 158 | # pad mask 159 | masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks] 160 | 161 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 162 | 163 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 164 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 165 | # Therefore it's important to use torch.Tensor. 166 | dataset_dict["image"] = image 167 | 168 | # Prepare per-category binary masks 169 | instances = Instances(image_shape) 170 | instances.gt_classes = classes 171 | if len(masks) == 0: 172 | # Some image does not have annotation (all ignored) 173 | instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1])) 174 | else: 175 | masks = BitMasks(torch.stack(masks)) 176 | instances.gt_masks = masks.tensor 177 | 178 | dataset_dict["instances"] = instances 179 | 180 | return dataset_dict 181 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
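#
# Note (inferred from the code below): unlike the instance mapper, this mapper reads per-pixel
# labels from "sem_seg_file_name", rejects dicts that carry "annotations", and emits both a
# "sem_seg" tensor of class ids and per-category binary masks under "instances".
#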
2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import MetadataCatalog 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Instances 15 | 16 | __all__ = ["MaskFormerSemanticDatasetMapper"] 17 | 18 | 19 | class MaskFormerSemanticDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for semantic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | self.is_train = is_train 52 | self.tfm_gens = augmentations 53 | self.img_format = image_format 54 | self.ignore_label = ignore_label 55 | self.size_divisibility = size_divisibility 56 | 57 | logger = logging.getLogger(__name__) 58 | mode = "training" if is_train else "inference" 59 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 60 | 61 | @classmethod 62 | def from_config(cls, cfg, is_train=True): 63 | # Build augmentation 64 | augs = [ 65 | T.ResizeShortestEdge( 66 | cfg.INPUT.MIN_SIZE_TRAIN, 67 | cfg.INPUT.MAX_SIZE_TRAIN, 68 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 69 | ) 70 | ] 71 | if cfg.INPUT.CROP.ENABLED: 72 | augs.append( 73 | T.RandomCrop_CategoryAreaConstraint( 74 | cfg.INPUT.CROP.TYPE, 75 | cfg.INPUT.CROP.SIZE, 76 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, 77 | cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 78 | ) 79 | ) 80 | if cfg.INPUT.COLOR_AUG_SSD: 81 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 82 | augs.append(T.RandomFlip()) 83 | 84 | # Assume always applies to the training set. 85 | dataset_names = cfg.DATASETS.TRAIN 86 | meta = MetadataCatalog.get(dataset_names[0]) 87 | ignore_label = meta.ignore_label 88 | 89 | ret = { 90 | "is_train": is_train, 91 | "augmentations": augs, 92 | "image_format": cfg.INPUT.FORMAT, 93 | "ignore_label": ignore_label, 94 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 95 | } 96 | return ret 97 | 98 | def __call__(self, dataset_dict): 99 | """ 100 | Args: 101 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 102 | 103 | Returns: 104 | dict: a format that builtin models in detectron2 accept 105 | """ 106 | assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" 
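        # Padding convention used further below (for reference): when `size_divisibility` > 0,
        # the image is padded with value 128 and the label map with `ignore_label`.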
107 | 108 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 109 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 110 | utils.check_image_size(dataset_dict, image) 111 | 112 | if "sem_seg_file_name" in dataset_dict: 113 | # PyTorch transformation not implemented for uint16, so converting it to double first 114 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 115 | else: 116 | sem_seg_gt = None 117 | 118 | if sem_seg_gt is None: 119 | raise ValueError( 120 | "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( 121 | dataset_dict["file_name"] 122 | ) 123 | ) 124 | 125 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 126 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 127 | image = aug_input.image 128 | sem_seg_gt = aug_input.sem_seg 129 | 130 | # Pad image and segmentation label here! 131 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 132 | if sem_seg_gt is not None: 133 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 134 | 135 | if self.size_divisibility > 0: 136 | image_size = (image.shape[-2], image.shape[-1]) 137 | padding_size = [ 138 | 0, 139 | self.size_divisibility - image_size[1], 140 | 0, 141 | self.size_divisibility - image_size[0], 142 | ] 143 | image = F.pad(image, padding_size, value=128).contiguous() 144 | if sem_seg_gt is not None: 145 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 146 | 147 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 148 | 149 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 150 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 151 | # Therefore it's important to use torch.Tensor. 152 | dataset_dict["image"] = image 153 | 154 | if sem_seg_gt is not None: 155 | dataset_dict["sem_seg"] = sem_seg_gt.long() 156 | 157 | if "annotations" in dataset_dict: 158 | raise ValueError("Semantic segmentation dataset should not have 'annotations'.") 159 | 160 | # Prepare per-category binary masks 161 | if sem_seg_gt is not None: 162 | sem_seg_gt = sem_seg_gt.numpy() 163 | instances = Instances(image_shape) 164 | classes = np.unique(sem_seg_gt) 165 | # remove ignored region 166 | classes = classes[classes != self.ignore_label] 167 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 168 | 169 | masks = [] 170 | for class_id in classes: 171 | masks.append(sem_seg_gt == class_id) 172 | 173 | if len(masks) == 0: 174 | # Some image does not have annotation (all ignored) 175 | instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])) 176 | else: 177 | masks = BitMasks( 178 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 179 | ) 180 | instances.gt_masks = masks.tensor 181 | 182 | dataset_dict["instances"] = instances 183 | 184 | return dataset_dict 185 | -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | import fvcore.nn.weight_init as weight_init 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from detectron2.config import configurable 9 | from detectron2.layers import Conv2d 10 | from detectron2.utils.registry import Registry 11 | 12 | from .position_encoding import PositionEmbeddingSine 13 | from .transformer import Transformer 14 | 15 | 16 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") 17 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """ 18 | Registry for transformer module in MaskFormer. 19 | """ 20 | 21 | 22 | def build_transformer_decoder(cfg, in_channels, mask_classification=True): 23 | """ 24 | Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. 25 | """ 26 | name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME 27 | return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) 28 | 29 | 30 | @TRANSFORMER_DECODER_REGISTRY.register() 31 | class StandardTransformerDecoder(nn.Module): 32 | @configurable 33 | def __init__( 34 | self, 35 | in_channels, 36 | mask_classification=True, 37 | *, 38 | num_classes: int, 39 | hidden_dim: int, 40 | num_queries: int, 41 | nheads: int, 42 | dropout: float, 43 | dim_feedforward: int, 44 | enc_layers: int, 45 | dec_layers: int, 46 | pre_norm: bool, 47 | deep_supervision: bool, 48 | mask_dim: int, 49 | enforce_input_project: bool, 50 | ): 51 | """ 52 | NOTE: this interface is experimental. 53 | Args: 54 | in_channels: channels of the input features 55 | mask_classification: whether to add mask classifier or not 56 | num_classes: number of classes 57 | hidden_dim: Transformer feature dimension 58 | num_queries: number of queries 59 | nheads: number of heads 60 | dropout: dropout in Transformer 61 | dim_feedforward: feature dimension in feedforward network 62 | enc_layers: number of Transformer encoder layers 63 | dec_layers: number of Transformer decoder layers 64 | pre_norm: whether to use pre-LayerNorm or not 65 | deep_supervision: whether to add supervision to every decoder layers 66 | mask_dim: mask feature dimension 67 | enforce_input_project: add input project 1x1 conv even if input 68 | channels and hidden dim is identical 69 | """ 70 | super().__init__() 71 | 72 | self.mask_classification = mask_classification 73 | 74 | # positional encoding 75 | N_steps = hidden_dim // 2 76 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 77 | 78 | transformer = Transformer( 79 | d_model=hidden_dim, 80 | dropout=dropout, 81 | nhead=nheads, 82 | dim_feedforward=dim_feedforward, 83 | num_encoder_layers=enc_layers, 84 | num_decoder_layers=dec_layers, 85 | normalize_before=pre_norm, 86 | return_intermediate_dec=deep_supervision, 87 | ) 88 | 89 | self.num_queries = num_queries 90 | self.transformer = transformer 91 | hidden_dim = transformer.d_model 92 | 93 | self.query_embed = nn.Embedding(num_queries, hidden_dim) 94 | 95 | if in_channels != hidden_dim or enforce_input_project: 96 | self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) 97 | weight_init.c2_xavier_fill(self.input_proj) 98 | else: 99 | self.input_proj = nn.Sequential() 100 | self.aux_loss = deep_supervision 101 | 102 | # output FFNs 103 | if self.mask_classification: 104 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1) 105 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) 106 | 107 | @classmethod 108 | def from_config(cls, cfg, in_channels, 
mask_classification): 109 | ret = {} 110 | ret["in_channels"] = in_channels 111 | ret["mask_classification"] = mask_classification 112 | 113 | ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES 114 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM 115 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES 116 | # Transformer parameters: 117 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS 118 | ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT 119 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD 120 | ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS 121 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS 122 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM 123 | ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION 124 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ 125 | 126 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 127 | 128 | return ret 129 | 130 | def forward(self, x, mask_features, mask=None): 131 | if mask is not None: 132 | mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 133 | pos = self.pe_layer(x, mask) 134 | 135 | src = x 136 | hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos) 137 | 138 | if self.mask_classification: 139 | outputs_class = self.class_embed(hs) 140 | out = {"pred_logits": outputs_class[-1]} 141 | else: 142 | out = {} 143 | 144 | if self.aux_loss: 145 | # [l, bs, queries, embed] 146 | mask_embed = self.mask_embed(hs) 147 | outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) 148 | out["pred_masks"] = outputs_seg_masks[-1] 149 | out["aux_outputs"] = self._set_aux_loss( 150 | outputs_class if self.mask_classification else None, outputs_seg_masks 151 | ) 152 | else: 153 | # FIXME h_boxes takes the last one computed, keep this in mind 154 | # [bs, queries, embed] 155 | mask_embed = self.mask_embed(hs[-1]) 156 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 157 | out["pred_masks"] = outputs_seg_masks 158 | return out 159 | 160 | @torch.jit.unused 161 | def _set_aux_loss(self, outputs_class, outputs_seg_masks): 162 | # this is a workaround to make torchscript happy, as torchscript 163 | # doesn't support dictionary with non-homogeneous values, such 164 | # as a dict having both a Tensor and a list. 165 | if self.mask_classification: 166 | return [ 167 | {"pred_logits": a, "pred_masks": b} 168 | for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) 169 | ] 170 | else: 171 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] 172 | 173 | 174 | class MLP(nn.Module): 175 | """Very simple multi-layer perceptron (also called FFN)""" 176 | 177 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 178 | super().__init__() 179 | self.num_layers = num_layers 180 | h = [hidden_dim] * (num_layers - 1) 181 | self.layers = nn.ModuleList( 182 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 183 | ) 184 | 185 | def forward(self, x): 186 | for i, layer in enumerate(self.layers): 187 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 188 | return x 189 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py 3 | import copy 4 | import logging 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.data.transforms import TransformGen 13 | from detectron2.structures import BitMasks, Instances 14 | 15 | from pycocotools import mask as coco_mask 16 | 17 | __all__ = ["COCOInstanceNewBaselineDatasetMapper"] 18 | 19 | 20 | def convert_coco_poly_to_mask(segmentations, height, width): 21 | masks = [] 22 | for polygons in segmentations: 23 | rles = coco_mask.frPyObjects(polygons, height, width) 24 | mask = coco_mask.decode(rles) 25 | if len(mask.shape) < 3: 26 | mask = mask[..., None] 27 | mask = torch.as_tensor(mask, dtype=torch.uint8) 28 | mask = mask.any(dim=2) 29 | masks.append(mask) 30 | if masks: 31 | masks = torch.stack(masks, dim=0) 32 | else: 33 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 34 | return masks 35 | 36 | 37 | def build_transform_gen(cfg, is_train): 38 | """ 39 | Create a list of default :class:`Augmentation` from config. 40 | Now it includes resizing and flipping. 41 | Returns: 42 | list[Augmentation] 43 | """ 44 | assert is_train, "Only support training augmentation" 45 | image_size = cfg.INPUT.IMAGE_SIZE 46 | min_scale = cfg.INPUT.MIN_SCALE 47 | max_scale = cfg.INPUT.MAX_SCALE 48 | 49 | augmentation = [] 50 | 51 | if cfg.INPUT.RANDOM_FLIP != "none": 52 | augmentation.append( 53 | T.RandomFlip( 54 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 55 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 56 | ) 57 | ) 58 | 59 | augmentation.extend([ 60 | T.ResizeScale( 61 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 62 | ), 63 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 64 | ]) 65 | 66 | return augmentation 67 | 68 | 69 | # This is specifically designed for the COCO dataset. 70 | class COCOInstanceNewBaselineDatasetMapper: 71 | """ 72 | A callable which takes a dataset dict in Detectron2 Dataset format, 73 | and map it into a format used by MaskFormer. 74 | 75 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 76 | 77 | The callable currently does the following: 78 | 79 | 1. Read the image from "file_name" 80 | 2. Applies geometric transforms to the image and annotation 81 | 3. Find and applies suitable cropping to the image and annotation 82 | 4. Prepare image and annotation to Tensors 83 | """ 84 | 85 | @configurable 86 | def __init__( 87 | self, 88 | is_train=True, 89 | *, 90 | tfm_gens, 91 | image_format, 92 | ): 93 | """ 94 | NOTE: this interface is experimental. 95 | Args: 96 | is_train: for training or inference 97 | augmentations: a list of augmentations or deterministic transforms to apply 98 | tfm_gens: data augmentation 99 | image_format: an image format supported by :func:`detection_utils.read_image`. 
100 | """ 101 | self.tfm_gens = tfm_gens 102 | logging.getLogger(__name__).info( 103 | "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens)) 104 | ) 105 | 106 | self.img_format = image_format 107 | self.is_train = is_train 108 | 109 | @classmethod 110 | def from_config(cls, cfg, is_train=True): 111 | # Build augmentation 112 | tfm_gens = build_transform_gen(cfg, is_train) 113 | 114 | ret = { 115 | "is_train": is_train, 116 | "tfm_gens": tfm_gens, 117 | "image_format": cfg.INPUT.FORMAT, 118 | } 119 | return ret 120 | 121 | def __call__(self, dataset_dict): 122 | """ 123 | Args: 124 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 125 | 126 | Returns: 127 | dict: a format that builtin models in detectron2 accept 128 | """ 129 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 130 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 131 | utils.check_image_size(dataset_dict, image) 132 | 133 | # TODO: get padding mask 134 | # by feeding a "segmentation mask" to the same transforms 135 | padding_mask = np.ones(image.shape[:2]) 136 | 137 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 138 | # the crop transformation has default padding value 0 for segmentation 139 | padding_mask = transforms.apply_segmentation(padding_mask) 140 | padding_mask = ~ padding_mask.astype(bool) 141 | 142 | image_shape = image.shape[:2] # h, w 143 | 144 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 145 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 146 | # Therefore it's important to use torch.Tensor. 147 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 148 | dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask)) 149 | 150 | if not self.is_train: 151 | # USER: Modify this if you want to keep them for some reason. 152 | dataset_dict.pop("annotations", None) 153 | return dataset_dict 154 | 155 | if "annotations" in dataset_dict: 156 | # USER: Modify this if you want to keep them for some reason. 157 | for anno in dataset_dict["annotations"]: 158 | # Let's always keep mask 159 | # if not self.mask_on: 160 | # anno.pop("segmentation", None) 161 | anno.pop("keypoints", None) 162 | 163 | # USER: Implement additional transformations if you have other types of data 164 | annos = [ 165 | utils.transform_instance_annotations(obj, transforms, image_shape) 166 | for obj in dataset_dict.pop("annotations") 167 | if obj.get("iscrowd", 0) == 0 168 | ] 169 | # NOTE: does not support BitMask due to augmentation 170 | # Current BitMask cannot handle empty objects 171 | instances = utils.annotations_to_instances(annos, image_shape) 172 | # After transforms such as cropping are applied, the bounding box may no longer 173 | # tightly bound the object. As an example, imagine a triangle object 174 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight 175 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to 176 | # the intersection of original bounding box and the cropping box. 
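            # Worked check of the example above: inside the crop, the triangle becomes the region
            # {x >= 1, y >= 0, x + y <= 2}, so y never exceeds 1 and the tight box is [(1,0),(2,1)]
            # rather than the clipped original box [(1,0),(2,2)]; gt_boxes is therefore recomputed
            # from gt_masks below.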
177 |             instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
178 |             # Need to filter empty instances first (due to augmentation)
179 |             instances = utils.filter_empty_instances(instances)
180 |             # Generate masks from polygon
181 |             h, w = instances.image_size
182 |             # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
183 |             if hasattr(instances, 'gt_masks'):
184 |                 gt_masks = instances.gt_masks
185 |                 gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
186 |                 instances.gt_masks = gt_masks
187 |             dataset_dict["instances"] = instances
188 | 
189 |         return dataset_dict
190 | 
--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include <vector>
17 | #include "cuda/ms_deform_im2col_cuda.cuh"
18 | 
19 | #include <ATen/ATen.h>
20 | #include <ATen/cuda/CUDAContext.h>
21 | #include <cuda.h>
22 | #include <cuda_runtime.h>
23 | 
24 | 
25 | at::Tensor ms_deform_attn_cuda_forward(
26 |     const at::Tensor &value, 
27 |     const at::Tensor &spatial_shapes,
28 |     const at::Tensor &level_start_index,
29 |     const at::Tensor &sampling_loc,
30 |     const at::Tensor &attn_weight,
31 |     const int im2col_step)
32 | {
33 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
34 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
35 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
36 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
37 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
38 | 
39 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
40 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
41 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
42 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
43 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
44 | 
45 |     const int batch = value.size(0);
46 |     const int spatial_size = value.size(1);
47 |     const int num_heads = value.size(2);
48 |     const int channels = value.size(3);
49 | 
50 |     const int num_levels = spatial_shapes.size(0);
51 | 
52 |     const int num_query = sampling_loc.size(1);
53 |     const int num_point = sampling_loc.size(4);
54 | 
55 |     const int im2col_step_ = std::min(batch, im2col_step);
56 | 
57 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
58 | 
59 |     auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
60 | 
61 |     const int batch_n = im2col_step_;
62 |     auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
63 |     auto per_value_size = spatial_size * num_heads * channels;
64 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
65 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
66 |     for (int n = 0; n < batch/im2col_step_; ++n)
67 |     {
68 |         auto columns = output_n.select(0, n);
69 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
70 |             ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
71 |                 value.data<scalar_t>() + n * im2col_step_ * per_value_size,
72 |                 spatial_shapes.data<int64_t>(),
73 |                 level_start_index.data<int64_t>(),
74 |                 sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
75 |                 attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
76 |                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
77 |                 columns.data<scalar_t>());
78 | 
79 |         }));
80 |     }
81 | 
82 |     output = output.view({batch, num_query, num_heads*channels});
83 | 
84 |     return output;
85 | }
86 | 
87 | 
88 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
89 |     const at::Tensor &value, 
90 |     const at::Tensor &spatial_shapes,
91 |     const at::Tensor &level_start_index,
92 |     const at::Tensor &sampling_loc,
93 |     const at::Tensor &attn_weight,
94 |     const at::Tensor &grad_output,
95 |     const int im2col_step)
96 | {
97 | 
98 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
99 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
100 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
101 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
102 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
103 |     AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
104 | 
105 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
106 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
107 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
108 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
109 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
110 |     AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
111 | 
112 |     const int batch = value.size(0);
113 |     const int spatial_size = value.size(1);
114 |     const int num_heads = value.size(2);
115 |     const int channels = value.size(3);
116 | 
117 |     const int num_levels = spatial_shapes.size(0);
118 | 
119 |     const int num_query = sampling_loc.size(1);
120 |     const int num_point = sampling_loc.size(4);
121 | 
122 |     const int im2col_step_ = std::min(batch, im2col_step);
123 | 
124 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
125 | 
126 |     auto grad_value = at::zeros_like(value);
127 |     auto grad_sampling_loc = at::zeros_like(sampling_loc);
128 |     auto grad_attn_weight = at::zeros_like(attn_weight);
129 | 
130 |     const int batch_n = im2col_step_;
131 |     auto per_value_size = spatial_size * num_heads * channels;
132 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
133 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
134 |     auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
135 | 
136 |     for (int n = 0; n < batch/im2col_step_; ++n)
137 |     {
138 |         auto grad_output_g = grad_output_n.select(0, n);
139 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
140 |             ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
141 |                 grad_output_g.data<scalar_t>(),
142 |                 value.data<scalar_t>() + n * im2col_step_ * per_value_size,
143 |                 spatial_shapes.data<int64_t>(),
144 |                 level_start_index.data<int64_t>(),
145 |                 sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
146 |                 attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
147 |                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
148 |                 grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
149 |                 grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
150 |                 grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
151 | 
152 |         }));
153 |     }
154 | 
155 |     return {
156 |         grad_value, grad_sampling_loc, grad_attn_weight
157 |     };
158 | }
--------------------------------------------------------------------------------
/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import json
3 | import os
4 | 
5 | from detectron2.data import DatasetCatalog, MetadataCatalog
6 | from detectron2.data.datasets import load_sem_seg
7 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
8 | from detectron2.utils.file_io import PathManager
9 | 
10 | 
11 | _PREDEFINED_SPLITS_COCO_PANOPTIC = {
12 |     "coco_2017_train_panoptic": (
13 |         # This is the original panoptic annotation directory
14 |         "coco/panoptic_train2017",
15 |         "coco/annotations/panoptic_train2017.json",
16 |         # This directory contains semantic annotations that are
17 |         # converted from panoptic annotations.
18 |         # It is used by PanopticFPN.
19 |         # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
20 |         # to create these directories.
21 |         "coco/panoptic_semseg_train2017",
22 |     ),
23 |     "coco_2017_val_panoptic": (
24 |         "coco/panoptic_val2017",
25 |         "coco/annotations/panoptic_val2017.json",
26 |         "coco/panoptic_semseg_val2017",
27 |     ),
28 | }
29 | 
30 | 
31 | def get_metadata():
32 |     meta = {}
33 |     # The following metadata maps contiguous id from [0, #thing categories +
34 |     # #stuff categories) to their names and colors. We have two copies of the
35 |     # same name and color under "thing_*" and "stuff_*" because the current
36 |     # visualization function in D2 handles thing and stuff classes differently
37 |     # due to some heuristic used in Panoptic FPN. We keep the same naming to
38 |     # enable reusing existing visualization functions.
39 |     thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
40 |     thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
41 |     stuff_classes = [k["name"] for k in COCO_CATEGORIES]
42 |     stuff_colors = [k["color"] for k in COCO_CATEGORIES]
43 | 
44 |     meta["thing_classes"] = thing_classes
45 |     meta["thing_colors"] = thing_colors
46 |     meta["stuff_classes"] = stuff_classes
47 |     meta["stuff_colors"] = stuff_colors
48 | 
49 |     # Convert category id for training:
50 |     #   category id: like semantic segmentation, it is the class id for each
51 |     #   pixel. Since there are some classes not used in evaluation, the category
52 |     #   id is not always contiguous and thus we have two sets of category ids:
53 |     #       - original category id: category id in the original dataset, mainly
54 |     #           used for evaluation.
55 | # - contiguous category id: [0, #classes), in order to train the linear 56 | # softmax classifier. 57 | thing_dataset_id_to_contiguous_id = {} 58 | stuff_dataset_id_to_contiguous_id = {} 59 | 60 | for i, cat in enumerate(COCO_CATEGORIES): 61 | if cat["isthing"]: 62 | thing_dataset_id_to_contiguous_id[cat["id"]] = i 63 | # else: 64 | # stuff_dataset_id_to_contiguous_id[cat["id"]] = i 65 | 66 | # in order to use sem_seg evaluator 67 | stuff_dataset_id_to_contiguous_id[cat["id"]] = i 68 | 69 | meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id 70 | meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id 71 | 72 | return meta 73 | 74 | 75 | def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): 76 | """ 77 | Args: 78 | image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". 79 | gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". 80 | json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". 81 | Returns: 82 | list[dict]: a list of dicts in Detectron2 standard format. (See 83 | `Using Custom Datasets `_ ) 84 | """ 85 | 86 | def _convert_category_id(segment_info, meta): 87 | if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: 88 | segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ 89 | segment_info["category_id"] 90 | ] 91 | segment_info["isthing"] = True 92 | else: 93 | segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ 94 | segment_info["category_id"] 95 | ] 96 | segment_info["isthing"] = False 97 | return segment_info 98 | 99 | with PathManager.open(json_file) as f: 100 | json_info = json.load(f) 101 | 102 | ret = [] 103 | for ann in json_info["annotations"]: 104 | image_id = int(ann["image_id"]) 105 | # TODO: currently we assume image and label has the same filename but 106 | # different extension, and images have extension ".jpg" for COCO. Need 107 | # to make image extension a user-provided argument if we extend this 108 | # function to support other COCO-like datasets. 109 | image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") 110 | label_file = os.path.join(gt_dir, ann["file_name"]) 111 | sem_label_file = os.path.join(semseg_dir, ann["file_name"]) 112 | segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] 113 | ret.append( 114 | { 115 | "file_name": image_file, 116 | "image_id": image_id, 117 | "pan_seg_file_name": label_file, 118 | "sem_seg_file_name": sem_label_file, 119 | "segments_info": segments_info, 120 | } 121 | ) 122 | assert len(ret), f"No images found in {image_dir}!" 
123 | assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] 124 | assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] 125 | assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] 126 | return ret 127 | 128 | 129 | def register_coco_panoptic_annos_sem_seg( 130 | name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json 131 | ): 132 | panoptic_name = name 133 | delattr(MetadataCatalog.get(panoptic_name), "thing_classes") 134 | delattr(MetadataCatalog.get(panoptic_name), "thing_colors") 135 | MetadataCatalog.get(panoptic_name).set( 136 | thing_classes=metadata["thing_classes"], 137 | thing_colors=metadata["thing_colors"], 138 | # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"], 139 | ) 140 | 141 | # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg" 142 | semantic_name = name + "_with_sem_seg" 143 | DatasetCatalog.register( 144 | semantic_name, 145 | lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, sem_seg_root, metadata), 146 | ) 147 | MetadataCatalog.get(semantic_name).set( 148 | sem_seg_root=sem_seg_root, 149 | panoptic_root=panoptic_root, 150 | image_root=image_root, 151 | panoptic_json=panoptic_json, 152 | json_file=instances_json, 153 | evaluator_type="coco_panoptic_seg", 154 | ignore_label=255, 155 | label_divisor=1000, 156 | **metadata, 157 | ) 158 | 159 | 160 | def register_all_coco_panoptic_annos_sem_seg(root): 161 | for ( 162 | prefix, 163 | (panoptic_root, panoptic_json, semantic_root), 164 | ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): 165 | prefix_instances = prefix[: -len("_panoptic")] 166 | instances_meta = MetadataCatalog.get(prefix_instances) 167 | image_root, instances_json = instances_meta.image_root, instances_meta.json_file 168 | 169 | register_coco_panoptic_annos_sem_seg( 170 | prefix, 171 | get_metadata(), 172 | image_root, 173 | os.path.join(root, panoptic_root), 174 | os.path.join(root, panoptic_json), 175 | os.path.join(root, semantic_root), 176 | instances_json, 177 | ) 178 | 179 | 180 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 181 | register_all_coco_panoptic_annos_sem_seg(_root) 182 | -------------------------------------------------------------------------------- /uni_dvps/data_video/datasets/semkitti_dvps.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import io 3 | import json 4 | import logging 5 | import numpy as np 6 | import os 7 | import tqdm 8 | import pycocotools.mask as mask_util 9 | from fvcore.common.file_io import PathManager 10 | from fvcore.common.timer import Timer 11 | 12 | from detectron2.structures import Boxes, BoxMode, PolygonMasks 13 | from detectron2.data import DatasetCatalog, MetadataCatalog 14 | 15 | """ 16 | This file contains functions to parse SemKITTI-DVPS dataset into dicts in "Detectron2 format". 
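
The gt_json consumed by load_semkitti_dvps_json below is, roughly, a nested mapping of
video id -> frame id -> {"image", "class", "instance", "depth", "height", "width"},
where frame id "000000" starts a new video record for the training split.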
17 | """ 18 | logger = logging.getLogger(__name__) 19 | __all__ = ["load_semkitti_dvps_json", "register_semkitti_dvps"] 20 | 21 | SEMKITTI_CATEGORIES = [ 22 | {"color": (245, 150, 100), "isthing": 1, "id": 10, "trainId": 0, "name": "car"}, 23 | {"color": (245, 230, 100), "isthing": 1, "id": 11, "trainId": 1, "name": "bicycle"}, 24 | {"color": (150, 60, 30), "isthing": 1, "id": 15, "trainId": 2, "name": "motorcycle"}, 25 | {"color": (180, 30, 80), "isthing": 1, "id": 18, "trainId": 3, "name": "truck"}, 26 | {"color": (255, 0, 0), "isthing": 1, "id": 20, "trainId": 4, "name": "other-vehicle"}, 27 | {"color": ( 30, 30, 255), "isthing": 1, "id": 30, "trainId": 5, "name": "person"}, 28 | {"color": (200, 40, 255), "isthing": 1, "id": 31, "trainId": 6, "name": "bicyclist"}, 29 | {"color": ( 90, 30, 150), "isthing": 1, "id": 32, "trainId": 7, "name": "motorcyclist"}, 30 | 31 | {"color": (255, 0, 255), "isthing": 0, "id": 40, "trainId": 8, "name": "road"}, 32 | {"color": (255, 150, 255), "isthing": 0, "id": 44, "trainId": 9, "name": "parking"}, 33 | {"color": ( 75, 0, 75), "isthing": 0, "id": 48, "trainId": 10, "name": "sidewalk"}, 34 | {"color": ( 75, 0, 175), "isthing": 0, "id": 49, "trainId": 11, "name": "other-ground"}, 35 | {"color": ( 0, 200, 255), "isthing": 0, "id": 50, "trainId": 12, "name": "building"}, 36 | {"color": ( 50, 120, 255), "isthing": 0, "id": 51, "trainId": 13, "name": "fence"}, 37 | {"color": ( 0, 175, 0), "isthing": 0, "id": 70, "trainId": 14, "name": "vegetation"}, 38 | {"color": ( 0, 60, 135), "isthing": 0, "id": 71, "trainId": 15, "name": "trunk"}, 39 | {"color": ( 80, 240, 150), "isthing": 0, "id": 72, "trainId": 16, "name": "terrain"}, 40 | {"color": (150, 240, 255), "isthing": 0, "id": 80, "trainId": 17, "name": "pole"}, 41 | {"color": ( 0, 0, 255), "isthing": 0, "id": 81, "trainId": 18, "name": "traffic-sign"}, 42 | ] 43 | 44 | def _get_semkitti_dvps_meta(): 45 | thing_ids = [k["trainId"] for k in SEMKITTI_CATEGORIES if k["isthing"] == 1] 46 | thing_classes = [k["name"] for k in SEMKITTI_CATEGORIES if k["isthing"] == 1] 47 | thing_colors = [k["color"] for k in SEMKITTI_CATEGORIES if k["isthing"] == 1] 48 | stuff_ids = [k["trainId"] for k in SEMKITTI_CATEGORIES if k["isthing"] == 0] 49 | stuff_classes = [k["name"] for k in SEMKITTI_CATEGORIES if k["isthing"] == 0] 50 | stuff_colors = [k["color"] for k in SEMKITTI_CATEGORIES if k["isthing"] == 0] 51 | 52 | assert len(thing_ids) == 8, len(thing_ids) 53 | assert len(stuff_ids) == 11, len(stuff_ids) 54 | 55 | # Mapping from the incontiguous SEMKITTI_DVPS category id to an id in [0, 10] 56 | thing_train_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 57 | stuff_train_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} 58 | 59 | ret = { 60 | "thing_ids": thing_ids, 61 | "thing_classes": thing_classes, 62 | "thing_colors": thing_colors, 63 | "thing_train_id_to_contiguous_id": thing_train_id_to_contiguous_id, 64 | "stuff_ids": stuff_ids, 65 | "stuff_classes": stuff_classes, 66 | "stuff_colors": stuff_colors, 67 | "stuff_train_id_to_contiguous_id": stuff_train_id_to_contiguous_id 68 | } 69 | 70 | return ret 71 | 72 | def load_semkitti_dvps_json(gt_json, image_dir, gt_dir, meta, name): 73 | assert os.path.exists(gt_json), gt_json+" not exists" 74 | with open(gt_json) as f: 75 | file_dicts = json.load(f) 76 | 77 | dataset_dicts = [] 78 | if 'train' in name: 79 | for vid in file_dicts.keys(): 80 | for fid in file_dicts[vid].keys(): 81 | if fid == "000000": 82 | record = {} 83 | record["video_id"] = vid 84 | 
record["height"] = file_dicts[vid][fid]["height"] 85 | record["width"] = file_dicts[vid][fid]["width"] 86 | record["file_names"] = [os.path.join(image_dir, file_dicts[vid][fid]["image"])] 87 | record["class_file_names"] = [os.path.join(image_dir, file_dicts[vid][fid]["class"])] 88 | record["instance_file_names"] = [os.path.join(image_dir, file_dicts[vid][fid]["instance"])] 89 | record["depth_file_names"] = [os.path.join(image_dir, file_dicts[vid][fid]["depth"])] 90 | dataset_dicts.append(record) 91 | else: 92 | dataset_dicts[-1]["file_names"].append(os.path.join(image_dir, file_dicts[vid][fid]["image"])) 93 | dataset_dicts[-1]["class_file_names"].append(os.path.join(image_dir, file_dicts[vid][fid]["class"])) 94 | dataset_dicts[-1]["instance_file_names"].append(os.path.join(image_dir, file_dicts[vid][fid]["instance"])) 95 | dataset_dicts[-1]["depth_file_names"].append(os.path.join(image_dir, file_dicts[vid][fid]["depth"])) 96 | 97 | elif 'val' in name: 98 | len_vid = int(name.split('val')[1]) 99 | for vid in file_dicts.keys(): 100 | for fid in file_dicts[vid].keys(): 101 | if int(fid)+len_vid > len(file_dicts[vid]): 102 | continue 103 | for i in range(len_vid): 104 | if i == 0: 105 | record = {} 106 | record["video_id"] = vid 107 | record["height"] = file_dicts[vid][fid]["height"] 108 | record["width"] = file_dicts[vid][fid]["width"] 109 | record["file_names"] = [os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][fid]["image"])] 110 | record["class_file_names"] = [os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][fid]["class"])] 111 | record["instance_file_names"] = [os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][fid]["instance"])] 112 | record["depth_file_names"] = [os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][fid]["depth"])] 113 | dataset_dicts.append(record) 114 | i += 1 115 | else: 116 | next_fid = '{0:06d}'.format(int(fid)+i) 117 | dataset_dicts[-1]["file_names"].append(os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][next_fid]["image"])) 118 | dataset_dicts[-1]["class_file_names"].append(os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][next_fid]["class"])) 119 | dataset_dicts[-1]["instance_file_names"].append(os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][next_fid]["instance"])) 120 | dataset_dicts[-1]["depth_file_names"].append(os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][next_fid]["depth"])) 121 | i += 1 122 | 123 | 124 | # logger.info("Loaded {} images from {}".format(len(file_dicts), image_dir)) 125 | return dataset_dicts 126 | 127 | 128 | def register_semkitti_dvps(name, meta, gt_json, image_dir, gt_dir): 129 | """ 130 | Register a dataset in Cityscapes_DVPS's json annotation format for DVPS. 131 | """ 132 | assert isinstance(name, str), name 133 | assert isinstance(gt_json, (str, os.PathLike)), gt_json 134 | assert isinstance(image_dir, (str, os.PathLike)), image_dir 135 | assert isinstance(gt_dir, (str, os.PathLike)), gt_dir 136 | 137 | DatasetCatalog.register(name, lambda: load_semkitti_dvps_json(gt_json, image_dir, gt_dir, meta, name)) 138 | MetadataCatalog.get(name).set( 139 | panoptic_root=gt_dir, 140 | image_root=image_dir, 141 | gt_dir=gt_dir, 142 | evaluator_type="semkitti_dvps", 143 | ignore_label=255, 144 | **meta, 145 | ) 146 | --------------------------------------------------------------------------------