├── mask2former ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── utils │ ├── __init__.py │ └── misc.py ├── modeling │ ├── backbone │ │ └── __init__.py │ ├── meta_arch │ │ ├── __init__.py │ │ └── mask_former_head.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ └── ops │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── src │ │ │ ├── vision.cpp │ │ │ ├── cuda │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_attn_cuda.cu │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ └── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn.h │ │ │ ├── setup.py │ │ │ └── test.py │ ├── transformer_decoder │ │ ├── __init__.py │ │ ├── position_encoding.py │ │ └── maskformer_transformer_decoder.py │ └── __init__.py ├── data │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ ├── mask_former_instance_dataset_mapper.py │ │ ├── mask_former_semantic_dataset_mapper.py │ │ └── coco_instance_new_baseline_dataset_mapper.py │ ├── __init__.py │ └── datasets │ │ ├── __init__.py │ │ ├── register_ade20k_instance.py │ │ └── register_coco_panoptic_annos_semseg.py ├── __init__.py ├── test_time_augmentation.py └── config.py ├── requirements.txt ├── .gitignore ├── minvis ├── utils │ ├── __init__.py │ └── lr_scheduler.py ├── data_video │ ├── datasets │ │ ├── ytvis_api │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── builtin.py │ ├── __init__.py │ └── augmentation.py ├── config.py ├── __init__.py └── video_mask2former_transformer_decoder.py ├── assets ├── demo.png └── teaser.png ├── mask2former_video ├── utils │ ├── __init__.py │ └── memory.py ├── data_video │ ├── datasets │ │ ├── ytvis_api │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── builtin.py │ ├── __init__.py │ └── augmentation.py ├── modeling │ ├── transformer_decoder │ │ ├── __init__.py │ │ └── position_encoding.py │ └── __init__.py ├── __init__.py └── config.py ├── uni_dvps ├── data_video │ ├── datasets │ │ ├── __init__.py │ │ ├── builtin.py │ │ ├── cityscapes_dvps.py │ │ └── semkitti_dvps.py │ ├── __init__.py │ ├── build.py │ └── augmentation.py ├── modeling │ ├── __init__.py │ └── meta_arch │ │ └── unified_decoder_head.py ├── __init__.py └── config.py ├── configs ├── CityscapesDVPS │ ├── swin │ │ └── swinL.yaml │ ├── base.yaml │ └── R50.yaml └── SemKITTIDVPS │ ├── R50.yaml │ └── base.yaml ├── demo └── demo.py ├── README.md └── train.py /mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | timm 2 | scipy 3 | einops 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | media/.DS_Store 3 | deprecated_media/ 4 | .idea/ 5 | -------------------------------------------------------------------------------- /minvis/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import add_deeplab_config, build_lr_scheduler -------------------------------------------------------------------------------- /assets/demo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kaist-ami/Uni-DVPS/HEAD/assets/demo.png -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaist-ami/Uni-DVPS/HEAD/assets/teaser.png -------------------------------------------------------------------------------- /mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former_video/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import datasets 3 | -------------------------------------------------------------------------------- /minvis/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /mask2former_video/data_video/datasets/ytvis_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/youtubevos/cocoapi 3 | -------------------------------------------------------------------------------- /mask2former_video/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | -------------------------------------------------------------------------------- /uni_dvps/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import builtin # ensure the builtin datasets are registered 2 | 3 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 4 | -------------------------------------------------------------------------------- /uni_dvps/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .meta_arch.unified_decoder_head import UnifiedDecoderHead 2 | from .transformer_decoder.unified_transformer_decoder import UnifiedTransformerDecoder 3 | from .unidvps_model import UniDVPS 4 | -------------------------------------------------------------------------------- /uni_dvps/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset_mapper import CityscapesDVPSDatasetMapper, SemkittiDVPSDatasetMapper 2 | from .build import * 3 | 4 | from .datasets import * 5 | from .evaluator import CityscapesDVPSEvaluator, SemkittiDVPSEvaluator -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /uni_dvps/__init__.py: -------------------------------------------------------------------------------- 1 | # config 2 | from .config import add_uni_dvps_config 3 | from . import modeling 4 | from .data_video import ( 5 | CityscapesDVPSDatasetMapper, 6 | CityscapesDVPSEvaluator, 7 | SemkittiDVPSDatasetMapper, 8 | SemkittiDVPSEvaluator, 9 | build_hooks, 10 | run_step 11 | ) -------------------------------------------------------------------------------- /minvis/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | -------------------------------------------------------------------------------- /minvis/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /mask2former_video/data_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from .dataset_mapper import YTVISDatasetMapper, CocoClipDatasetMapper 5 | from .build import * 6 | 7 | from .datasets import * 8 | from .ytvis_eval import YTVISEvaluator 9 | -------------------------------------------------------------------------------- /mask2former_video/data_video/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from . import builtin # ensure the builtin datasets are registered 5 | 6 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] 7 | -------------------------------------------------------------------------------- /mask2former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import ( 3 | register_ade20k_full, 4 | register_ade20k_panoptic, 5 | register_coco_stuff_10k, 6 | register_mapillary_vistas, 7 | register_coco_panoptic_annos_semseg, 8 | register_ade20k_instance, 9 | register_mapillary_vistas_panoptic, 10 | ) 11 | -------------------------------------------------------------------------------- /mask2former_video/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 3 | from .criterion import VideoSetCriterion, calculate_uncertainty, sigmoid_ce_loss_jit, dice_loss_jit 4 | from .matcher import VideoHungarianMatcher, batch_sigmoid_ce_loss_jit, batch_dice_loss 5 | -------------------------------------------------------------------------------- /mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /mask2former_video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import modeling 3 | 4 | # config 5 | from .config import add_maskformer2_video_config 6 | 7 | # models 8 | from .video_maskformer_model import VideoMaskFormer 9 | 10 | # video 11 | from .data_video import ( 12 | YTVISDatasetMapper, 13 | YTVISEvaluator, 14 | build_detection_train_loader, 15 | build_detection_test_loader, 16 | get_detection_dataset_dicts, 17 | ) 18 | -------------------------------------------------------------------------------- /mask2former_video/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_video_config(cfg): 7 | # video data 8 | # DataLoader 9 | cfg.INPUT.SAMPLING_FRAME_NUM = 2 10 | cfg.INPUT.SAMPLING_FRAME_RANGE = 20 11 | cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False 12 | cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" 13 | -------------------------------------------------------------------------------- /minvis/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) 2021-2022, NVIDIA Corporation & Affiliates. All rights reserved. 3 | # 4 | # This work is made available under the Nvidia Source Code License-NC. 5 | # To view a copy of this license, visit 6 | # https://github.com/NVlabs/MinVIS/blob/main/LICENSE 7 | 8 | # Copyright (c) Facebook, Inc. and its affiliates. 9 | from detectron2.config import CfgNode as CN 10 | 11 | 12 | def add_minvis_config(cfg): 13 | cfg.INPUT.SAMPLING_FRAME_RATIO = 1.0 14 | cfg.MODEL.MASK_FORMER.TEST.WINDOW_INFERENCE = False 15 | 16 | -------------------------------------------------------------------------------- /configs/CityscapesDVPS/swin/swinL.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../base.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "/local_data2/ryeon/mask2former/pre_weight/swin_large_patch4_window12_384_22k.pth" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 -------------------------------------------------------------------------------- /minvis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA Corporation & Affiliates. All rights reserved. 2 | # 3 | # This work is made available under the Nvidia Source Code License-NC. 4 | # To view a copy of this license, visit 5 | # https://github.com/NVlabs/MinVIS/blob/main/LICENSE 6 | 7 | # Copyright (c) Facebook, Inc. and its affiliates. 8 | 9 | # config 10 | from .config import add_minvis_config 11 | 12 | # models 13 | from .video_maskformer_model import VideoMaskFormer_frame 14 | from .video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder_frame 15 | 16 | # video 17 | from .data_video import ( 18 | YTVISDatasetMapper, 19 | YTVISEvaluator, 20 | build_detection_train_loader, 21 | build_detection_test_loader, 22 | get_detection_dataset_dicts, 23 | ) 24 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . 
import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | 25 | # evaluation 26 | from .evaluation.instance_evaluation import InstanceSegEvaluator 27 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /configs/CityscapesDVPS/base.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TEST: ("cityscapes_dvps_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 10000 23 | WARMUP_FACTOR: 1.0 24 | WEIGHT_DECAY: 0.05 25 | OPTIMIZER: "ADAMW" 26 | LR_SCHEDULER_NAME: "WarmupPolyLR" 27 | BACKBONE_MULTIPLIER: 0.1 28 | CLIP_GRADIENTS: 29 | ENABLED: True 30 | CLIP_TYPE: "full_model" 31 | CLIP_VALUE: 0.01 32 | NORM_TYPE: 2.0 33 | AMP: 34 | ENABLED: True 35 | INPUT: 36 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(8, 13)]"] 37 | MAX_SIZE_TRAIN: 4096 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TEST: 2048 41 | RANDOM_FLIP: "flip_by_clip" 42 | SIZE_DIVISIBILITY: -1 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute" 46 | SIZE: (1024, 2048) 47 | RESCALE: (1.0, 2.0) 48 | WITH_PAD: True 49 | SINGLE_CATEGORY_MAX_AREA: 1.0 50 | COLOR_AUG_SSD: True 51 | FORMAT: "RGB" 52 | TEST: 53 | EVAL_PERIOD: 500 54 | LOG_PERIOD: 50 55 | DATALOADER: 56 | FILTER_EMPTY_ANNOTATIONS: False 57 | NUM_WORKERS: 4 58 | VERSION: 2 59 | -------------------------------------------------------------------------------- /configs/CityscapesDVPS/R50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: base.yaml 2 | MODEL: 3 | # WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "UniDVPS" 5 | SEM_SEG_HEAD: 6 | NAME: "UnifiedDecoderHead" 7 | IGNORE_VALUE: 32000 8 | NUM_CLASSES: 19 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | UNIFIED_FORMER: 20 | TRANSFORMER_DECODER_NAME: "UnifiedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | MATCHER: "dvps_matcher" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 2.0 26 | MASK_WEIGHT: 5.0 27 
| DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 12544 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TEST: 42 | SEMANTIC_ON: False 43 | INSTANCE_ON: True 44 | PANOPTIC_ON: False 45 | OVERLAP_THRESHOLD: 0.8 46 | OBJECT_MASK_THRESHOLD: 0.8 47 | DEPTH_DIM: 256 48 | DEPTH_MAX: 80. 49 | SILOG_WEIGHT: 3.0 50 | REL_SQR_WEIGHT: 3.0 51 | REL_ABS_WEIGHT: 3.0 52 | INPUT: 53 | SAMPLING_FRAME_NUM: 1 54 | -------------------------------------------------------------------------------- /configs/SemKITTIDVPS/R50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: base.yaml 2 | MODEL: 3 | # WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "UniDVPS" 5 | SEM_SEG_HEAD: 6 | NAME: "UnifiedDecoderHead" 7 | IGNORE_VALUE: 255000 8 | NUM_CLASSES: 19 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | UNIFIED_FORMER: 20 | TRANSFORMER_DECODER_NAME: "UnifiedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | MATCHER: "dvps_matcher" 23 | DEEP_SUPERVISION: True 24 | NO_OBJECT_WEIGHT: 0.1 25 | CLASS_WEIGHT: 2.0 26 | MASK_WEIGHT: 5.0 27 | DICE_WEIGHT: 5.0 28 | HIDDEN_DIM: 256 29 | NUM_OBJECT_QUERIES: 100 30 | NHEADS: 8 31 | DROPOUT: 0.0 32 | DIM_FEEDFORWARD: 2048 33 | ENC_LAYERS: 0 34 | PRE_NORM: False 35 | ENFORCE_INPUT_PROJ: False 36 | SIZE_DIVISIBILITY: 32 37 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 38 | TRAIN_NUM_POINTS: 491520 39 | OVERSAMPLE_RATIO: 3.0 40 | IMPORTANCE_SAMPLE_RATIO: 0.75 41 | TEST: 42 | SEMANTIC_ON: False 43 | INSTANCE_ON: True 44 | PANOPTIC_ON: False 45 | OVERLAP_THRESHOLD: 0.8 46 | OBJECT_MASK_THRESHOLD: 0.8 47 | DEPTH_DIM: 256 48 | DEPTH_MAX: 88. 
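  # DEPTH_MAX is the maximum depth value used by the depth head: these configs use
  # 80. for Cityscapes-DVPS and 88. for SemKITTI-DVPS (defaults live in uni_dvps/config.py).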
49 | SILOG_WEIGHT: 3.0 50 | REL_SQR_WEIGHT: 3.0 51 | REL_ABS_WEIGHT: 3.0 52 | INPUT: 53 | SAMPLING_FRAME_NUM: 1 54 | -------------------------------------------------------------------------------- /configs/SemKITTIDVPS/base.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TEST: ("semkitti_dvps_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 10000 23 | WARMUP_FACTOR: 1.0 24 | WEIGHT_DECAY: 0.05 25 | OPTIMIZER: "ADAMW" 26 | LR_SCHEDULER_NAME: "WarmupPolyLR" 27 | BACKBONE_MULTIPLIER: 0.1 28 | CLIP_GRADIENTS: 29 | ENABLED: True 30 | CLIP_TYPE: "full_model" 31 | CLIP_VALUE: 0.01 32 | NORM_TYPE: 2.0 33 | AMP: 34 | ENABLED: True 35 | INPUT: 36 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(8, 13)]"] 37 | MAX_SIZE_TRAIN: 1280 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | MIN_SIZE_TEST: 784 40 | MAX_SIZE_TEST: 1280 41 | RANDOM_FLIP: "flip_by_clip" 42 | SIZE_DIVISIBILITY: -1 43 | CROP: 44 | # ENABLED: True 45 | ENABLED: False 46 | TYPE: "absolute" 47 | SIZE: (384, 1280) 48 | RESCALE: (-1.0, -2.0) 49 | WITH_PAD: True 50 | SINGLE_CATEGORY_MAX_AREA: 1.0 51 | COLOR_AUG_SSD: True 52 | FORMAT: "RGB" 53 | TEST: 54 | EVAL_PERIOD: 500 55 | LOG_PERIOD: 50 56 | DATALOADER: 57 | FILTER_EMPTY_ANNOTATIONS: False 58 | NUM_WORKERS: 4 59 | VERSION: 2 60 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /mask2former_video/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import os 5 | 6 | from .ytvis import ( 7 | register_ytvis_instances, 8 | _get_ytvis_2019_instances_meta, 9 | _get_ytvis_2021_instances_meta, 10 | ) 11 | 12 | # ==== Predefined splits for YTVIS 2019 =========== 13 | _PREDEFINED_SPLITS_YTVIS_2019 = { 14 | "ytvis_2019_train": ("YouTubeVIS_2019/train/JPEGImages", 15 | "YouTubeVIS_2019/train.json"), 16 | "ytvis_2019_val": ("YouTubeVIS_2019/valid/JPEGImages", 17 | "YouTubeVIS_2019/valid.json"), 18 | "ytvis_2019_test": ("YouTubeVIS_2019/test/JPEGImages", 19 | "YouTubeVIS_2019/test.json"), 20 | } 21 | 22 | 23 | # ==== Predefined splits for YTVIS 2021 =========== 24 | _PREDEFINED_SPLITS_YTVIS_2021 = { 25 | "ytvis_2021_train": ("YouTubeVIS_2021/train/JPEGImages", 26 | "YouTubeVIS_2021/train.json"), 27 | "ytvis_2021_val": ("YouTubeVIS_2021/valid/JPEGImages", 28 | "YouTubeVIS_2021/valid.json"), 29 | "ytvis_2021_test": ("YouTubeVIS_2021/test/JPEGImages", 30 | "YouTubeVIS_2021/test.json"), 31 | } 32 | 33 | 34 | def register_all_ytvis_2019(root): 35 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): 36 | # Assume pre-defined datasets live in `./datasets`. 37 | register_ytvis_instances( 38 | key, 39 | _get_ytvis_2019_instances_meta(), 40 | os.path.join(root, json_file) if "://" not in json_file else json_file, 41 | os.path.join(root, image_root), 42 | ) 43 | 44 | 45 | def register_all_ytvis_2021(root): 46 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): 47 | # Assume pre-defined datasets live in `./datasets`. 
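        # The dataset root itself comes from the DETECTRON2_DATASETS environment
        # variable (resolved at the bottom of this file), falling back to "./datasets".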
48 | register_ytvis_instances( 49 | key, 50 | _get_ytvis_2021_instances_meta(), 51 | os.path.join(root, json_file) if "://" not in json_file else json_file, 52 | os.path.join(root, image_root), 53 | ) 54 | 55 | 56 | if __name__.endswith(".builtin"): 57 | # Assume pre-defined datasets live in `./datasets`. 58 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 59 | register_all_ytvis_2019(_root) 60 | register_all_ytvis_2021(_root) 61 | -------------------------------------------------------------------------------- /minvis/utils/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import math 3 | from typing import List 4 | import torch 5 | 6 | from detectron2.solver.lr_scheduler import _get_warmup_factor_at_iter 7 | 8 | # NOTE: PyTorch's LR scheduler interface uses names that assume the LR changes 9 | # only on epoch boundaries. We typically use iteration based schedules instead. 10 | # As a result, "epoch" (e.g., as in self.last_epoch) should be understood to mean 11 | # "iteration" instead. 12 | 13 | # FIXME: ideally this would be achieved with a CombinedLRScheduler, separating 14 | # MultiStepLR with WarmupLR but the current LRScheduler design doesn't allow it. 15 | 16 | 17 | class WarmupPolyLR(torch.optim.lr_scheduler._LRScheduler): 18 | """ 19 | Poly learning rate schedule used to train DeepLab. 20 | Paper: DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, 21 | Atrous Convolution, and Fully Connected CRFs. 22 | Reference: https://github.com/tensorflow/models/blob/21b73d22f3ed05b650e85ac50849408dd36de32e/research/deeplab/utils/train_utils.py#L337 # noqa 23 | """ 24 | 25 | def __init__( 26 | self, 27 | optimizer: torch.optim.Optimizer, 28 | max_iters: int, 29 | warmup_factor: float = 0.001, 30 | warmup_iters: int = 1000, 31 | warmup_method: str = "linear", 32 | last_epoch: int = -1, 33 | power: float = 0.9, 34 | constant_ending: float = 0.0, 35 | ): 36 | self.max_iters = max_iters 37 | self.warmup_factor = warmup_factor 38 | self.warmup_iters = warmup_iters 39 | self.warmup_method = warmup_method 40 | self.power = power 41 | self.constant_ending = constant_ending 42 | super().__init__(optimizer, last_epoch) 43 | 44 | def get_lr(self) -> List[float]: 45 | warmup_factor = _get_warmup_factor_at_iter( 46 | self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor 47 | ) 48 | if self.constant_ending > 0 and warmup_factor == 1.0: 49 | # Constant ending lr. 50 | if ( 51 | math.pow((1.0 - self.last_epoch / self.max_iters), self.power) 52 | < self.constant_ending 53 | ): 54 | return [base_lr * self.constant_ending for base_lr in self.base_lrs] 55 | return [ 56 | base_lr * warmup_factor * math.pow((1.0 - self.last_epoch / self.max_iters), self.power) 57 | for base_lr in self.base_lrs 58 | ] 59 | 60 | def _compute_values(self) -> List[float]: 61 | # The new interface 62 | return self.get_lr() 63 | -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 
5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /uni_dvps/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .cityscapes_dvps import ( 3 | register_cityscapes_dvps, 4 | _get_cityscapes_dvps_meta 5 | ) 6 | from .semkitti_dvps import ( 7 | register_semkitti_dvps, 8 | _get_semkitti_dvps_meta 9 | ) 10 | 11 | # ==== Predefined splits for Cityscpaes-DVPS =========== 12 | _PREDEFINED_SPLITS_CITYSCAPES_DVPS = { 13 | "cityscapes_dvps_val": ( 14 | "cityscapes-dvps/video_sequence/val", 15 | "cityscapes-dvps/video_sequence/val", 16 | "cityscapes-dvps/video_sequence/dvps_cityscapes_val.json", 17 | ), 18 | } 19 | 20 | # ==== Predefined splits for SemKITTI-DVPS =========== 21 | _PREDEFINED_SPLITS_SEM_KITTI = { 22 | "semkitti_dvps_val": ( 23 | "semkitti-dvps/video_sequence/val", 24 | "semkitti-dvps/video_sequence/val", 25 | "semkitti-dvps/video_sequence/dvps_semkitti_val.json" 26 | ), 27 | } 28 | 29 | def register_all_cityscapes_dvps(root): 30 | for key, (image_dir, gt_dir, gt_json) in _PREDEFINED_SPLITS_CITYSCAPES_DVPS.items(): 31 | image_dir = os.path.join(root, image_dir) 32 | gt_dir = os.path.join(root, gt_dir) 33 | gt_json = os.path.join(root, gt_json) 34 | 35 | register_cityscapes_dvps( 36 | key, 37 | _get_cityscapes_dvps_meta(), 38 | 
os.path.join(root, gt_json) if "://" not in gt_json else gt_json, 39 | os.path.join(root, image_dir), 40 | os.path.join(root, gt_dir), 41 | ) 42 | 43 | def register_all_sem_kitti(root): 44 | for key, (image_dir, gt_dir, gt_json) in _PREDEFINED_SPLITS_SEM_KITTI.items(): 45 | image_dir = os.path.join(root, image_dir) 46 | gt_dir = os.path.join(root, gt_dir) 47 | gt_json = os.path.join(root, gt_json) 48 | 49 | if "val" in key: 50 | for eval_frames in [1, 5, 10, 20]: 51 | new_key = key+str(eval_frames) 52 | register_semkitti_dvps( 53 | new_key, 54 | _get_semkitti_dvps_meta(), 55 | os.path.join(root, gt_json) if "://" not in gt_json else gt_json, 56 | os.path.join(root, image_dir), 57 | os.path.join(root, gt_dir), 58 | ) 59 | else: 60 | register_semkitti_dvps( 61 | key, 62 | _get_semkitti_dvps_meta(), 63 | os.path.join(root, gt_json) if "://" not in gt_json else gt_json, 64 | os.path.join(root, image_dir), 65 | os.path.join(root, gt_dir), 66 | ) 67 | 68 | 69 | if __name__.endswith(".builtin"): 70 | # Assume pre-defined datasets live in `./datasets`. 71 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 72 | register_all_cityscapes_dvps(_root) 73 | register_all_sem_kitti(_root) 74 | 75 | -------------------------------------------------------------------------------- /mask2former_video/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine3D(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
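    This 3D variant adds a temporal axis: it expects clip features of shape
    (b, t, c, h, w) and returns position embeddings in the same layout.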
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | # b, t, c, h, w 31 | assert x.dim() == 5, f"{x.shape} should be a 5-dimensional Tensor, got {x.dim()}-dimensional Tensor instead" 32 | if mask is None: 33 | mask = torch.zeros((x.size(0), x.size(1), x.size(3), x.size(4)), device=x.device, dtype=torch.bool) 34 | not_mask = ~mask 35 | z_embed = not_mask.cumsum(1, dtype=torch.float32) 36 | y_embed = not_mask.cumsum(2, dtype=torch.float32) 37 | x_embed = not_mask.cumsum(3, dtype=torch.float32) 38 | if self.normalize: 39 | eps = 1e-6 40 | z_embed = z_embed / (z_embed[:, -1:, :, :] + eps) * self.scale 41 | y_embed = y_embed / (y_embed[:, :, -1:, :] + eps) * self.scale 42 | x_embed = x_embed / (x_embed[:, :, :, -1:] + eps) * self.scale 43 | 44 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 45 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 46 | 47 | dim_t_z = torch.arange((self.num_pos_feats * 2), dtype=torch.float32, device=x.device) 48 | dim_t_z = self.temperature ** (2 * (dim_t_z // 2) / (self.num_pos_feats * 2)) 49 | 50 | pos_x = x_embed[:, :, :, :, None] / dim_t 51 | pos_y = y_embed[:, :, :, :, None] / dim_t 52 | pos_z = z_embed[:, :, :, :, None] / dim_t_z 53 | pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 54 | pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 55 | pos_z = torch.stack((pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), dim=5).flatten(4) 56 | pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) # b, t, c, h, w 57 | return pos 58 | -------------------------------------------------------------------------------- /mask2former_video/utils/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import logging 4 | from contextlib import contextmanager 5 | from functools import wraps 6 | import torch 7 | from torch.cuda.amp import autocast 8 | 9 | __all__ = ["retry_if_cuda_oom"] 10 | 11 | 12 | @contextmanager 13 | def _ignore_torch_cuda_oom(): 14 | """ 15 | A context which ignores CUDA OOM exception from pytorch. 16 | """ 17 | try: 18 | yield 19 | except RuntimeError as e: 20 | # NOTE: the string may change? 21 | if "CUDA out of memory. " in str(e): 22 | pass 23 | else: 24 | raise 25 | 26 | 27 | def retry_if_cuda_oom(func): 28 | """ 29 | Makes a function retry itself after encountering 30 | pytorch's CUDA OOM error. 31 | It will first retry after calling `torch.cuda.empty_cache()`. 32 | If that still fails, it will then retry by trying to convert inputs to CPUs. 33 | In this case, it expects the function to dispatch to CPU implementation. 34 | The return values may become CPU tensors as well and it's user's 35 | responsibility to convert it back to CUDA tensor if needed. 36 | Args: 37 | func: a stateless callable that takes tensor-like objects as arguments 38 | Returns: 39 | a callable which retries `func` if OOM is encountered. 
40 | Examples: 41 | :: 42 | output = retry_if_cuda_oom(some_torch_function)(input1, input2) 43 | # output may be on CPU even if inputs are on GPU 44 | Note: 45 | 1. When converting inputs to CPU, it will only look at each argument and check 46 | if it has `.device` and `.to` for conversion. Nested structures of tensors 47 | are not supported. 48 | 2. Since the function might be called more than once, it has to be 49 | stateless. 50 | """ 51 | 52 | def maybe_to_cpu(x): 53 | try: 54 | like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") 55 | except AttributeError: 56 | like_gpu_tensor = False 57 | if like_gpu_tensor: 58 | return x.to(device="cpu").to(torch.float32) 59 | else: 60 | return x 61 | 62 | @wraps(func) 63 | def wrapped(*args, **kwargs): 64 | with _ignore_torch_cuda_oom(): 65 | return func(*args, **kwargs) 66 | 67 | # Clear cache and retry 68 | torch.cuda.empty_cache() 69 | with _ignore_torch_cuda_oom(): 70 | return func(*args, **kwargs) 71 | 72 | # Try on CPU. This slows down the code significantly, therefore print a notice. 73 | logger = logging.getLogger(__name__) 74 | logger.info("Attempting to copy inputs to CPU due to CUDA OOM") 75 | new_args = (maybe_to_cpu(x) for x in args) 76 | new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} 77 | with autocast(enabled=False): 78 | return func(*new_args, **new_kwargs) 79 | 80 | return wrapped 81 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 
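    # In practice: on a machine where torch.cuda.is_available() is False (e.g. a
    # CPU-only build node), export FORCE_CUDA=1 and make sure CUDA_HOME points at a
    # CUDA toolkit before running make.sh / `python setup.py build install`;
    # otherwise one of the NotImplementedError branches below is raised.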
40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /uni_dvps/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_uni_dvps_config(cfg): 7 | cfg.INPUT.ROTATE_AUG = 0. 8 | cfg.INPUT.DEPTH_BOUND = True 9 | 10 | cfg.OUTPUT_DIR = './output' 11 | cfg.TEST.LOG_PERIOD = 1 12 | cfg.SEED = 42 13 | cfg.EVAL_FRAMES = 0 14 | 15 | #PanopticDepth aug 16 | cfg.INPUT.CROP.WITH_PAD = True 17 | cfg.INPUT.CROP.RESCALE = (0.8, 1.2) 18 | 19 | # DEPTH_FORMER 20 | cfg.MODEL.DEPTH_FORMER = CN() 21 | cfg.MODEL.DEPTH_FORMER.DEPTH_DIM = 256 22 | cfg.MODEL.DEPTH_FORMER.DEPTH_MAX = 88. 23 | cfg.MODEL.DEPTH_FORMER.SILOG_WEIGHT = 1.0 24 | cfg.MODEL.DEPTH_FORMER.REL_SQR_WEIGHT = 1.0 25 | cfg.MODEL.DEPTH_FORMER.REL_ABS_WEIGHT = 1.0 26 | cfg.MODEL.DEPTH_FORMER.TRANSFORMER_DECODER_NAME = "VideoMultiScaleDepthTransformerDecoder_frame" 27 | 28 | # UNIFIED_FORMER 29 | cfg.MODEL.UNIFIED_FORMER = CN() 30 | 31 | # loss 32 | cfg.MODEL.UNIFIED_FORMER.DEEP_SUPERVISION = True 33 | cfg.MODEL.UNIFIED_FORMER.NO_OBJECT_WEIGHT = 0.1 34 | cfg.MODEL.UNIFIED_FORMER.CLASS_WEIGHT = 1.0 35 | cfg.MODEL.UNIFIED_FORMER.DICE_WEIGHT = 1.0 36 | cfg.MODEL.UNIFIED_FORMER.MASK_WEIGHT = 20.0 37 | 38 | # transformer config 39 | cfg.MODEL.UNIFIED_FORMER.NHEADS = 8 40 | cfg.MODEL.UNIFIED_FORMER.DROPOUT = 0.1 41 | cfg.MODEL.UNIFIED_FORMER.DIM_FEEDFORWARD = 2048 42 | cfg.MODEL.UNIFIED_FORMER.ENC_LAYERS = 0 43 | cfg.MODEL.UNIFIED_FORMER.DEC_LAYERS = 6 44 | cfg.MODEL.UNIFIED_FORMER.PRE_NORM = False 45 | 46 | cfg.MODEL.UNIFIED_FORMER.HIDDEN_DIM = 256 47 | cfg.MODEL.UNIFIED_FORMER.NUM_OBJECT_QUERIES = 100 48 | 49 | cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_IN_FEATURE = "res5" 50 | cfg.MODEL.UNIFIED_FORMER.ENFORCE_INPUT_PROJ = False 51 | 52 | cfg.MODEL.UNIFIED_FORMER.DEPTH_DIM = 256 53 | cfg.MODEL.UNIFIED_FORMER.DEPTH_MAX = 80. 
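    # NOTE: these depth-related values are only the defaults registered on the config
    # node; the dataset YAMLs set them explicitly (e.g. configs/*/R50.yaml use
    # SILOG_WEIGHT / REL_SQR_WEIGHT / REL_ABS_WEIGHT = 3.0 and DEPTH_MAX = 80. or 88.).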
54 | cfg.MODEL.UNIFIED_FORMER.SILOG_WEIGHT = 1.0 55 | cfg.MODEL.UNIFIED_FORMER.REL_SQR_WEIGHT = 1.0 56 | cfg.MODEL.UNIFIED_FORMER.REL_ABS_WEIGHT = 1.0 57 | 58 | cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_DECODER_NAME = "VideoMultiScaleMaskedTransformerDecoder_frame_unified_decoder" 59 | cfg.MODEL.UNIFIED_FORMER.SIZE_DIVISIBILITY = 32 60 | 61 | # UNIFIED_FORMER inference config 62 | cfg.MODEL.UNIFIED_FORMER.TEST = CN() 63 | cfg.MODEL.UNIFIED_FORMER.TEST.SEMANTIC_ON = True 64 | cfg.MODEL.UNIFIED_FORMER.TEST.INSTANCE_ON = False 65 | cfg.MODEL.UNIFIED_FORMER.TEST.PANOPTIC_ON = False 66 | cfg.MODEL.UNIFIED_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 67 | cfg.MODEL.UNIFIED_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 68 | cfg.MODEL.UNIFIED_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 69 | cfg.MODEL.UNIFIED_FORMER.TEST.WINDOW_INFERENCE = False 70 | 71 | # point loss configs 72 | # Number of points sampled during training for a mask point head. 73 | cfg.MODEL.UNIFIED_FORMER.TRAIN_NUM_POINTS = 112 * 112 74 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 75 | # original paper. 76 | cfg.MODEL.UNIFIED_FORMER.OVERSAMPLE_RATIO = 3.0 77 | # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in 78 | # the original paper. 79 | cfg.MODEL.UNIFIED_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 80 | 81 | # MATCHER 82 | cfg.MODEL.UNIFIED_FORMER.MATCHER = "video_depth_matcher" -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import multiprocessing as mp 4 | import os 5 | import time 6 | from torch.cuda.amp import autocast 7 | from detectron2.config import get_cfg 8 | from detectron2.data.detection_utils import read_image 9 | from detectron2.projects.deeplab import add_deeplab_config 10 | from detectron2.utils.logger import setup_logger 11 | 12 | from mask2former import add_maskformer2_config 13 | from mask2former_video import add_maskformer2_video_config 14 | from minvis import add_minvis_config 15 | from uni_dvps import add_uni_dvps_config 16 | from predictor import VisualizationDemo_Panoptic 17 | 18 | from tqdm import tqdm 19 | import warnings 20 | warnings.filterwarnings(action='ignore') 21 | 22 | def setup_cfg(args): 23 | # load config from file and command-line arguments 24 | cfg = get_cfg() 25 | add_deeplab_config(cfg) 26 | add_maskformer2_config(cfg) 27 | add_maskformer2_video_config(cfg) 28 | add_minvis_config(cfg) 29 | add_uni_dvps_config(cfg) 30 | cfg.merge_from_file(args.config_file) 31 | cfg.merge_from_list(args.opts) 32 | cfg.freeze() 33 | return cfg 34 | 35 | def get_parser(): 36 | parser = argparse.ArgumentParser(description="unidvps demo for builtin configs") 37 | parser.add_argument( 38 | "--config-file", 39 | default="configs/CityscapesDVPS/R50.yaml", 40 | metavar="FILE", 41 | help="path to config file", 42 | ) 43 | parser.add_argument( 44 | "--input", 45 | help="directory of input video frames", 46 | required=True, 47 | ) 48 | parser.add_argument( 49 | "--output", 50 | help="directory to save output frames", 51 | required=True, 52 | ) 53 | parser.add_argument( 54 | "--confidence-threshold", 55 | type=float, 56 | default=0.5, 57 | help="Minimum score for instance predictions to be shown", 58 | ) 59 | parser.add_argument( 60 | "--opts", 61 | help="Modify config options using the command-line 'KEY VALUE' pairs", 62 | default=[], 63 | nargs=argparse.REMAINDER, 
64 | ) 65 | return parser 66 | 67 | if __name__ == "__main__": 68 | mp.set_start_method("spawn", force=True) 69 | args = get_parser().parse_args() 70 | setup_logger(name="fvcore") 71 | logger = setup_logger() 72 | logger.info("Arguments: " + str(args)) 73 | 74 | cfg = setup_cfg(args) 75 | demo = VisualizationDemo_Panoptic(cfg) 76 | 77 | assert args.input and args.output 78 | 79 | video_root = args.input 80 | output_root = args.output 81 | 82 | os.makedirs(output_root, exist_ok=True) 83 | 84 | frames_path = video_root 85 | frames_path = glob.glob(os.path.expanduser(os.path.join(frames_path, '*.png'))) 86 | frames_path.sort() 87 | 88 | vid_frames = [] 89 | for path in frames_path: 90 | img = read_image(path, format="RGB") 91 | vid_frames.append(img) 92 | 93 | start_time = time.time() 94 | with autocast(): 95 | predictions, visualized_output, visualized_depth_output = demo.run_on_video(vid_frames) 96 | 97 | # save frames 98 | for path, _vis_output in zip(frames_path, tqdm(visualized_output, initial=1)): 99 | out_filename = os.path.join(output_root, os.path.basename(path).split('.png')[0]+'_seg.png') 100 | _vis_output.save(out_filename) 101 | print("Panoptic segmentation results are saved in {}".format(output_root)) 102 | 103 | for path, _vis_output in zip(frames_path, tqdm(visualized_depth_output, initial=1)): 104 | out_filename = os.path.join(output_root, os.path.basename(path).split('.png')[0]+'_depth.png') 105 | _vis_output.save(out_filename) 106 | print("Depth estimation results are saved in {}".format(output_root)) 107 | 108 | -------------------------------------------------------------------------------- /minvis/data_video/datasets/builtin.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA Corporation & Affiliates. All rights reserved. 2 | # 3 | # This work is made available under the Nvidia Source Code License-NC. 4 | # To view a copy of this license, visit 5 | # https://github.com/NVlabs/MinVIS/blob/main/LICENSE 6 | 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
8 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 9 | 10 | import os 11 | 12 | from .ytvis import ( 13 | register_ytvis_instances, 14 | _get_ytvis_2019_instances_meta, 15 | _get_ytvis_2021_instances_meta, 16 | _get_ovis_instances_meta, 17 | ) 18 | 19 | # ==== Predefined splits for YTVIS 2019 =========== 20 | _PREDEFINED_SPLITS_YTVIS_2019 = { 21 | "ytvis_2019_train": ("YouTubeVIS_2019/train/JPEGImages", 22 | "YouTubeVIS_2019/train.json"), 23 | "ytvis_2019_val": ("YouTubeVIS_2019/valid/JPEGImages", 24 | "YouTubeVIS_2019/valid.json"), 25 | "ytvis_2019_test": ("YouTubeVIS_2019/test/JPEGImages", 26 | "YouTubeVIS_2019/test.json"), 27 | } 28 | 29 | 30 | # ==== Predefined splits for YTVIS 2021 =========== 31 | _PREDEFINED_SPLITS_YTVIS_2021 = { 32 | "ytvis_2021_train": ("YouTubeVIS_2021/train/JPEGImages", 33 | "YouTubeVIS_2021/train.json"), 34 | "ytvis_2021_val": ("YouTubeVIS_2021/valid/JPEGImages", 35 | "YouTubeVIS_2021/valid.json"), 36 | "ytvis_2021_test": ("YouTubeVIS_2021/test/JPEGImages", 37 | "YouTubeVIS_2021/test.json"), 38 | } 39 | 40 | # ==== Predefined splits for OVIS =========== 41 | _PREDEFINED_SPLITS_OVIS = { 42 | "ovis_train": ("ovis/train", 43 | "ovis/annotations/annotations_train.json"), 44 | "ovis_val": ("ovis/valid", 45 | "ovis/annotations/annotations_valid.json"), 46 | "ovis_test": ("ovis/test", 47 | "ovis/annotations/annotations_test.json"), 48 | } 49 | 50 | 51 | def register_all_ytvis_2019(root): 52 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2019.items(): 53 | # Assume pre-defined datasets live in `./datasets`. 54 | register_ytvis_instances( 55 | key, 56 | _get_ytvis_2019_instances_meta(), 57 | os.path.join(root, json_file) if "://" not in json_file else json_file, 58 | os.path.join(root, image_root), 59 | ) 60 | 61 | 62 | def register_all_ytvis_2021(root): 63 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_YTVIS_2021.items(): 64 | # Assume pre-defined datasets live in `./datasets`. 65 | register_ytvis_instances( 66 | key, 67 | _get_ytvis_2021_instances_meta(), 68 | os.path.join(root, json_file) if "://" not in json_file else json_file, 69 | os.path.join(root, image_root), 70 | ) 71 | 72 | 73 | def register_all_ovis(root): 74 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_OVIS.items(): 75 | # Assume pre-defined datasets live in `./datasets`. 76 | register_ytvis_instances( 77 | key, 78 | _get_ovis_instances_meta(), 79 | os.path.join(root, json_file) if "://" not in json_file else json_file, 80 | os.path.join(root, image_root), 81 | ) 82 | 83 | 84 | if __name__.endswith(".builtin"): 85 | # Assume pre-defined datasets live in `./datasets`. 86 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 87 | # register_all_ytvis_2019(_root) 88 | # register_all_ytvis_2021(_root) 89 | register_all_ovis(_root) 90 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return 
output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /uni_dvps/data_video/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | from typing import Optional 5 | import os 6 | import torch 7 | import time 8 | 9 | from detectron2.engine import hooks 10 | from detectron2.utils import comm 11 | from fvcore.nn.precise_bn import get_bn_modules 12 | 13 | from detectron2.utils.events import ( 14 | EventWriter, 15 | get_event_storage, 16 | CommonMetricPrinter, 17 | JSONWriter, 18 | TensorboardXWriter 19 | ) 20 | 21 | 22 | def build_hooks(self): 23 | cfg = self.cfg.clone() 24 | cfg.defrost() 25 | cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN 26 | 27 | ret = [ 28 | hooks.IterationTimer(), 29 | hooks.LRScheduler(), 30 | hooks.PreciseBN( 31 | # Run at the same freq as (but before) evaluation. 32 | cfg.TEST.EVAL_PERIOD, 33 | self.model, 34 | # Build a new data loader to not affect training 35 | self.build_train_loader(cfg), 36 | cfg.TEST.PRECISE_BN.NUM_ITER, 37 | ) 38 | if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) 39 | else None, 40 | ] 41 | 42 | # Do PreciseBN before checkpointer, because it updates the model and need to 43 | # be saved by checkpointer. 44 | # This is not always the best: if checkpointing has a different frequency, 45 | # some checkpoints may have more precise statistics than others. 46 | if comm.is_main_process(): 47 | ret.append(hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD)) 48 | 49 | def test_and_save_results(): 50 | self._last_eval_results = self.test(self.cfg, self.model) 51 | return self._last_eval_results 52 | 53 | # Do evaluation after checkpointer, because then if it fails, 54 | # we can use the saved checkpoint to debug. 55 | ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) 56 | 57 | if comm.is_main_process(): 58 | # Here the default print/log frequency of each writer is used. 59 | # run writers in the end, so that evaluation metrics are written 60 | if cfg.OUTPUT_DIR: 61 | ret.append(hooks.PeriodicWriter(build_writers(cfg.OUTPUT_DIR, self.max_iter), period=cfg.TEST.LOG_PERIOD)) 62 | return ret 63 | 64 | def build_writers(output_dir: str, max_iter: Optional[int] = None): 65 | return [ 66 | # It may not always print what you want to see, since it prints "common" metrics only. 67 | CommonMetricPrinter(max_iter), 68 | JSONWriter(os.path.join(output_dir, "metrics.json")), 69 | # TensorboardXWriter(output_dir), 70 | # WAndBWriter() 71 | ] 72 | 73 | def run_step(self): 74 | """ 75 | Implement the AMP training logic. 76 | """ 77 | assert self.model.training, "[AMPTrainer] model was changed to eval mode!" 78 | assert torch.cuda.is_available(), "[AMPTrainer] CUDA is required for AMP training!" 
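# Standard AMP recipe below: run the forward pass under autocast, scale the loss via the GradScaler before backward, then step the optimizer and update the scaler.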
79 | from torch.cuda.amp import autocast 80 | 81 | start = time.perf_counter() 82 | data = next(self._data_loader_iter) 83 | data_time = time.perf_counter() - start 84 | 85 | with autocast(): 86 | loss_dict, image = self.model(data) 87 | if isinstance(loss_dict, torch.Tensor): 88 | losses = loss_dict 89 | loss_dict = {"total_loss": loss_dict} 90 | else: 91 | losses = sum(loss_dict.values()) 92 | 93 | self.optimizer.zero_grad() 94 | # depth.retain_grad() 95 | self.grad_scaler.scale(losses).backward() 96 | 97 | self._write_metrics(loss_dict, data_time) 98 | 99 | if isinstance(image, torch.Tensor): 100 | _log_images(image) 101 | 102 | self.grad_scaler.step(self.optimizer) 103 | self.grad_scaler.update() 104 | 105 | def _log_images(image): 106 | image_name = "depth" 107 | if comm.is_main_process(): 108 | storage = get_event_storage() 109 | storage.put_image(image_name, image) -------------------------------------------------------------------------------- /mask2former/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 
36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /mask2former/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 
7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* 
{gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /uni_dvps/data_video/datasets/cityscapes_dvps.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import io 3 | import json 4 | import logging 5 | import numpy as np 6 | import os 7 | import tqdm 8 | import pycocotools.mask as mask_util 9 | from fvcore.common.file_io import PathManager 10 | from fvcore.common.timer import Timer 11 | 12 | from detectron2.structures import Boxes, BoxMode, PolygonMasks 13 | from detectron2.data import DatasetCatalog, MetadataCatalog 14 | from detectron2.data.datasets.builtin_meta import CITYSCAPES_CATEGORIES 15 | 16 | """ 17 | This file contains functions to parse Cityscapes_DVPS dataset of 18 | COCO-format annotations into dicts in "Detectron2 format". 19 | """ 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | __all__ = ["load_cityscapes_dvps_json", "register_cityscapes_dvps"] 24 | 25 | def _get_cityscapes_dvps_meta(): 26 | thing_ids = [k["trainId"] for k in CITYSCAPES_CATEGORIES if k["isthing"] == 1] 27 | thing_classes = [k["name"] for k in CITYSCAPES_CATEGORIES if k["isthing"] == 1] 28 | thing_colors = [k["color"] for k in CITYSCAPES_CATEGORIES if k["isthing"] == 1] 29 | stuff_ids = [k["trainId"] for k in CITYSCAPES_CATEGORIES if k["isthing"] == 0] 30 | stuff_classes = [k["name"] for k in CITYSCAPES_CATEGORIES if k["isthing"] == 0] 31 | stuff_colors = [k["color"] for k in CITYSCAPES_CATEGORIES if k["isthing"] == 0] 32 | assert len(thing_ids) == 8, len(thing_ids) 33 | assert len(stuff_ids) == 11, len(stuff_ids) 34 | # Mapping from the incontiguous Cityscapes_DVPS category id to an id in [0, 10] 35 | thing_train_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 36 | stuff_train_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} 37 | 38 | ret = { 39 | "thing_ids": thing_ids, 40 | "thing_classes": thing_classes, 41 | "thing_colors": thing_colors, 42 | "thing_train_id_to_contiguous_id": thing_train_id_to_contiguous_id, 43 | "stuff_ids": stuff_ids, 44 | "stuff_classes": stuff_classes, 45 | "stuff_colors": stuff_colors, 46 | "stuff_train_id_to_contiguous_id": stuff_train_id_to_contiguous_id 47 | } 48 | return ret 49 | 50 | def load_cityscapes_dvps_json(gt_json, image_dir, gt_dir, meta, name): 51 | assert os.path.exists(gt_json), gt_json+" not exists" 52 | with open(gt_json) as f: 53 | file_dicts = json.load(f) 54 | 55 | dataset_dicts = [] 56 | for file_dict in file_dicts: 57 | if file_dict["image"].split("_")[1] == "000000": 58 | record = {} 59 | record["height"] = file_dict["height"] 60 | record["width"] = file_dict["width"] 61 | # record["length"] = 6 62 | record["video_id"] = file_dict["image"].split("_")[0] 63 | record["file_names"] = [os.path.join(image_dir, file_dict["image"])] 64 | record["seg_file_names"] = [os.path.join(gt_dir, file_dict["seg"])] 65 | record["depth_file_names"] = [os.path.join(gt_dir, file_dict["depth"])] 66 | 67 | dataset_dicts.append(record) 68 | else: 69 | video_id = file_dict["image"].split("_")[0] 70 | image_name = os.path.join(image_dir, file_dict["image"]) 71 | seg_gt_name = os.path.join(gt_dir, file_dict["seg"]) 72 | depth_gt_name = 
os.path.join(gt_dir, file_dict["depth"]) 73 | # video_idx = [i for i, dict in enumerate(dataset_dicts) if dict["video_id"] == video_id][0] 74 | 75 | video_idx = int(video_id) 76 | dataset_dicts[video_idx]["file_names"].append(image_name) 77 | dataset_dicts[video_idx]["seg_file_names"].append(seg_gt_name) 78 | dataset_dicts[video_idx]["depth_file_names"].append(depth_gt_name) 79 | 80 | logger.info("Loaded {} images from {}".format(len(file_dicts), image_dir)) 81 | return dataset_dicts 82 | 83 | 84 | def register_cityscapes_dvps(name, meta, gt_json, image_dir, gt_dir): 85 | """ 86 | Register a dataset in Cityscapes_DVPS's json annotation format for DVPS. 87 | """ 88 | assert isinstance(name, str), name 89 | assert isinstance(gt_json, (str, os.PathLike)), gt_json 90 | assert isinstance(image_dir, (str, os.PathLike)), image_dir 91 | assert isinstance(gt_dir, (str, os.PathLike)), gt_dir 92 | 93 | DatasetCatalog.register(name, lambda: load_cityscapes_dvps_json(gt_json, image_dir, gt_dir, meta, name)) 94 | MetadataCatalog.get(name).set( 95 | panoptic_root=gt_dir, 96 | image_root=image_dir, 97 | gt_dir=gt_dir, 98 | evaluator_type="cityscapes_dvps", 99 | # ignore_label=255, 100 | ignore_label=32, 101 | label_divisor=1000, 102 | **meta, 103 | ) 104 | -------------------------------------------------------------------------------- /mask2former/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_config(cfg): 7 | """ 8 | Add config for MASK_FORMER. 9 | """ 10 | # NOTE: configs from original maskformer 11 | # data config 12 | # select the dataset mapper 13 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" 14 | # Color augmentation 15 | cfg.INPUT.COLOR_AUG_SSD = False 16 | # We retry random cropping until no single category in semantic segmentation GT occupies more 17 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 18 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 19 | # Pad image and segmentation GT in dataset mapper. 
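# (A non-positive value is expected to disable this extra padding in the dataset mappers.)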
20 | cfg.INPUT.SIZE_DIVISIBILITY = -1 21 | 22 | # solver config 23 | # weight decay on embedding 24 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 25 | # optimizer 26 | cfg.SOLVER.OPTIMIZER = "ADAMW" 27 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 28 | 29 | # mask_former model config 30 | cfg.MODEL.MASK_FORMER = CN() 31 | 32 | # loss 33 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True 34 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 35 | cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0 36 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 37 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 38 | 39 | # transformer config 40 | cfg.MODEL.MASK_FORMER.NHEADS = 8 41 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 42 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 43 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 44 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 45 | cfg.MODEL.MASK_FORMER.PRE_NORM = False 46 | 47 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 48 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 49 | 50 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" 51 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False 52 | 53 | # mask_former inference config 54 | cfg.MODEL.MASK_FORMER.TEST = CN() 55 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 56 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False 57 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False 58 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 59 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 60 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 61 | 62 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) 63 | # you can use this config to override 64 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 65 | 66 | # pixel decoder config 67 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 68 | # adding transformer in pixel decoder 69 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 70 | # pixel decoder 71 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" 72 | 73 | # swin transformer backbone 74 | cfg.MODEL.SWIN = CN() 75 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 76 | cfg.MODEL.SWIN.PATCH_SIZE = 4 77 | cfg.MODEL.SWIN.EMBED_DIM = 96 78 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 79 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 80 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 81 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 82 | cfg.MODEL.SWIN.QKV_BIAS = True 83 | cfg.MODEL.SWIN.QK_SCALE = None 84 | cfg.MODEL.SWIN.DROP_RATE = 0.0 85 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 86 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 87 | cfg.MODEL.SWIN.APE = False 88 | cfg.MODEL.SWIN.PATCH_NORM = True 89 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 90 | cfg.MODEL.SWIN.USE_CHECKPOINT = False 91 | 92 | # NOTE: maskformer2 extra configs 93 | # transformer module 94 | cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder" 95 | 96 | # LSJ aug 97 | cfg.INPUT.IMAGE_SIZE = 1024 98 | cfg.INPUT.MIN_SCALE = 0.1 99 | cfg.INPUT.MAX_SCALE = 2.0 100 | 101 | # MSDeformAttn encoder configs 102 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 103 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 104 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 105 | 106 | # point loss configs 107 | # Number of points sampled during training for a mask point head. 108 | cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 109 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 110 | # original paper. 
111 | cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0 112 | # Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in 113 | # the original paper. 114 | cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 115 | -------------------------------------------------------------------------------- /uni_dvps/modeling/meta_arch/unified_decoder_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | from typing import Dict 4 | from torch import nn 5 | 6 | from detectron2.config import configurable 7 | from detectron2.layers import ShapeSpec 8 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 9 | 10 | from mask2former.modeling.pixel_decoder.fpn import build_pixel_decoder 11 | from ..transformer_decoder.unified_transformer_decoder import build_unified_transformer_decoder 12 | 13 | 14 | @SEM_SEG_HEADS_REGISTRY.register() 15 | class UnifiedDecoderHead(nn.Module): 16 | _version = 2 17 | 18 | def _load_from_state_dict( 19 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 20 | ): 21 | # version = local_metadata.get("version", None) 22 | # if version is None or version < 2: 23 | # Do not warn if training from scratch 24 | scratch = True 25 | logger = logging.getLogger(__name__) 26 | for k in list(state_dict.keys()): 27 | newk = k 28 | if "sem_seg_head" in k and k.startswith(prefix + "predictor"): 29 | newk = k.replace("predictor", "unified_decoder") 30 | if newk != k: 31 | state_dict[newk] = state_dict[k] 32 | del state_dict[k] 33 | scratch = False 34 | 35 | if not scratch: 36 | logger.warning( 37 | f"Weight format of {self.__class__.__name__} has changed! " 38 | "Please upgrade your models. Applying automatic conversion now ..."
39 | ) 40 | 41 | @configurable 42 | def __init__( 43 | self, 44 | input_shape: Dict[str, ShapeSpec], 45 | *, 46 | num_classes: int, 47 | pixel_decoder: nn.Module, 48 | loss_weight: float = 1.0, 49 | ignore_value: int = -1, 50 | # extra parameters 51 | unified_decoder: nn.Module, 52 | transformer_in_feature: str, 53 | ): 54 | super().__init__() 55 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 56 | self.in_features = [k for k, v in input_shape] 57 | feature_strides = [v.stride for k, v in input_shape] 58 | feature_channels = [v.channels for k, v in input_shape] 59 | self.num_classes = num_classes 60 | 61 | self.ignore_value = ignore_value 62 | self.common_stride = 4 63 | self.loss_weight = loss_weight 64 | 65 | self.pixel_decoder = pixel_decoder 66 | self.unified_decoder = unified_decoder 67 | self.transformer_in_feature = transformer_in_feature 68 | 69 | @classmethod 70 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 71 | # figure out in_channels to transformer predictor 72 | if cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 73 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 74 | elif cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 75 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 76 | elif cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for unidvps 77 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 78 | else: 79 | transformer_predictor_in_channels = input_shape[cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_IN_FEATURE].channels 80 | 81 | return { 82 | "input_shape": { 83 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 84 | }, 85 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 86 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 87 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 88 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 89 | "transformer_in_feature": cfg.MODEL.UNIFIED_FORMER.TRANSFORMER_IN_FEATURE, 90 | "unified_decoder": build_unified_transformer_decoder( 91 | cfg, 92 | transformer_predictor_in_channels, 93 | mask_classification=True, 94 | ), 95 | } 96 | 97 | def forward(self, features, mask=None): 98 | return self.layers(features, mask) 99 | 100 | def layers(self, features, mask=None): 101 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) 102 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 103 | predictions = self.unified_decoder(multi_scale_features, mask_features, mask) 104 | 105 | return predictions -------------------------------------------------------------------------------- /mask2former/data/datasets/register_ade20k_instance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import json 3 | import logging 4 | import numpy as np 5 | import os 6 | from PIL import Image 7 | 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.data.datasets.coco import load_coco_json, register_coco_instances 10 | from detectron2.utils.file_io import PathManager 11 | 12 | ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] 13 | 14 | 15 | _PREDEFINED_SPLITS = { 16 | # point annotations without masks 17 | "ade20k_instance_train": ( 18 
| "ADEChallengeData2016/images/training", 19 | "ADEChallengeData2016/ade20k_instance_train.json", 20 | ), 21 | "ade20k_instance_val": ( 22 | "ADEChallengeData2016/images/validation", 23 | "ADEChallengeData2016/ade20k_instance_val.json", 24 | ), 25 | } 26 | 27 | 28 | def _get_ade_instances_meta(): 29 | thing_ids = [k["id"] for k in ADE_CATEGORIES] 30 | assert len(thing_ids) == 100, len(thing_ids) 31 | # Mapping from the incontiguous ADE category id to an id in [0, 99] 32 | thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 33 | thing_classes = [k["name"] for k in ADE_CATEGORIES] 34 | ret = { 35 | "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, 36 | "thing_classes": thing_classes, 37 | } 38 | return ret 39 | 40 | 41 | def register_all_ade20k_instance(root): 42 | for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): 43 | # Assume pre-defined datasets live in `./datasets`. 44 | register_coco_instances( 45 | key, 46 | _get_ade_instances_meta(), 47 | os.path.join(root, json_file) if "://" not in json_file else json_file, 48 | os.path.join(root, image_root), 49 | ) 50 | 51 | 52 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 53 | register_all_ade20k_instance(_root) 54 | -------------------------------------------------------------------------------- /mask2former/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import contextlib 3 | import copy 4 | import io 5 | import itertools 6 | import json 7 | import logging 8 | import numpy as np 9 | import os 10 | import pickle 11 | from collections import OrderedDict 12 | import pycocotools.mask as mask_util 13 | import torch 14 | from pycocotools.coco import COCO 15 | from pycocotools.cocoeval import COCOeval 16 | from tabulate import tabulate 17 | 18 | import detectron2.utils.comm as comm 19 | from detectron2.config import CfgNode 20 | from detectron2.data import MetadataCatalog 21 | from detectron2.data.datasets.coco import convert_to_coco_json 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 25 | from detectron2.utils.file_io import PathManager 26 | from detectron2.utils.logger import create_small_table 27 | 28 | 29 | # modified from COCOEvaluator for instance segmetnat 30 | class InstanceSegEvaluator(COCOEvaluator): 31 | """ 32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 33 | for keypoint detection outputs using COCO's metrics. 34 | See http://cocodataset.org/#detection-eval and 35 | http://cocodataset.org/#keypoints-eval to understand its metrics. 36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 37 | the metric cannot be computed (e.g. due to no predictions made). 38 | 39 | In addition to COCO, this evaluator is able to support any bounding box detection, 40 | instance segmentation, or keypoint detection dataset. 41 | """ 42 | 43 | def _eval_predictions(self, predictions, img_ids=None): 44 | """ 45 | Evaluate predictions. Fill self._results with the metrics of the tasks. 
46 | """ 47 | self._logger.info("Preparing results for COCO format ...") 48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 49 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 50 | 51 | # unmap the category ids for COCO 52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 55 | # num_classes = len(all_contiguous_ids) 56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 57 | 58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 59 | for result in coco_results: 60 | category_id = result["category_id"] 61 | # assert category_id < num_classes, ( 62 | # f"A prediction has class={category_id}, " 63 | # f"but the dataset only has {num_classes} classes and " 64 | # f"predicted class id should be in [0, {num_classes - 1}]." 65 | # ) 66 | assert category_id in reverse_id_mapping, ( 67 | f"A prediction has class={category_id}, " 68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 69 | ) 70 | result["category_id"] = reverse_id_mapping[category_id] 71 | 72 | if self._output_dir: 73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 74 | self._logger.info("Saving results to {}".format(file_path)) 75 | with PathManager.open(file_path, "w") as f: 76 | f.write(json.dumps(coco_results)) 77 | f.flush() 78 | 79 | if not self._do_evaluation: 80 | self._logger.info("Annotations are not available for evaluation.") 81 | return 82 | 83 | self._logger.info( 84 | "Evaluating predictions with {} COCO API...".format( 85 | "unofficial" if self._use_fast_impl else "official" 86 | ) 87 | ) 88 | for task in sorted(tasks): 89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 90 | coco_eval = ( 91 | _evaluate_predictions_on_coco( 92 | self._coco_api, 93 | coco_results, 94 | task, 95 | kpt_oks_sigmas=self._kpt_oks_sigmas, 96 | use_fast_impl=self._use_fast_impl, 97 | img_ids=img_ids, 98 | max_dets_per_image=self._max_dets_per_image, 99 | ) 100 | if len(coco_results) > 0 101 | else None # cocoapi does not handle empty results very well 102 | ) 103 | 104 | res = self._derive_coco_results( 105 | coco_eval, task, class_names=self._metadata.get("thing_classes") 106 | ) 107 | self._results[task] = res 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Uni-DVPS (RA-L 2024) 2 | ### [Project Page](https://jiyeon-k1m.github.io/uni-dvps) | [Paper](https://ieeexplore.ieee.org/document/10517661) 3 | This repository contains the official implementation of the RA-L 2024 paper, 4 | "Uni-DVPS: Unified Model for Depth-Aware Video Panoptic Segmentation". 5 | 6 | teaser 7 | 8 | ## Installation 9 | ### Requirements 10 | - Ubuntu 18.04 with Python 3.7 11 | - PyTorch 1.9.1 12 | - CUDA 11.1 13 | - Detectron2: [Detectron2 installation instruction](https://detectron2.readthedocs.io/en/latest/tutorials/install.html) 14 | - OpenCV is optional but needed for demo and visualization 15 | 16 | ### Environment setup 17 | `CUDA_HOME` must be defined and points to the directory of the installed CUDA toolkit. 
18 | ```shell 19 | conda create --name unidvps python=3.7 20 | conda activate unidvps 21 | 22 | # pytorch installation 23 | pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html 24 | 25 | # opencv installation 26 | pip install -U opencv-python 27 | 28 | # detectron2 installation 29 | git clone --recursive git@github.com:facebookresearch/detectron2.git 30 | cd detectron2 31 | git checkout 1315c8977e867b9979b4fa712638c1975ca8c78f 32 | pip install -e . 33 | 34 | pip install git+https://github.com/cocodataset/panopticapi.git 35 | pip install git+https://github.com/mcordts/cityscapesScripts.git 36 | 37 | cd .. 38 | git clone https://github.com/postech-ami/Uni-DVPS.git 39 | cd Uni-DVPS 40 | pip install -r requirements.txt 41 | ``` 42 | ### CUDA kernel for MSDeformAttn 43 | After preparing the required environment, run the following commands to compile the CUDA kernel for MSDeformAttn. 44 | 45 | ```shell 46 | cd mask2former/modeling/pixel_decoder/ops 47 | sh make.sh 48 | ``` 49 | 50 | ## Preparing data 51 | First, download the [Cityscapes-DVPS](https://github.com/joe-siyuan-qiao/ViP-DeepLab/blob/master/cityscapes-dvps/README.md) and [SemKITTI-DVPS](https://github.com/joe-siyuan-qiao/ViP-DeepLab/tree/master/semkitti-dvps) datasets. 52 | Then download the dataset JSON files from [this link](https://drive.google.com/drive/folders/1mVnO-bnwblx9sgPPqtfQ6_zMyky9GpC5?usp=sharing). 53 | The datasets are assumed to exist in a directory specified by the environment variable `DETECTRON2_DATASETS`. 54 | 55 | You can set the location of the dataset directory with ```export DETECTRON2_DATASETS=/path/to/dataset```. 56 | If it is unset, the default will be `./datasets` relative to your current working directory. 57 | 58 | We follow the format of Detectron2 Custom Datasets. Please refer to [this page](https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html) if you want to use your own dataset. 59 | 60 | ### Expected dataset structure 61 | ``` 62 | $DETECTRON2_DATASETS 63 | ├── cityscapes-dvps 64 | │ └── video_sequence 65 | │ ├── dvps_cityscapes_val.json 66 | │ └── val 67 | │ ├── 000000_000000_munster_000168_000004_leftImg8bit.png 68 | │ ├── 000000_000000_munster_000168_000004_gtFine_instanceTrainIds.png 69 | │ ├── 000000_000000_munster_000168_000004_depth.png 70 | │ └── ... 71 | └── semkitti-dvps 72 | └── video_sequence 73 | ├── dvps_semkitti_val.json 74 | └── val 75 | ├── 000008_000000_leftImg8bit.png 76 | ├── 000008_000000_gtFine_class.png 77 | ├── 000008_000000_gtFine_instance.png 78 | ├── 000008_000000_depth_707.0911865234375.png 79 | └── ... 80 | ``` 81 | 82 | ### Pretrained models 83 | The trained models are available for download at [this link](https://drive.google.com/drive/folders/1PCIoEbvv6U3Te2M3iZrp9ys2_kDa-Xvm?usp=sharing). 84 | 85 | ## Getting Started with Uni-DVPS 86 | ### Demo 87 | Visualize the results of video panoptic segmentation and depth estimation. 88 | ```shell 89 | python demo/demo.py \ 90 | --config-file configs/CityscapesDVPS/R50.yaml \ 91 | --input /path/to/video/frames \ 92 | --output /output/folder \ 93 | --opts MODEL.WEIGHTS /path/to/checkpoint_file 94 | ``` 95 | ![demo](assets/demo.png) 96 | 97 | ### Evaluation on Cityscapes-DVPS dataset 98 | You can evaluate the model on the Cityscapes-DVPS dataset.
99 | ```shell 100 | python train.py \ 101 | --config-file configs/CityscapesDVPS/R50.yaml \ 102 | --eval-only \ 103 | OUTPUT_DIR /output/folder \ 104 | MODEL.WEIGHTS /path/to/checkpoint_file 105 | ``` 106 | 107 | ### Evaluation on SemKITTI-DVPS dataset 108 | 109 | You can evaluate the model on the SemKITTI-DVPS dataset. 110 | Please set the argument `EVAL_FRAMES` to one of {1, 5, 10, 20}. 111 | 112 | ```shell 113 | python train.py \ 114 | --config-file configs/SemKITTIDVPS/R50.yaml \ 115 | --eval-only \ 116 | EVAL_FRAMES 5 \ 117 | OUTPUT_DIR /output/folder \ 118 | MODEL.WEIGHTS /path/to/checkpoint_file 119 | ``` 120 | ## Citation 121 | If you use Uni-DVPS in your research or wish to use our code, please consider citing: 122 | ``` 123 | @article{jiyeon2024unidvps, 124 | title={Uni-DVPS: Unified Model for Depth-Aware Video Panoptic Segmentation}, 125 | author={Ji-Yeon, Kim and Hyun-Bin, Oh and Byung-Ki, Kwon and Kim, Dahun and Kwon, Yongjin and Oh, Tae-Hyun}, 126 | journal={IEEE Robotics and Automation Letters}, 127 | year={2024}, 128 | publisher={IEEE} 129 | } 130 | ``` 131 | 132 | ## Acknowledgement 133 | The implementation of Uni-DVPS is largely adapted from [Mask2Former](https://github.com/facebookresearch/Mask2Former) and [MinVIS](https://github.com/NVlabs/MinVIS). 134 | The depth-aware video panoptic segmentation datasets are from [ViP-DeepLab](https://github.com/joe-siyuan-qiao/ViP-DeepLab). 135 | We would like to sincerely thank the authors for generously sharing their code and data. 136 | 137 | > This work was supported by Institute of Information & Communications Technology Planning & Evaluation (IITP) grant funded by the Korea government (MSIT) (No. 2020-0-00004, Development of Previsional Intelligence based on Long-term Visual Memory Network) and Institute of Information & Communications Technology Planning & Evaluation (IITP) grant funded by the Korea government (MSIT) (No. 2022-0-00290, Visual Intelligence for Space-Time Understanding and Generation based on Multi-layered Visual Common Sense). -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Uni-DVPS Training Script. 3 | This script is based on Mask2Former and MinVIS.
4 | """ 5 | import os 6 | import copy 7 | import itertools 8 | import logging 9 | import torch 10 | from collections import OrderedDict 11 | from typing import Any, Dict, List, Set 12 | 13 | # detectron2 14 | from detectron2.checkpoint import DetectionCheckpointer 15 | import detectron2.utils.comm as comm 16 | from detectron2.config import get_cfg 17 | from detectron2.engine import ( 18 | DefaultTrainer, 19 | default_argument_parser, 20 | default_setup, 21 | launch, 22 | ) 23 | from detectron2.evaluation import ( 24 | DatasetEvaluator, 25 | inference_on_dataset, 26 | print_csv_format, 27 | verify_results, 28 | ) 29 | from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler 30 | from detectron2.solver.build import maybe_add_gradient_clipping 31 | from detectron2.utils.logger import setup_logger 32 | 33 | # models 34 | from mask2former import add_maskformer2_config 35 | from mask2former_video import add_maskformer2_video_config 36 | from minvis import ( 37 | add_minvis_config, 38 | build_detection_test_loader, 39 | ) 40 | from uni_dvps import ( 41 | add_uni_dvps_config, 42 | CityscapesDVPSDatasetMapper, 43 | CityscapesDVPSEvaluator, 44 | SemkittiDVPSDatasetMapper, 45 | SemkittiDVPSEvaluator, 46 | ) 47 | 48 | import warnings 49 | warnings.filterwarnings(action='ignore') 50 | 51 | 52 | class Trainer(DefaultTrainer): 53 | @classmethod 54 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 55 | if output_folder is None: 56 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 57 | os.makedirs(output_folder, exist_ok=True) 58 | 59 | if "cityscapes" in dataset_name: 60 | evaluator = CityscapesDVPSEvaluator(dataset_name, output_folder) 61 | if "kitti" in dataset_name: 62 | evaluator = SemkittiDVPSEvaluator(dataset_name, output_folder, eval_frame=int(dataset_name.split('val')[1])) 63 | 64 | return evaluator 65 | 66 | @classmethod 67 | def build_test_loader(cls, cfg, dataset_name): 68 | dataset_name = cfg.DATASETS.TEST[0] 69 | if "cityscapes" in dataset_name: 70 | mapper = CityscapesDVPSDatasetMapper(cfg, is_train=False) 71 | if "kitti" in dataset_name: 72 | mapper = SemkittiDVPSDatasetMapper(cfg, is_train= False) 73 | return build_detection_test_loader(cfg, dataset_name, mapper=mapper) 74 | 75 | @classmethod 76 | def test(cls, cfg, model, evaluators=None, eval_frames=None): 77 | from torch.cuda.amp import autocast 78 | logger = logging.getLogger(__name__) 79 | 80 | if isinstance(evaluators, DatasetEvaluator): 81 | evaluators = [evaluators] 82 | if evaluators is not None: 83 | assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format( 84 | len(cfg.DATASETS.TEST), len(evaluators) 85 | ) 86 | 87 | results = OrderedDict() 88 | for idx, dataset_name in enumerate(cfg.DATASETS.TEST): 89 | data_loader = cls.build_test_loader(cfg, dataset_name) 90 | if evaluators is not None: 91 | evaluator = evaluators[idx] 92 | else: 93 | try: 94 | evaluator = cls.build_evaluator(cfg, dataset_name) 95 | except NotImplementedError: 96 | logger.warn( 97 | "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, " 98 | "or implement its `build_evaluator` method." 99 | ) 100 | results[dataset_name] = {} 101 | continue 102 | with autocast(): 103 | results_i = inference_on_dataset(model, data_loader, evaluator) 104 | results[dataset_name] = results_i 105 | if comm.is_main_process(): 106 | assert isinstance( 107 | results_i, dict 108 | ), "Evaluator must return a dict on the main process. 
Got {} instead.".format( 109 | results_i 110 | ) 111 | logger.info("Evaluation results for {} in csv format:".format(dataset_name)) 112 | print_csv_format(results_i) 113 | 114 | if len(results) == 1: 115 | results = list(results.values())[0] 116 | return results 117 | 118 | 119 | def setup(args): 120 | """ 121 | Create configs and perform basic setups. 122 | """ 123 | cfg = get_cfg() 124 | # for poly lr schedule 125 | add_deeplab_config(cfg) 126 | add_maskformer2_config(cfg) 127 | add_maskformer2_video_config(cfg) 128 | add_minvis_config(cfg) 129 | add_uni_dvps_config(cfg) 130 | cfg.merge_from_file(args.config_file) 131 | cfg.merge_from_list(args.opts) 132 | if cfg.EVAL_FRAMES: 133 | cfg.DATASETS.TEST = (cfg.DATASETS.TEST[0]+str(cfg.EVAL_FRAMES),) 134 | cfg.freeze() 135 | default_setup(cfg, args) 136 | setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="uni_dvps") 137 | return cfg 138 | 139 | 140 | def main(args): 141 | cfg = setup(args) 142 | if args.eval_only: 143 | model = Trainer.build_model(cfg) 144 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 145 | cfg.MODEL.WEIGHTS, resume=args.resume 146 | ) 147 | res = Trainer.test(cfg, model) 148 | if cfg.TEST.AUG.ENABLED: 149 | raise NotImplementedError 150 | if comm.is_main_process(): 151 | verify_results(cfg, res) 152 | return res 153 | 154 | trainer = Trainer(cfg) 155 | trainer.resume_or_load(resume=args.resume) 156 | return trainer.train() 157 | 158 | 159 | if __name__ == "__main__": 160 | args = default_argument_parser().parse_args() 161 | print("Command Line Args:", args) 162 | launch( 163 | main, 164 | args.num_gpus, 165 | num_machines=args.num_machines, 166 | machine_rank=args.machine_rank, 167 | dist_url=args.dist_url, 168 | args=(args,), 169 | ) 170 | -------------------------------------------------------------------------------- /mask2former/modeling/meta_arch/mask_former_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder 15 | from ..pixel_decoder.fpn import build_pixel_decoder 16 | 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class MaskFormerHead(nn.Module): 20 | 21 | _version = 2 22 | 23 | def _load_from_state_dict( 24 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 25 | ): 26 | version = local_metadata.get("version", None) 27 | if version is None or version < 2: 28 | # Do not warn if train from scratch 29 | scratch = True 30 | logger = logging.getLogger(__name__) 31 | for k in list(state_dict.keys()): 32 | newk = k 33 | if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): 34 | # newk = k.replace(prefix, prefix + "pixel_decoder.") 35 | newk = k.replace(prefix, prefix) 36 | # logger.debug(f"{k} ==> {newk}") 37 | if newk != k: 38 | state_dict[newk] = state_dict[k] 39 | del state_dict[k] 40 | scratch = False 41 | 42 | if not scratch: 43 | logger.warning( 44 | f"Weight format of {self.__class__.__name__} have changed! 
" 45 | "Please upgrade your models. Applying automatic conversion now ..." 46 | ) 47 | 48 | @configurable 49 | def __init__( 50 | self, 51 | input_shape: Dict[str, ShapeSpec], 52 | *, 53 | num_classes: int, 54 | pixel_decoder: nn.Module, 55 | loss_weight: float = 1.0, 56 | ignore_value: int = -1, 57 | # extra parameters 58 | transformer_predictor: nn.Module, 59 | transformer_in_feature: str, 60 | ): 61 | """ 62 | NOTE: this interface is experimental. 63 | Args: 64 | input_shape: shapes (channels and stride) of the input features 65 | num_classes: number of classes to predict 66 | pixel_decoder: the pixel decoder module 67 | loss_weight: loss weight 68 | ignore_value: category id to be ignored during training. 69 | transformer_predictor: the transformer decoder that makes prediction 70 | transformer_in_feature: input feature name to the transformer_predictor 71 | """ 72 | super().__init__() 73 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 74 | self.in_features = [k for k, v in input_shape] 75 | feature_strides = [v.stride for k, v in input_shape] 76 | feature_channels = [v.channels for k, v in input_shape] 77 | 78 | self.ignore_value = ignore_value 79 | self.common_stride = 4 80 | self.loss_weight = loss_weight 81 | 82 | self.pixel_decoder = pixel_decoder 83 | self.predictor = transformer_predictor 84 | self.transformer_in_feature = transformer_in_feature 85 | 86 | self.num_classes = num_classes 87 | 88 | @classmethod 89 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 90 | # figure out in_channels to transformer predictor 91 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 92 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 93 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 94 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 95 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 96 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 97 | else: 98 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels 99 | 100 | return { 101 | "input_shape": { 102 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 103 | }, 104 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 105 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 106 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 107 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 108 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 109 | "transformer_predictor": build_transformer_decoder( 110 | cfg, 111 | transformer_predictor_in_channels, 112 | mask_classification=True, 113 | ), 114 | } 115 | 116 | def forward(self, features, mask=None): 117 | return self.layers(features, mask) 118 | 119 | def layers(self, features, mask=None): 120 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) 121 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 122 | predictions = self.predictor(multi_scale_features, mask_features, mask) 123 | else: 124 | if self.transformer_in_feature == "transformer_encoder": 125 | assert ( 126 | transformer_encoder_features is not None 127 | ), "Please use the TransformerEncoderPixelDecoder." 
128 | predictions = self.predictor(transformer_encoder_features, mask_features, mask) 129 | elif self.transformer_in_feature == "pixel_embedding": 130 | predictions = self.predictor(mask_features, mask_features, mask) 131 | else: 132 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) 133 | return predictions 134 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py 3 | import copy 4 | import logging 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.data.transforms import TransformGen 13 | from detectron2.structures import BitMasks, Boxes, Instances 14 | 15 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"] 16 | 17 | 18 | def build_transform_gen(cfg, is_train): 19 | """ 20 | Create a list of default :class:`Augmentation` from config. 21 | Now it includes resizing and flipping. 22 | Returns: 23 | list[Augmentation] 24 | """ 25 | assert is_train, "Only support training augmentation" 26 | image_size = cfg.INPUT.IMAGE_SIZE 27 | min_scale = cfg.INPUT.MIN_SCALE 28 | max_scale = cfg.INPUT.MAX_SCALE 29 | 30 | augmentation = [] 31 | 32 | if cfg.INPUT.RANDOM_FLIP != "none": 33 | augmentation.append( 34 | T.RandomFlip( 35 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 36 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 37 | ) 38 | ) 39 | 40 | augmentation.extend([ 41 | T.ResizeScale( 42 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 43 | ), 44 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 45 | ]) 46 | 47 | return augmentation 48 | 49 | 50 | # This is specifically designed for the COCO dataset. 51 | class COCOPanopticNewBaselineDatasetMapper: 52 | """ 53 | A callable which takes a dataset dict in Detectron2 Dataset format, 54 | and map it into a format used by MaskFormer. 55 | 56 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 57 | 58 | The callable currently does the following: 59 | 60 | 1. Read the image from "file_name" 61 | 2. Applies geometric transforms to the image and annotation 62 | 3. Find and applies suitable cropping to the image and annotation 63 | 4. Prepare image and annotation to Tensors 64 | """ 65 | 66 | @configurable 67 | def __init__( 68 | self, 69 | is_train=True, 70 | *, 71 | tfm_gens, 72 | image_format, 73 | ): 74 | """ 75 | NOTE: this interface is experimental. 76 | Args: 77 | is_train: for training or inference 78 | augmentations: a list of augmentations or deterministic transforms to apply 79 | crop_gen: crop augmentation 80 | tfm_gens: data augmentation 81 | image_format: an image format supported by :func:`detection_utils.read_image`. 
82 | """ 83 | self.tfm_gens = tfm_gens 84 | logging.getLogger(__name__).info( 85 | "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( 86 | str(self.tfm_gens) 87 | ) 88 | ) 89 | 90 | self.img_format = image_format 91 | self.is_train = is_train 92 | 93 | @classmethod 94 | def from_config(cls, cfg, is_train=True): 95 | # Build augmentation 96 | tfm_gens = build_transform_gen(cfg, is_train) 97 | 98 | ret = { 99 | "is_train": is_train, 100 | "tfm_gens": tfm_gens, 101 | "image_format": cfg.INPUT.FORMAT, 102 | } 103 | return ret 104 | 105 | def __call__(self, dataset_dict): 106 | """ 107 | Args: 108 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 109 | 110 | Returns: 111 | dict: a format that builtin models in detectron2 accept 112 | """ 113 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 114 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 115 | utils.check_image_size(dataset_dict, image) 116 | 117 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 118 | image_shape = image.shape[:2] # h, w 119 | 120 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 121 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 122 | # Therefore it's important to use torch.Tensor. 123 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 124 | 125 | if not self.is_train: 126 | # USER: Modify this if you want to keep them for some reason. 127 | dataset_dict.pop("annotations", None) 128 | return dataset_dict 129 | 130 | if "pan_seg_file_name" in dataset_dict: 131 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 132 | segments_info = dataset_dict["segments_info"] 133 | 134 | # apply the same transformation to panoptic segmentation 135 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 136 | 137 | from panopticapi.utils import rgb2id 138 | 139 | pan_seg_gt = rgb2id(pan_seg_gt) 140 | 141 | instances = Instances(image_shape) 142 | classes = [] 143 | masks = [] 144 | for segment_info in segments_info: 145 | class_id = segment_info["category_id"] 146 | if not segment_info["iscrowd"]: 147 | classes.append(class_id) 148 | masks.append(pan_seg_gt == segment_info["id"]) 149 | 150 | classes = np.array(classes) 151 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 152 | if len(masks) == 0: 153 | # Some image does not have annotation (all ignored) 154 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 155 | instances.gt_boxes = Boxes(torch.zeros((0, 4))) 156 | else: 157 | masks = BitMasks( 158 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 159 | ) 160 | instances.gt_masks = masks.tensor 161 | instances.gt_boxes = masks.get_bounding_boxes() 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /minvis/data_video/augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
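# Clip-consistent video augmentations: the ResizeShortestEdge and RandomFlip variants below
# re-sample their random parameters only once every `clip_frame_cnt` calls, so every frame of a
# sampled clip receives the same resize scale and the same flip decision.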
2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import numpy as np 5 | import logging 6 | import sys 7 | from fvcore.transforms.transform import ( 8 | HFlipTransform, 9 | NoOpTransform, 10 | VFlipTransform, 11 | ) 12 | from PIL import Image 13 | 14 | from detectron2.data import transforms as T 15 | 16 | 17 | class ResizeShortestEdge(T.Augmentation): 18 | """ 19 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 20 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 21 | """ 22 | 23 | def __init__( 24 | self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 25 | ): 26 | """ 27 | Args: 28 | short_edge_length (list[int]): If ``sample_style=="range"``, 29 | a [min, max] interval from which to sample the shortest edge length. 30 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 31 | max_size (int): maximum allowed longest edge length. 32 | sample_style (str): either "range" or "choice". 33 | """ 34 | super().__init__() 35 | assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style 36 | 37 | self.is_range = ("range" in sample_style) 38 | if isinstance(short_edge_length, int): 39 | short_edge_length = (short_edge_length, short_edge_length) 40 | if self.is_range: 41 | assert len(short_edge_length) == 2, ( 42 | "short_edge_length must be two values using 'range' sample style." 43 | f" Got {short_edge_length}!" 44 | ) 45 | self._cnt = 0 46 | self._init(locals()) 47 | 48 | def get_transform(self, image): 49 | if self._cnt % self.clip_frame_cnt == 0: 50 | if self.is_range: 51 | self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) 52 | else: 53 | self.size = np.random.choice(self.short_edge_length) 54 | if self.size == 0: 55 | return NoOpTransform() 56 | 57 | self._cnt = 0 # avoiding overflow 58 | self._cnt += 1 59 | 60 | h, w = image.shape[:2] 61 | 62 | scale = self.size * 1.0 / min(h, w) 63 | if h < w: 64 | newh, neww = self.size, scale * w 65 | else: 66 | newh, neww = scale * h, self.size 67 | if max(newh, neww) > self.max_size: 68 | scale = self.max_size * 1.0 / max(newh, neww) 69 | newh = newh * scale 70 | neww = neww * scale 71 | neww = int(neww + 0.5) 72 | newh = int(newh + 0.5) 73 | return T.ResizeTransform(h, w, newh, neww, self.interp) 74 | 75 | 76 | class RandomFlip(T.Augmentation): 77 | """ 78 | Flip the image horizontally or vertically with the given probability. 79 | """ 80 | 81 | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): 82 | """ 83 | Args: 84 | prob (float): probability of flip. 85 | horizontal (boolean): whether to apply horizontal flipping 86 | vertical (boolean): whether to apply vertical flipping 87 | """ 88 | super().__init__() 89 | 90 | if horizontal and vertical: 91 | raise ValueError("Cannot do both horiz and vert. 
Please use two Flip instead.") 92 | if not horizontal and not vertical: 93 | raise ValueError("At least one of horiz or vert has to be True!") 94 | self._cnt = 0 95 | 96 | self._init(locals()) 97 | 98 | def get_transform(self, image): 99 | if self._cnt % self.clip_frame_cnt == 0: 100 | self.do = self._rand_range() < self.prob 101 | self._cnt = 0 # avoiding overflow 102 | self._cnt += 1 103 | 104 | h, w = image.shape[:2] 105 | 106 | if self.do: 107 | if self.horizontal: 108 | return HFlipTransform(w) 109 | elif self.vertical: 110 | return VFlipTransform(h) 111 | else: 112 | return NoOpTransform() 113 | 114 | 115 | def build_augmentation(cfg, is_train): 116 | logger = logging.getLogger(__name__) 117 | aug_list = [] 118 | if is_train: 119 | # Crop 120 | if cfg.INPUT.CROP.ENABLED: 121 | aug_list.append(T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) 122 | 123 | # Resize 124 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 125 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 126 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 127 | ms_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM if "by_clip" in cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else 1 128 | aug_list.append(ResizeShortestEdge(min_size, max_size, sample_style, clip_frame_cnt=ms_clip_frame_cnt)) 129 | 130 | # Flip 131 | if cfg.INPUT.RANDOM_FLIP != "none": 132 | if cfg.INPUT.RANDOM_FLIP == "flip_by_clip": 133 | flip_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM 134 | else: 135 | flip_clip_frame_cnt = 1 136 | 137 | aug_list.append( 138 | # NOTE using RandomFlip modified for the support of flip maintenance 139 | RandomFlip( 140 | horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"), 141 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 142 | clip_frame_cnt=flip_clip_frame_cnt, 143 | ) 144 | ) 145 | 146 | # Additional augmentations : brightness, contrast, saturation, rotation 147 | augmentations = cfg.INPUT.AUGMENTATIONS 148 | if "brightness" in augmentations: 149 | aug_list.append(T.RandomBrightness(0.9, 1.1)) 150 | if "contrast" in augmentations: 151 | aug_list.append(T.RandomContrast(0.9, 1.1)) 152 | if "saturation" in augmentations: 153 | aug_list.append(T.RandomSaturation(0.9, 1.1)) 154 | if "rotation" in augmentations: 155 | aug_list.append( 156 | T.RandomRotation( 157 | [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range" 158 | ) 159 | ) 160 | else: 161 | # Resize 162 | min_size = cfg.INPUT.MIN_SIZE_TEST 163 | max_size = cfg.INPUT.MAX_SIZE_TEST 164 | sample_style = "choice" 165 | aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 166 | 167 | return aug_list 168 | -------------------------------------------------------------------------------- /mask2former_video/data_video/augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import numpy as np 5 | import logging 6 | import sys 7 | from fvcore.transforms.transform import ( 8 | HFlipTransform, 9 | NoOpTransform, 10 | VFlipTransform, 11 | ) 12 | from PIL import Image 13 | 14 | from detectron2.data import transforms as T 15 | 16 | 17 | class ResizeShortestEdge(T.Augmentation): 18 | """ 19 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 20 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 
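    When `sample_style` is "range_by_clip" or "choice_by_clip", the sampled size is reused for
    `clip_frame_cnt` consecutive frames, so every frame of a video clip is resized consistently.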
21 | """ 22 | 23 | def __init__( 24 | self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 25 | ): 26 | """ 27 | Args: 28 | short_edge_length (list[int]): If ``sample_style=="range"``, 29 | a [min, max] interval from which to sample the shortest edge length. 30 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 31 | max_size (int): maximum allowed longest edge length. 32 | sample_style (str): either "range" or "choice". 33 | """ 34 | super().__init__() 35 | assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style 36 | 37 | self.is_range = ("range" in sample_style) 38 | if isinstance(short_edge_length, int): 39 | short_edge_length = (short_edge_length, short_edge_length) 40 | if self.is_range: 41 | assert len(short_edge_length) == 2, ( 42 | "short_edge_length must be two values using 'range' sample style." 43 | f" Got {short_edge_length}!" 44 | ) 45 | self._cnt = 0 46 | self._init(locals()) 47 | 48 | def get_transform(self, image): 49 | if self._cnt % self.clip_frame_cnt == 0: 50 | if self.is_range: 51 | self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) 52 | else: 53 | self.size = np.random.choice(self.short_edge_length) 54 | if self.size == 0: 55 | return NoOpTransform() 56 | 57 | self._cnt = 0 # avoiding overflow 58 | self._cnt += 1 59 | 60 | h, w = image.shape[:2] 61 | 62 | scale = self.size * 1.0 / min(h, w) 63 | if h < w: 64 | newh, neww = self.size, scale * w 65 | else: 66 | newh, neww = scale * h, self.size 67 | if max(newh, neww) > self.max_size: 68 | scale = self.max_size * 1.0 / max(newh, neww) 69 | newh = newh * scale 70 | neww = neww * scale 71 | neww = int(neww + 0.5) 72 | newh = int(newh + 0.5) 73 | return T.ResizeTransform(h, w, newh, neww, self.interp) 74 | 75 | 76 | class RandomFlip(T.Augmentation): 77 | """ 78 | Flip the image horizontally or vertically with the given probability. 79 | """ 80 | 81 | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): 82 | """ 83 | Args: 84 | prob (float): probability of flip. 85 | horizontal (boolean): whether to apply horizontal flipping 86 | vertical (boolean): whether to apply vertical flipping 87 | """ 88 | super().__init__() 89 | 90 | if horizontal and vertical: 91 | raise ValueError("Cannot do both horiz and vert. 
Please use two Flip instead.") 92 | if not horizontal and not vertical: 93 | raise ValueError("At least one of horiz or vert has to be True!") 94 | self._cnt = 0 95 | 96 | self._init(locals()) 97 | 98 | def get_transform(self, image): 99 | if self._cnt % self.clip_frame_cnt == 0: 100 | self.do = self._rand_range() < self.prob 101 | self._cnt = 0 # avoiding overflow 102 | self._cnt += 1 103 | 104 | h, w = image.shape[:2] 105 | 106 | if self.do: 107 | if self.horizontal: 108 | return HFlipTransform(w) 109 | elif self.vertical: 110 | return VFlipTransform(h) 111 | else: 112 | return NoOpTransform() 113 | 114 | 115 | def build_augmentation(cfg, is_train): 116 | logger = logging.getLogger(__name__) 117 | aug_list = [] 118 | if is_train: 119 | # Crop 120 | if cfg.INPUT.CROP.ENABLED: 121 | aug_list.append(T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) 122 | 123 | # Resize 124 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 125 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 126 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 127 | ms_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM if "by_clip" in cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else 1 128 | aug_list.append(ResizeShortestEdge(min_size, max_size, sample_style, clip_frame_cnt=ms_clip_frame_cnt)) 129 | 130 | # Flip 131 | if cfg.INPUT.RANDOM_FLIP != "none": 132 | if cfg.INPUT.RANDOM_FLIP == "flip_by_clip": 133 | flip_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM 134 | else: 135 | flip_clip_frame_cnt = 1 136 | 137 | aug_list.append( 138 | # NOTE using RandomFlip modified for the support of flip maintenance 139 | RandomFlip( 140 | horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"), 141 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 142 | clip_frame_cnt=flip_clip_frame_cnt, 143 | ) 144 | ) 145 | 146 | # Additional augmentations : brightness, contrast, saturation, rotation 147 | augmentations = cfg.INPUT.AUGMENTATIONS 148 | if "brightness" in augmentations: 149 | aug_list.append(T.RandomBrightness(0.9, 1.1)) 150 | if "contrast" in augmentations: 151 | aug_list.append(T.RandomContrast(0.9, 1.1)) 152 | if "saturation" in augmentations: 153 | aug_list.append(T.RandomSaturation(0.9, 1.1)) 154 | if "rotation" in augmentations: 155 | aug_list.append( 156 | T.RandomRotation( 157 | [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range" 158 | ) 159 | ) 160 | else: 161 | # Resize 162 | min_size = cfg.INPUT.MIN_SIZE_TEST 163 | max_size = cfg.INPUT.MAX_SIZE_TEST 164 | sample_style = "choice" 165 | aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) 166 | 167 | return aug_list 168 | -------------------------------------------------------------------------------- /uni_dvps/data_video/augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
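# Uni-DVPS augmentation module: it adds FixedSizeCenterCrop, a deterministic center crop with
# optional padding to a fixed target size, alongside the clip-consistent resize/flip transforms.
# Note that build_augmentation and build_semkitti_augmentation below currently return an empty
# list, i.e. no geometric augmentation is registered through these helpers.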
2 | # Modified by Bowen Cheng from https://github.com/sukjunhwang/IFC 3 | 4 | import numpy as np 5 | import logging 6 | import sys 7 | from fvcore.transforms.transform import ( 8 | HFlipTransform, 9 | NoOpTransform, 10 | VFlipTransform, 11 | ) 12 | from PIL import Image 13 | 14 | from detectron2.data import transforms as T 15 | from detectron2.projects.point_rend import ColorAugSSDTransform 16 | from typing import Tuple 17 | from fvcore.transforms.transform import ( 18 | CropTransform, 19 | PadTransform, 20 | TransformList, 21 | ) 22 | 23 | class FixedSizeCenterCrop(T.Augmentation): 24 | """ 25 | If `crop_size` is smaller than the input image size, then it uses a center crop of 26 | the crop size. If `crop_size` is larger than the input image size, then it pads 27 | the around of the image to the crop size. 28 | """ 29 | 30 | def __init__(self, crop_size: Tuple[int], pad_value: float = 128.0, with_pad=True): 31 | """ 32 | Args: 33 | crop_size: target image (height, width). 34 | pad_value: the padding value. 35 | """ 36 | super().__init__() 37 | self._init(locals()) 38 | 39 | def get_transform(self, image: np.ndarray) -> TransformList: 40 | # Compute the image scale and scaled size. 41 | input_size = image.shape[:2] 42 | output_size = self.crop_size 43 | 44 | # Add random crop if the image is scaled up. 45 | max_offset = np.subtract(input_size, output_size) 46 | max_offset = np.maximum(max_offset, 0) 47 | offset = np.multiply(max_offset, 0.5)#np.random.uniform(0.0, 1.0)) 48 | offset = np.round(offset).astype(int) 49 | crop_transform = CropTransform( 50 | offset[1], offset[0], output_size[1], output_size[0], input_size[1], input_size[0] 51 | ) 52 | if not self.with_pad: 53 | return TransformList([crop_transform, ]) 54 | 55 | # Add padding if the image is scaled down. 56 | pad_size = np.subtract(output_size, input_size) 57 | pad_size = np.maximum(pad_size, 0) 58 | pad_size_0 = pad_size // 2 59 | pad_size_1 = pad_size - pad_size_0 60 | original_size = np.minimum(input_size, output_size) 61 | pad_transform = PadTransform( 62 | pad_size_0[1], pad_size_0[0], pad_size_1[1], pad_size_1[0], original_size[1], original_size[0], self.pad_value 63 | ) 64 | 65 | return TransformList([crop_transform, pad_transform]) 66 | 67 | class ResizeShortestEdge(T.Augmentation): 68 | """ 69 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 70 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 71 | """ 72 | 73 | def __init__( 74 | self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1 75 | ): 76 | """ 77 | Args: 78 | short_edge_length (list[int]): If ``sample_style=="range"``, 79 | a [min, max] interval from which to sample the shortest edge length. 80 | If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. 81 | max_size (int): maximum allowed longest edge length. 82 | sample_style (str): either "range" or "choice". 83 | """ 84 | super().__init__() 85 | assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style 86 | 87 | self.is_range = ("range" in sample_style) 88 | if isinstance(short_edge_length, int): 89 | short_edge_length = (short_edge_length, short_edge_length) 90 | if self.is_range: 91 | assert len(short_edge_length) == 2, ( 92 | "short_edge_length must be two values using 'range' sample style." 93 | f" Got {short_edge_length}!" 
94 | ) 95 | self._cnt = 0 96 | self._init(locals()) 97 | 98 | def get_transform(self, image): 99 | if self._cnt % self.clip_frame_cnt == 0: 100 | if self.is_range: 101 | self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) 102 | else: 103 | self.size = np.random.choice(self.short_edge_length) 104 | if self.size == 0: 105 | return NoOpTransform() 106 | 107 | self._cnt = 0 # avoiding overflow 108 | self._cnt += 1 109 | 110 | h, w = image.shape[:2] 111 | 112 | scale = self.size * 1.0 / min(h, w) 113 | if h < w: 114 | newh, neww = self.size, scale * w 115 | else: 116 | newh, neww = scale * h, self.size 117 | if max(newh, neww) > self.max_size: 118 | scale = self.max_size * 1.0 / max(newh, neww) 119 | newh = newh * scale 120 | neww = neww * scale 121 | neww = int(neww + 0.5) 122 | newh = int(newh + 0.5) 123 | return T.ResizeTransform(h, w, newh, neww, self.interp) 124 | 125 | class RandomFlip(T.Augmentation): 126 | """ 127 | Flip the image horizontally or vertically with the given probability. 128 | """ 129 | 130 | def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1): 131 | """ 132 | Args: 133 | prob (float): probability of flip. 134 | horizontal (boolean): whether to apply horizontal flipping 135 | vertical (boolean): whether to apply vertical flipping 136 | """ 137 | super().__init__() 138 | 139 | if horizontal and vertical: 140 | raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.") 141 | if not horizontal and not vertical: 142 | raise ValueError("At least one of horiz or vert has to be True!") 143 | self._cnt = 0 144 | 145 | self._init(locals()) 146 | 147 | def get_transform(self, image): 148 | if self._cnt % self.clip_frame_cnt == 0: 149 | self.do = self._rand_range() < self.prob 150 | self._cnt = 0 # avoiding overflow 151 | self._cnt += 1 152 | 153 | h, w = image.shape[:2] 154 | 155 | if self.do: 156 | if self.horizontal: 157 | return HFlipTransform(w) 158 | elif self.vertical: 159 | return VFlipTransform(h) 160 | else: 161 | return NoOpTransform() 162 | 163 | def build_augmentation(cfg, is_train): 164 | logger = logging.getLogger(__name__) 165 | aug_list = [] 166 | print("aug_list: ", aug_list) 167 | return aug_list 168 | 169 | def build_semkitti_augmentation(cfg, is_train): 170 | logger = logging.getLogger(__name__) 171 | aug_list = [] 172 | print("aug_list: ", aug_list) 173 | return aug_list 174 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.structures import BitMasks, Instances 13 | 14 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper 15 | 16 | __all__ = ["MaskFormerPanopticDatasetMapper"] 17 | 18 | 19 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper): 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for panoptic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. 
Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | super().__init__( 52 | is_train, 53 | augmentations=augmentations, 54 | image_format=image_format, 55 | ignore_label=ignore_label, 56 | size_divisibility=size_divisibility, 57 | ) 58 | 59 | def __call__(self, dataset_dict): 60 | """ 61 | Args: 62 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 63 | 64 | Returns: 65 | dict: a format that builtin models in detectron2 accept 66 | """ 67 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 68 | 69 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 70 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 71 | utils.check_image_size(dataset_dict, image) 72 | 73 | # semantic segmentation 74 | if "sem_seg_file_name" in dataset_dict: 75 | # PyTorch transformation not implemented for uint16, so converting it to double first 76 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 77 | else: 78 | sem_seg_gt = None 79 | 80 | # panoptic segmentation 81 | if "pan_seg_file_name" in dataset_dict: 82 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 83 | segments_info = dataset_dict["segments_info"] 84 | else: 85 | pan_seg_gt = None 86 | segments_info = None 87 | 88 | if pan_seg_gt is None: 89 | raise ValueError( 90 | "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( 91 | dataset_dict["file_name"] 92 | ) 93 | ) 94 | 95 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 96 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 97 | image = aug_input.image 98 | if sem_seg_gt is not None: 99 | sem_seg_gt = aug_input.sem_seg 100 | 101 | # apply the same transformation to panoptic segmentation 102 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 103 | 104 | from panopticapi.utils import rgb2id 105 | 106 | pan_seg_gt = rgb2id(pan_seg_gt) 107 | 108 | # Pad image and segmentation label here! 
109 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 110 | if sem_seg_gt is not None: 111 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 112 | pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) 113 | 114 | if self.size_divisibility > 0: 115 | image_size = (image.shape[-2], image.shape[-1]) 116 | padding_size = [ 117 | 0, 118 | self.size_divisibility - image_size[1], 119 | 0, 120 | self.size_divisibility - image_size[0], 121 | ] 122 | image = F.pad(image, padding_size, value=128).contiguous() 123 | if sem_seg_gt is not None: 124 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 125 | pan_seg_gt = F.pad( 126 | pan_seg_gt, padding_size, value=0 127 | ).contiguous() # 0 is the VOID panoptic label 128 | 129 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 130 | 131 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 132 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 133 | # Therefore it's important to use torch.Tensor. 134 | dataset_dict["image"] = image 135 | if sem_seg_gt is not None: 136 | dataset_dict["sem_seg"] = sem_seg_gt.long() 137 | 138 | if "annotations" in dataset_dict: 139 | raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") 140 | 141 | # Prepare per-category binary masks 142 | pan_seg_gt = pan_seg_gt.numpy() 143 | instances = Instances(image_shape) 144 | classes = [] 145 | masks = [] 146 | for segment_info in segments_info: 147 | class_id = segment_info["category_id"] 148 | if not segment_info["iscrowd"]: 149 | classes.append(class_id) 150 | masks.append(pan_seg_gt == segment_info["id"]) 151 | 152 | classes = np.array(classes) 153 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 154 | if len(masks) == 0: 155 | # Some image does not have annotation (all ignored) 156 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 157 | else: 158 | masks = BitMasks( 159 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 160 | ) 161 | instances.gt_masks = masks.tensor 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /minvis/video_mask2former_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA Corporation & Affiliates. All rights reserved. 2 | # 3 | # This work is made available under the Nvidia Source Code License-NC. 4 | # To view a copy of this license, visit 5 | # https://github.com/NVlabs/MinVIS/blob/main/LICENSE 6 | 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
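# Frame-level variant of the video Mask2Former transformer decoder used by MinVIS: each frame of
# a clip is decoded independently with shared queries, the per-frame predictions are rearranged
# from (B*T, ...) shaped tensors into clip-shaped ones, and the normalized query embeddings
# (pred_embds) are returned so that queries can later be associated across frames.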
8 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 9 | import torch 10 | from torch import nn, Tensor 11 | from torch.nn import functional as F 12 | 13 | from detectron2.config import configurable 14 | 15 | from mask2former.modeling.transformer_decoder.maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY 16 | from mask2former.modeling.transformer_decoder.position_encoding import PositionEmbeddingSine 17 | 18 | from mask2former_video.modeling.transformer_decoder.video_mask2former_transformer_decoder import VideoMultiScaleMaskedTransformerDecoder 19 | import einops 20 | 21 | 22 | @TRANSFORMER_DECODER_REGISTRY.register() 23 | class VideoMultiScaleMaskedTransformerDecoder_frame(VideoMultiScaleMaskedTransformerDecoder): 24 | 25 | @configurable 26 | def __init__( 27 | self, 28 | in_channels, 29 | mask_classification=True, 30 | *, 31 | num_classes: int, 32 | hidden_dim: int, 33 | num_queries: int, 34 | nheads: int, 35 | dim_feedforward: int, 36 | dec_layers: int, 37 | pre_norm: bool, 38 | mask_dim: int, 39 | enforce_input_project: bool, 40 | # video related 41 | num_frames, 42 | ): 43 | super().__init__( 44 | in_channels=in_channels, 45 | mask_classification=mask_classification, 46 | num_classes=num_classes, 47 | hidden_dim=hidden_dim, 48 | num_queries=num_queries, 49 | nheads=nheads, 50 | dim_feedforward=dim_feedforward, 51 | dec_layers=dec_layers, 52 | pre_norm=pre_norm, 53 | mask_dim=mask_dim, 54 | enforce_input_project=enforce_input_project, 55 | num_frames=num_frames, 56 | ) 57 | 58 | # use 2D positional embedding 59 | N_steps = hidden_dim // 2 60 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 61 | 62 | def forward(self, x, mask_features, mask = None): 63 | # x is a list of multi-scale feature 64 | assert len(x) == self.num_feature_levels 65 | src = [] 66 | pos = [] 67 | size_list = [] 68 | 69 | # disable mask, it does not affect performance 70 | del mask 71 | 72 | for i in range(self.num_feature_levels): 73 | size_list.append(x[i].shape[-2:]) 74 | pos.append(self.pe_layer(x[i], None).flatten(2)) 75 | src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None]) 76 | 77 | # flatten NxCxHxW to HWxNxC 78 | pos[-1] = pos[-1].permute(2, 0, 1) 79 | src[-1] = src[-1].permute(2, 0, 1) 80 | 81 | _, bs, _ = src[0].shape 82 | 83 | # QxNxC 84 | query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1) 85 | output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1) 86 | 87 | predictions_class = [] 88 | predictions_mask = [] 89 | 90 | # prediction heads on learnable query features 91 | outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0]) 92 | predictions_class.append(outputs_class) 93 | predictions_mask.append(outputs_mask) 94 | 95 | for i in range(self.num_layers): 96 | level_index = i % self.num_feature_levels 97 | # prevent NaN output 98 | attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False 99 | # attention: cross-attention first 100 | output = self.transformer_cross_attention_layers[i]( 101 | output, src[level_index], 102 | memory_mask=attn_mask, 103 | memory_key_padding_mask=None, # here we do not apply masking on padded region 104 | pos=pos[level_index], query_pos=query_embed 105 | ) 106 | 107 | output = self.transformer_self_attention_layers[i]( 108 | output, tgt_mask=None, 109 | tgt_key_padding_mask=None, 110 | query_pos=query_embed 111 | ) 112 | 113 | # FFN 114 | output = 
self.transformer_ffn_layers[i]( 115 | output 116 | ) 117 | 118 | outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels]) 119 | predictions_class.append(outputs_class) 120 | predictions_mask.append(outputs_mask) 121 | 122 | assert len(predictions_class) == self.num_layers + 1 123 | 124 | # expand BT to B, T 125 | bt = predictions_mask[-1].shape[0] 126 | bs = bt // self.num_frames if self.training else 1 127 | t = bt // bs 128 | for i in range(len(predictions_mask)): 129 | predictions_mask[i] = einops.rearrange(predictions_mask[i], '(b t) q h w -> b q t h w', t=t) 130 | 131 | for i in range(len(predictions_class)): 132 | predictions_class[i] = einops.rearrange(predictions_class[i], '(b t) q c -> b t q c', t=t) 133 | 134 | pred_embds = self.decoder_norm(output) 135 | pred_embds = einops.rearrange(pred_embds, 'q (b t) c -> b c t q', t=t) 136 | 137 | out = { 138 | 'pred_logits': predictions_class[-1], 139 | 'pred_masks': predictions_mask[-1], 140 | 'aux_outputs': self._set_aux_loss( 141 | predictions_class if self.mask_classification else None, predictions_mask 142 | ), 143 | 'pred_embds': pred_embds, 144 | } 145 | 146 | return out 147 | 148 | def forward_prediction_heads(self, output, mask_features, attn_mask_target_size): 149 | decoder_output = self.decoder_norm(output) 150 | decoder_output = decoder_output.transpose(0, 1) 151 | outputs_class = self.class_embed(decoder_output) 152 | mask_embed = self.mask_embed(decoder_output) 153 | outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 154 | 155 | # NOTE: prediction is of higher-resolution 156 | # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW] 157 | attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False) 158 | # must use bool type 159 | # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged. 160 | attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool() 161 | attn_mask = attn_mask.detach() 162 | 163 | return outputs_class, outputs_mask, attn_mask 164 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
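# Multi-Scale Deformable Attention: each query attends to a small, learned set of sampling points
# on every feature level instead of the full feature map. The forward pass uses the CUDA kernel
# built by make.sh (via MSDeformAttnFunction) and falls back to the pure-PyTorch
# ms_deform_attn_core_pytorch implementation when the compiled kernel cannot be used.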
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch 26 | 27 | 28 | def _is_power_of_2(n): 29 | if (not isinstance(n, int)) or (n < 0): 30 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 31 | return (n & (n-1) == 0) and n != 0 32 | 33 | 34 | class MSDeformAttn(nn.Module): 35 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 36 | """ 37 | Multi-Scale Deformable Attention Module 38 | :param d_model hidden dimension 39 | :param n_levels number of feature levels 40 | :param n_heads number of attention heads 41 | :param n_points number of sampling points per attention head per feature level 42 | """ 43 | super().__init__() 44 | if d_model % n_heads != 0: 45 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 46 | _d_per_head = d_model // n_heads 47 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 48 | if not _is_power_of_2(_d_per_head): 49 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 50 | "which is more efficient in our CUDA implementation.") 51 | 52 | self.im2col_step = 128 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
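        # The sampling_offsets bias above is initialized so that each attention head starts with
        # sampling points arranged on a small grid around the reference point, with the offset
        # radius growing with the point index (the initialization scheme of Deformable DETR).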
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | # N, Len_q, n_heads, n_levels, n_points, 2 106 | if reference_points.shape[-1] == 2: 107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 108 | sampling_locations = reference_points[:, :, None, :, None, :] \ 109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 110 | elif reference_points.shape[-1] == 4: 111 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 113 | else: 114 | raise ValueError( 115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 116 | try: 117 | output = MSDeformAttnFunction.apply( 118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 119 | except: 120 | # CPU 121 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 122 | # # For FLOPs calculation only 123 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 124 | output = self.output_proj(output) 125 | return output 126 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
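# Dataset mapper for instance segmentation: it applies the configured augmentations, converts
# COCO-style segmentations (polygon lists, RLE dicts, or binary ndarray masks) into per-instance
# BitMasks, and packs them together with their class ids into the Instances structure expected by
# MaskFormer training.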
2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import pycocotools.mask as mask_util 7 | import torch 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Instances, polygons_to_bitmask 15 | 16 | __all__ = ["MaskFormerInstanceDatasetMapper"] 17 | 18 | 19 | class MaskFormerInstanceDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for instance segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | size_divisibility, 40 | ): 41 | """ 42 | NOTE: this interface is experimental. 43 | Args: 44 | is_train: for training or inference 45 | augmentations: a list of augmentations or deterministic transforms to apply 46 | image_format: an image format supported by :func:`detection_utils.read_image`. 47 | size_divisibility: pad image size to be divisible by this value 48 | """ 49 | self.is_train = is_train 50 | self.tfm_gens = augmentations 51 | self.img_format = image_format 52 | self.size_divisibility = size_divisibility 53 | 54 | logger = logging.getLogger(__name__) 55 | mode = "training" if is_train else "inference" 56 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 57 | 58 | @classmethod 59 | def from_config(cls, cfg, is_train=True): 60 | # Build augmentation 61 | augs = [ 62 | T.ResizeShortestEdge( 63 | cfg.INPUT.MIN_SIZE_TRAIN, 64 | cfg.INPUT.MAX_SIZE_TRAIN, 65 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 66 | ) 67 | ] 68 | if cfg.INPUT.CROP.ENABLED: 69 | augs.append( 70 | T.RandomCrop( 71 | cfg.INPUT.CROP.TYPE, 72 | cfg.INPUT.CROP.SIZE, 73 | ) 74 | ) 75 | if cfg.INPUT.COLOR_AUG_SSD: 76 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 77 | augs.append(T.RandomFlip()) 78 | 79 | ret = { 80 | "is_train": is_train, 81 | "augmentations": augs, 82 | "image_format": cfg.INPUT.FORMAT, 83 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 84 | } 85 | return ret 86 | 87 | def __call__(self, dataset_dict): 88 | """ 89 | Args: 90 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 91 | 92 | Returns: 93 | dict: a format that builtin models in detectron2 accept 94 | """ 95 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
96 | 97 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 98 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 99 | utils.check_image_size(dataset_dict, image) 100 | 101 | aug_input = T.AugInput(image) 102 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 103 | image = aug_input.image 104 | 105 | # transform instnace masks 106 | assert "annotations" in dataset_dict 107 | for anno in dataset_dict["annotations"]: 108 | anno.pop("keypoints", None) 109 | 110 | annos = [ 111 | utils.transform_instance_annotations(obj, transforms, image.shape[:2]) 112 | for obj in dataset_dict.pop("annotations") 113 | if obj.get("iscrowd", 0) == 0 114 | ] 115 | 116 | if len(annos): 117 | assert "segmentation" in annos[0] 118 | segms = [obj["segmentation"] for obj in annos] 119 | masks = [] 120 | for segm in segms: 121 | if isinstance(segm, list): 122 | # polygon 123 | masks.append(polygons_to_bitmask(segm, *image.shape[:2])) 124 | elif isinstance(segm, dict): 125 | # COCO RLE 126 | masks.append(mask_util.decode(segm)) 127 | elif isinstance(segm, np.ndarray): 128 | assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( 129 | segm.ndim 130 | ) 131 | # mask array 132 | masks.append(segm) 133 | else: 134 | raise ValueError( 135 | "Cannot convert segmentation of type '{}' to BitMasks!" 136 | "Supported types are: polygons as list[list[float] or ndarray]," 137 | " COCO-style RLE as a dict, or a binary segmentation mask " 138 | " in a 2D numpy array of shape HxW.".format(type(segm)) 139 | ) 140 | 141 | # Pad image and segmentation label here! 142 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 143 | masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks] 144 | 145 | classes = [int(obj["category_id"]) for obj in annos] 146 | classes = torch.tensor(classes, dtype=torch.int64) 147 | 148 | if self.size_divisibility > 0: 149 | image_size = (image.shape[-2], image.shape[-1]) 150 | padding_size = [ 151 | 0, 152 | self.size_divisibility - image_size[1], 153 | 0, 154 | self.size_divisibility - image_size[0], 155 | ] 156 | # pad image 157 | image = F.pad(image, padding_size, value=128).contiguous() 158 | # pad mask 159 | masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks] 160 | 161 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 162 | 163 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 164 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 165 | # Therefore it's important to use torch.Tensor. 166 | dataset_dict["image"] = image 167 | 168 | # Prepare per-category binary masks 169 | instances = Instances(image_shape) 170 | instances.gt_classes = classes 171 | if len(masks) == 0: 172 | # Some image does not have annotation (all ignored) 173 | instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1])) 174 | else: 175 | masks = BitMasks(torch.stack(masks)) 176 | instances.gt_masks = masks.tensor 177 | 178 | dataset_dict["instances"] = instances 179 | 180 | return dataset_dict 181 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
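#
# Note (inferred from the code below): unlike the instance mapper, this mapper reads per-pixel
# labels from "sem_seg_file_name", rejects dicts that carry "annotations", and emits both a
# "sem_seg" tensor of class ids and per-category binary masks under "instances".
#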
2 | import copy 3 | import logging 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import MetadataCatalog 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.projects.point_rend import ColorAugSSDTransform 14 | from detectron2.structures import BitMasks, Instances 15 | 16 | __all__ = ["MaskFormerSemanticDatasetMapper"] 17 | 18 | 19 | class MaskFormerSemanticDatasetMapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for semantic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | self.is_train = is_train 52 | self.tfm_gens = augmentations 53 | self.img_format = image_format 54 | self.ignore_label = ignore_label 55 | self.size_divisibility = size_divisibility 56 | 57 | logger = logging.getLogger(__name__) 58 | mode = "training" if is_train else "inference" 59 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 60 | 61 | @classmethod 62 | def from_config(cls, cfg, is_train=True): 63 | # Build augmentation 64 | augs = [ 65 | T.ResizeShortestEdge( 66 | cfg.INPUT.MIN_SIZE_TRAIN, 67 | cfg.INPUT.MAX_SIZE_TRAIN, 68 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 69 | ) 70 | ] 71 | if cfg.INPUT.CROP.ENABLED: 72 | augs.append( 73 | T.RandomCrop_CategoryAreaConstraint( 74 | cfg.INPUT.CROP.TYPE, 75 | cfg.INPUT.CROP.SIZE, 76 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, 77 | cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 78 | ) 79 | ) 80 | if cfg.INPUT.COLOR_AUG_SSD: 81 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 82 | augs.append(T.RandomFlip()) 83 | 84 | # Assume always applies to the training set. 85 | dataset_names = cfg.DATASETS.TRAIN 86 | meta = MetadataCatalog.get(dataset_names[0]) 87 | ignore_label = meta.ignore_label 88 | 89 | ret = { 90 | "is_train": is_train, 91 | "augmentations": augs, 92 | "image_format": cfg.INPUT.FORMAT, 93 | "ignore_label": ignore_label, 94 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 95 | } 96 | return ret 97 | 98 | def __call__(self, dataset_dict): 99 | """ 100 | Args: 101 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 102 | 103 | Returns: 104 | dict: a format that builtin models in detectron2 accept 105 | """ 106 | assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" 
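        # Padding convention used further below (for reference): when `size_divisibility` > 0,
        # the image is padded with value 128 and the label map with `ignore_label`.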
107 | 108 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 109 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 110 | utils.check_image_size(dataset_dict, image) 111 | 112 | if "sem_seg_file_name" in dataset_dict: 113 | # PyTorch transformation not implemented for uint16, so converting it to double first 114 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 115 | else: 116 | sem_seg_gt = None 117 | 118 | if sem_seg_gt is None: 119 | raise ValueError( 120 | "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( 121 | dataset_dict["file_name"] 122 | ) 123 | ) 124 | 125 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 126 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 127 | image = aug_input.image 128 | sem_seg_gt = aug_input.sem_seg 129 | 130 | # Pad image and segmentation label here! 131 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 132 | if sem_seg_gt is not None: 133 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 134 | 135 | if self.size_divisibility > 0: 136 | image_size = (image.shape[-2], image.shape[-1]) 137 | padding_size = [ 138 | 0, 139 | self.size_divisibility - image_size[1], 140 | 0, 141 | self.size_divisibility - image_size[0], 142 | ] 143 | image = F.pad(image, padding_size, value=128).contiguous() 144 | if sem_seg_gt is not None: 145 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 146 | 147 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 148 | 149 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 150 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 151 | # Therefore it's important to use torch.Tensor. 152 | dataset_dict["image"] = image 153 | 154 | if sem_seg_gt is not None: 155 | dataset_dict["sem_seg"] = sem_seg_gt.long() 156 | 157 | if "annotations" in dataset_dict: 158 | raise ValueError("Semantic segmentation dataset should not have 'annotations'.") 159 | 160 | # Prepare per-category binary masks 161 | if sem_seg_gt is not None: 162 | sem_seg_gt = sem_seg_gt.numpy() 163 | instances = Instances(image_shape) 164 | classes = np.unique(sem_seg_gt) 165 | # remove ignored region 166 | classes = classes[classes != self.ignore_label] 167 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 168 | 169 | masks = [] 170 | for class_id in classes: 171 | masks.append(sem_seg_gt == class_id) 172 | 173 | if len(masks) == 0: 174 | # Some image does not have annotation (all ignored) 175 | instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])) 176 | else: 177 | masks = BitMasks( 178 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 179 | ) 180 | instances.gt_masks = masks.tensor 181 | 182 | dataset_dict["instances"] = instances 183 | 184 | return dataset_dict 185 | -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | import fvcore.nn.weight_init as weight_init 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from detectron2.config import configurable 9 | from detectron2.layers import Conv2d 10 | from detectron2.utils.registry import Registry 11 | 12 | from .position_encoding import PositionEmbeddingSine 13 | from .transformer import Transformer 14 | 15 | 16 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") 17 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """ 18 | Registry for transformer module in MaskFormer. 19 | """ 20 | 21 | 22 | def build_transformer_decoder(cfg, in_channels, mask_classification=True): 23 | """ 24 | Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. 25 | """ 26 | name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME 27 | return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) 28 | 29 | 30 | @TRANSFORMER_DECODER_REGISTRY.register() 31 | class StandardTransformerDecoder(nn.Module): 32 | @configurable 33 | def __init__( 34 | self, 35 | in_channels, 36 | mask_classification=True, 37 | *, 38 | num_classes: int, 39 | hidden_dim: int, 40 | num_queries: int, 41 | nheads: int, 42 | dropout: float, 43 | dim_feedforward: int, 44 | enc_layers: int, 45 | dec_layers: int, 46 | pre_norm: bool, 47 | deep_supervision: bool, 48 | mask_dim: int, 49 | enforce_input_project: bool, 50 | ): 51 | """ 52 | NOTE: this interface is experimental. 53 | Args: 54 | in_channels: channels of the input features 55 | mask_classification: whether to add mask classifier or not 56 | num_classes: number of classes 57 | hidden_dim: Transformer feature dimension 58 | num_queries: number of queries 59 | nheads: number of heads 60 | dropout: dropout in Transformer 61 | dim_feedforward: feature dimension in feedforward network 62 | enc_layers: number of Transformer encoder layers 63 | dec_layers: number of Transformer decoder layers 64 | pre_norm: whether to use pre-LayerNorm or not 65 | deep_supervision: whether to add supervision to every decoder layers 66 | mask_dim: mask feature dimension 67 | enforce_input_project: add input project 1x1 conv even if input 68 | channels and hidden dim is identical 69 | """ 70 | super().__init__() 71 | 72 | self.mask_classification = mask_classification 73 | 74 | # positional encoding 75 | N_steps = hidden_dim // 2 76 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 77 | 78 | transformer = Transformer( 79 | d_model=hidden_dim, 80 | dropout=dropout, 81 | nhead=nheads, 82 | dim_feedforward=dim_feedforward, 83 | num_encoder_layers=enc_layers, 84 | num_decoder_layers=dec_layers, 85 | normalize_before=pre_norm, 86 | return_intermediate_dec=deep_supervision, 87 | ) 88 | 89 | self.num_queries = num_queries 90 | self.transformer = transformer 91 | hidden_dim = transformer.d_model 92 | 93 | self.query_embed = nn.Embedding(num_queries, hidden_dim) 94 | 95 | if in_channels != hidden_dim or enforce_input_project: 96 | self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) 97 | weight_init.c2_xavier_fill(self.input_proj) 98 | else: 99 | self.input_proj = nn.Sequential() 100 | self.aux_loss = deep_supervision 101 | 102 | # output FFNs 103 | if self.mask_classification: 104 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1) 105 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) 106 | 107 | @classmethod 108 | def from_config(cls, cfg, in_channels, 
mask_classification): 109 | ret = {} 110 | ret["in_channels"] = in_channels 111 | ret["mask_classification"] = mask_classification 112 | 113 | ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES 114 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM 115 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES 116 | # Transformer parameters: 117 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS 118 | ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT 119 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD 120 | ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS 121 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS 122 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM 123 | ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION 124 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ 125 | 126 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 127 | 128 | return ret 129 | 130 | def forward(self, x, mask_features, mask=None): 131 | if mask is not None: 132 | mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 133 | pos = self.pe_layer(x, mask) 134 | 135 | src = x 136 | hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos) 137 | 138 | if self.mask_classification: 139 | outputs_class = self.class_embed(hs) 140 | out = {"pred_logits": outputs_class[-1]} 141 | else: 142 | out = {} 143 | 144 | if self.aux_loss: 145 | # [l, bs, queries, embed] 146 | mask_embed = self.mask_embed(hs) 147 | outputs_seg_masks = torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) 148 | out["pred_masks"] = outputs_seg_masks[-1] 149 | out["aux_outputs"] = self._set_aux_loss( 150 | outputs_class if self.mask_classification else None, outputs_seg_masks 151 | ) 152 | else: 153 | # FIXME h_boxes takes the last one computed, keep this in mind 154 | # [bs, queries, embed] 155 | mask_embed = self.mask_embed(hs[-1]) 156 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 157 | out["pred_masks"] = outputs_seg_masks 158 | return out 159 | 160 | @torch.jit.unused 161 | def _set_aux_loss(self, outputs_class, outputs_seg_masks): 162 | # this is a workaround to make torchscript happy, as torchscript 163 | # doesn't support dictionary with non-homogeneous values, such 164 | # as a dict having both a Tensor and a list. 165 | if self.mask_classification: 166 | return [ 167 | {"pred_logits": a, "pred_masks": b} 168 | for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) 169 | ] 170 | else: 171 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] 172 | 173 | 174 | class MLP(nn.Module): 175 | """Very simple multi-layer perceptron (also called FFN)""" 176 | 177 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 178 | super().__init__() 179 | self.num_layers = num_layers 180 | h = [hidden_dim] * (num_layers - 1) 181 | self.layers = nn.ModuleList( 182 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 183 | ) 184 | 185 | def forward(self, x): 186 | for i, layer in enumerate(self.layers): 187 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 188 | return x 189 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py 3 | import copy 4 | import logging 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.data.transforms import TransformGen 13 | from detectron2.structures import BitMasks, Instances 14 | 15 | from pycocotools import mask as coco_mask 16 | 17 | __all__ = ["COCOInstanceNewBaselineDatasetMapper"] 18 | 19 | 20 | def convert_coco_poly_to_mask(segmentations, height, width): 21 | masks = [] 22 | for polygons in segmentations: 23 | rles = coco_mask.frPyObjects(polygons, height, width) 24 | mask = coco_mask.decode(rles) 25 | if len(mask.shape) < 3: 26 | mask = mask[..., None] 27 | mask = torch.as_tensor(mask, dtype=torch.uint8) 28 | mask = mask.any(dim=2) 29 | masks.append(mask) 30 | if masks: 31 | masks = torch.stack(masks, dim=0) 32 | else: 33 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 34 | return masks 35 | 36 | 37 | def build_transform_gen(cfg, is_train): 38 | """ 39 | Create a list of default :class:`Augmentation` from config. 40 | Now it includes resizing and flipping. 41 | Returns: 42 | list[Augmentation] 43 | """ 44 | assert is_train, "Only support training augmentation" 45 | image_size = cfg.INPUT.IMAGE_SIZE 46 | min_scale = cfg.INPUT.MIN_SCALE 47 | max_scale = cfg.INPUT.MAX_SCALE 48 | 49 | augmentation = [] 50 | 51 | if cfg.INPUT.RANDOM_FLIP != "none": 52 | augmentation.append( 53 | T.RandomFlip( 54 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 55 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 56 | ) 57 | ) 58 | 59 | augmentation.extend([ 60 | T.ResizeScale( 61 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 62 | ), 63 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 64 | ]) 65 | 66 | return augmentation 67 | 68 | 69 | # This is specifically designed for the COCO dataset. 70 | class COCOInstanceNewBaselineDatasetMapper: 71 | """ 72 | A callable which takes a dataset dict in Detectron2 Dataset format, 73 | and map it into a format used by MaskFormer. 74 | 75 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 76 | 77 | The callable currently does the following: 78 | 79 | 1. Read the image from "file_name" 80 | 2. Applies geometric transforms to the image and annotation 81 | 3. Find and applies suitable cropping to the image and annotation 82 | 4. Prepare image and annotation to Tensors 83 | """ 84 | 85 | @configurable 86 | def __init__( 87 | self, 88 | is_train=True, 89 | *, 90 | tfm_gens, 91 | image_format, 92 | ): 93 | """ 94 | NOTE: this interface is experimental. 95 | Args: 96 | is_train: for training or inference 97 | augmentations: a list of augmentations or deterministic transforms to apply 98 | tfm_gens: data augmentation 99 | image_format: an image format supported by :func:`detection_utils.read_image`. 
100 | """ 101 | self.tfm_gens = tfm_gens 102 | logging.getLogger(__name__).info( 103 | "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens)) 104 | ) 105 | 106 | self.img_format = image_format 107 | self.is_train = is_train 108 | 109 | @classmethod 110 | def from_config(cls, cfg, is_train=True): 111 | # Build augmentation 112 | tfm_gens = build_transform_gen(cfg, is_train) 113 | 114 | ret = { 115 | "is_train": is_train, 116 | "tfm_gens": tfm_gens, 117 | "image_format": cfg.INPUT.FORMAT, 118 | } 119 | return ret 120 | 121 | def __call__(self, dataset_dict): 122 | """ 123 | Args: 124 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 125 | 126 | Returns: 127 | dict: a format that builtin models in detectron2 accept 128 | """ 129 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 130 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 131 | utils.check_image_size(dataset_dict, image) 132 | 133 | # TODO: get padding mask 134 | # by feeding a "segmentation mask" to the same transforms 135 | padding_mask = np.ones(image.shape[:2]) 136 | 137 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 138 | # the crop transformation has default padding value 0 for segmentation 139 | padding_mask = transforms.apply_segmentation(padding_mask) 140 | padding_mask = ~ padding_mask.astype(bool) 141 | 142 | image_shape = image.shape[:2] # h, w 143 | 144 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 145 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 146 | # Therefore it's important to use torch.Tensor. 147 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 148 | dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask)) 149 | 150 | if not self.is_train: 151 | # USER: Modify this if you want to keep them for some reason. 152 | dataset_dict.pop("annotations", None) 153 | return dataset_dict 154 | 155 | if "annotations" in dataset_dict: 156 | # USER: Modify this if you want to keep them for some reason. 157 | for anno in dataset_dict["annotations"]: 158 | # Let's always keep mask 159 | # if not self.mask_on: 160 | # anno.pop("segmentation", None) 161 | anno.pop("keypoints", None) 162 | 163 | # USER: Implement additional transformations if you have other types of data 164 | annos = [ 165 | utils.transform_instance_annotations(obj, transforms, image_shape) 166 | for obj in dataset_dict.pop("annotations") 167 | if obj.get("iscrowd", 0) == 0 168 | ] 169 | # NOTE: does not support BitMask due to augmentation 170 | # Current BitMask cannot handle empty objects 171 | instances = utils.annotations_to_instances(annos, image_shape) 172 | # After transforms such as cropping are applied, the bounding box may no longer 173 | # tightly bound the object. As an example, imagine a triangle object 174 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight 175 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to 176 | # the intersection of original bounding box and the cropping box. 
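            # Worked check of the example above: inside the crop, the triangle becomes the region
            # {x >= 1, y >= 0, x + y <= 2}, so y never exceeds 1 and the tight box is [(1,0),(2,1)]
            # rather than the clipped original box [(1,0),(2,2)]; gt_boxes is therefore recomputed
            # from gt_masks below.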
177 |             instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
178 |             # Need to filter empty instances first (due to augmentation)
179 |             instances = utils.filter_empty_instances(instances)
180 |             # Generate masks from polygon
181 |             h, w = instances.image_size
182 |             # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
183 |             if hasattr(instances, 'gt_masks'):
184 |                 gt_masks = instances.gt_masks
185 |                 gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
186 |                 instances.gt_masks = gt_masks
187 |             dataset_dict["instances"] = instances
188 | 
189 |         return dataset_dict
190 | 
--------------------------------------------------------------------------------
/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu:
--------------------------------------------------------------------------------
1 | /*!
2 | **************************************************************************************************
3 | * Deformable DETR
4 | * Copyright (c) 2020 SenseTime. All Rights Reserved.
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6 | **************************************************************************************************
7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8 | **************************************************************************************************
9 | */
10 | 
11 | /*!
12 | * Copyright (c) Facebook, Inc. and its affiliates.
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
14 | */
15 | 
16 | #include <vector>
17 | #include "cuda/ms_deform_im2col_cuda.cuh"
18 | 
19 | #include <ATen/ATen.h>
20 | #include <ATen/cuda/CUDAContext.h>
21 | #include <cuda.h>
22 | #include <cuda_runtime.h>
23 | 
24 | 
25 | at::Tensor ms_deform_attn_cuda_forward(
26 |     const at::Tensor &value, 
27 |     const at::Tensor &spatial_shapes,
28 |     const at::Tensor &level_start_index,
29 |     const at::Tensor &sampling_loc,
30 |     const at::Tensor &attn_weight,
31 |     const int im2col_step)
32 | {
33 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
34 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
35 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
36 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
37 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
38 | 
39 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
40 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
41 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
42 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
43 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
44 | 
45 |     const int batch = value.size(0);
46 |     const int spatial_size = value.size(1);
47 |     const int num_heads = value.size(2);
48 |     const int channels = value.size(3);
49 | 
50 |     const int num_levels = spatial_shapes.size(0);
51 | 
52 |     const int num_query = sampling_loc.size(1);
53 |     const int num_point = sampling_loc.size(4);
54 | 
55 |     const int im2col_step_ = std::min(batch, im2col_step);
56 | 
57 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
58 | 
59 |     auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
60 | 
61 |     const int batch_n = im2col_step_;
62 |     auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
63 |     auto per_value_size = spatial_size * num_heads * channels;
64 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
65 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
66 |     for (int n = 0; n < batch/im2col_step_; ++n)
67 |     {
68 |         auto columns = output_n.select(0, n);
69 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
70 |             ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
71 |                 value.data<scalar_t>() + n * im2col_step_ * per_value_size,
72 |                 spatial_shapes.data<int64_t>(),
73 |                 level_start_index.data<int64_t>(),
74 |                 sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
75 |                 attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
76 |                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
77 |                 columns.data<scalar_t>());
78 | 
79 |         }));
80 |     }
81 | 
82 |     output = output.view({batch, num_query, num_heads*channels});
83 | 
84 |     return output;
85 | }
86 | 
87 | 
88 | std::vector<at::Tensor> ms_deform_attn_cuda_backward(
89 |     const at::Tensor &value, 
90 |     const at::Tensor &spatial_shapes,
91 |     const at::Tensor &level_start_index,
92 |     const at::Tensor &sampling_loc,
93 |     const at::Tensor &attn_weight,
94 |     const at::Tensor &grad_output,
95 |     const int im2col_step)
96 | {
97 | 
98 |     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
99 |     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
100 |     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
101 |     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
102 |     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
103 |     AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
104 | 
105 |     AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
106 |     AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
107 |     AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
108 |     AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
109 |     AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
110 |     AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
111 | 
112 |     const int batch = value.size(0);
113 |     const int spatial_size = value.size(1);
114 |     const int num_heads = value.size(2);
115 |     const int channels = value.size(3);
116 | 
117 |     const int num_levels = spatial_shapes.size(0);
118 | 
119 |     const int num_query = sampling_loc.size(1);
120 |     const int num_point = sampling_loc.size(4);
121 | 
122 |     const int im2col_step_ = std::min(batch, im2col_step);
123 | 
124 |     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
125 | 
126 |     auto grad_value = at::zeros_like(value);
127 |     auto grad_sampling_loc = at::zeros_like(sampling_loc);
128 |     auto grad_attn_weight = at::zeros_like(attn_weight);
129 | 
130 |     const int batch_n = im2col_step_;
131 |     auto per_value_size = spatial_size * num_heads * channels;
132 |     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
133 |     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
134 |     auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
135 | 
136 |     for (int n = 0; n < batch/im2col_step_; ++n)
137 |     {
138 |         auto grad_output_g = grad_output_n.select(0, n);
139 |         AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
140 |             ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
141 |                 grad_output_g.data<scalar_t>(),
142 |                 value.data<scalar_t>() + n * im2col_step_ * per_value_size,
143 |                 spatial_shapes.data<int64_t>(),
144 |                 level_start_index.data<int64_t>(),
145 |                 sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
146 |                 attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
147 |                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
148 |                 grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
149 |                 grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
150 |                 grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
151 | 
152 |         }));
153 |     }
154 | 
155 |     return {
156 |         grad_value, grad_sampling_loc, grad_attn_weight
157 |     };
158 | }
--------------------------------------------------------------------------------
/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import json
3 | import os
4 | 
5 | from detectron2.data import DatasetCatalog, MetadataCatalog
6 | from detectron2.data.datasets import load_sem_seg
7 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
8 | from detectron2.utils.file_io import PathManager
9 | 
10 | 
11 | _PREDEFINED_SPLITS_COCO_PANOPTIC = {
12 |     "coco_2017_train_panoptic": (
13 |         # This is the original panoptic annotation directory
14 |         "coco/panoptic_train2017",
15 |         "coco/annotations/panoptic_train2017.json",
16 |         # This directory contains semantic annotations that are
17 |         # converted from panoptic annotations.
18 |         # It is used by PanopticFPN.
19 |         # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
20 |         # to create these directories.
21 |         "coco/panoptic_semseg_train2017",
22 |     ),
23 |     "coco_2017_val_panoptic": (
24 |         "coco/panoptic_val2017",
25 |         "coco/annotations/panoptic_val2017.json",
26 |         "coco/panoptic_semseg_val2017",
27 |     ),
28 | }
29 | 
30 | 
31 | def get_metadata():
32 |     meta = {}
33 |     # The following metadata maps contiguous id from [0, #thing categories +
34 |     # #stuff categories) to their names and colors. We have two copies of the
35 |     # same name and color under "thing_*" and "stuff_*" because the current
36 |     # visualization function in D2 handles thing and stuff classes differently
37 |     # due to some heuristic used in Panoptic FPN. We keep the same naming to
38 |     # enable reusing existing visualization functions.
39 |     thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
40 |     thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
41 |     stuff_classes = [k["name"] for k in COCO_CATEGORIES]
42 |     stuff_colors = [k["color"] for k in COCO_CATEGORIES]
43 | 
44 |     meta["thing_classes"] = thing_classes
45 |     meta["thing_colors"] = thing_colors
46 |     meta["stuff_classes"] = stuff_classes
47 |     meta["stuff_colors"] = stuff_colors
48 | 
49 |     # Convert category id for training:
50 |     #   category id: like semantic segmentation, it is the class id for each
51 |     #   pixel. Since there are some classes not used in evaluation, the category
52 |     #   id is not always contiguous and thus we have two sets of category ids:
53 |     #       - original category id: category id in the original dataset, mainly
54 |     #           used for evaluation.
55 | # - contiguous category id: [0, #classes), in order to train the linear 56 | # softmax classifier. 57 | thing_dataset_id_to_contiguous_id = {} 58 | stuff_dataset_id_to_contiguous_id = {} 59 | 60 | for i, cat in enumerate(COCO_CATEGORIES): 61 | if cat["isthing"]: 62 | thing_dataset_id_to_contiguous_id[cat["id"]] = i 63 | # else: 64 | # stuff_dataset_id_to_contiguous_id[cat["id"]] = i 65 | 66 | # in order to use sem_seg evaluator 67 | stuff_dataset_id_to_contiguous_id[cat["id"]] = i 68 | 69 | meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id 70 | meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id 71 | 72 | return meta 73 | 74 | 75 | def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): 76 | """ 77 | Args: 78 | image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". 79 | gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". 80 | json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". 81 | Returns: 82 | list[dict]: a list of dicts in Detectron2 standard format. (See 83 | `Using Custom Datasets `_ ) 84 | """ 85 | 86 | def _convert_category_id(segment_info, meta): 87 | if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: 88 | segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ 89 | segment_info["category_id"] 90 | ] 91 | segment_info["isthing"] = True 92 | else: 93 | segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ 94 | segment_info["category_id"] 95 | ] 96 | segment_info["isthing"] = False 97 | return segment_info 98 | 99 | with PathManager.open(json_file) as f: 100 | json_info = json.load(f) 101 | 102 | ret = [] 103 | for ann in json_info["annotations"]: 104 | image_id = int(ann["image_id"]) 105 | # TODO: currently we assume image and label has the same filename but 106 | # different extension, and images have extension ".jpg" for COCO. Need 107 | # to make image extension a user-provided argument if we extend this 108 | # function to support other COCO-like datasets. 109 | image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") 110 | label_file = os.path.join(gt_dir, ann["file_name"]) 111 | sem_label_file = os.path.join(semseg_dir, ann["file_name"]) 112 | segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] 113 | ret.append( 114 | { 115 | "file_name": image_file, 116 | "image_id": image_id, 117 | "pan_seg_file_name": label_file, 118 | "sem_seg_file_name": sem_label_file, 119 | "segments_info": segments_info, 120 | } 121 | ) 122 | assert len(ret), f"No images found in {image_dir}!" 
123 | assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] 124 | assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] 125 | assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] 126 | return ret 127 | 128 | 129 | def register_coco_panoptic_annos_sem_seg( 130 | name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json 131 | ): 132 | panoptic_name = name 133 | delattr(MetadataCatalog.get(panoptic_name), "thing_classes") 134 | delattr(MetadataCatalog.get(panoptic_name), "thing_colors") 135 | MetadataCatalog.get(panoptic_name).set( 136 | thing_classes=metadata["thing_classes"], 137 | thing_colors=metadata["thing_colors"], 138 | # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"], 139 | ) 140 | 141 | # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg" 142 | semantic_name = name + "_with_sem_seg" 143 | DatasetCatalog.register( 144 | semantic_name, 145 | lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, sem_seg_root, metadata), 146 | ) 147 | MetadataCatalog.get(semantic_name).set( 148 | sem_seg_root=sem_seg_root, 149 | panoptic_root=panoptic_root, 150 | image_root=image_root, 151 | panoptic_json=panoptic_json, 152 | json_file=instances_json, 153 | evaluator_type="coco_panoptic_seg", 154 | ignore_label=255, 155 | label_divisor=1000, 156 | **metadata, 157 | ) 158 | 159 | 160 | def register_all_coco_panoptic_annos_sem_seg(root): 161 | for ( 162 | prefix, 163 | (panoptic_root, panoptic_json, semantic_root), 164 | ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): 165 | prefix_instances = prefix[: -len("_panoptic")] 166 | instances_meta = MetadataCatalog.get(prefix_instances) 167 | image_root, instances_json = instances_meta.image_root, instances_meta.json_file 168 | 169 | register_coco_panoptic_annos_sem_seg( 170 | prefix, 171 | get_metadata(), 172 | image_root, 173 | os.path.join(root, panoptic_root), 174 | os.path.join(root, panoptic_json), 175 | os.path.join(root, semantic_root), 176 | instances_json, 177 | ) 178 | 179 | 180 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 181 | register_all_coco_panoptic_annos_sem_seg(_root) 182 | -------------------------------------------------------------------------------- /uni_dvps/data_video/datasets/semkitti_dvps.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import io 3 | import json 4 | import logging 5 | import numpy as np 6 | import os 7 | import tqdm 8 | import pycocotools.mask as mask_util 9 | from fvcore.common.file_io import PathManager 10 | from fvcore.common.timer import Timer 11 | 12 | from detectron2.structures import Boxes, BoxMode, PolygonMasks 13 | from detectron2.data import DatasetCatalog, MetadataCatalog 14 | 15 | """ 16 | This file contains functions to parse SemKITTI-DVPS dataset into dicts in "Detectron2 format". 
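
The gt_json consumed by load_semkitti_dvps_json below is, roughly, a nested mapping of
video id -> frame id -> {"image", "class", "instance", "depth", "height", "width"},
where frame id "000000" starts a new video record for the training split.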
17 | """ 18 | logger = logging.getLogger(__name__) 19 | __all__ = ["load_semkitti_dvps_json", "register_semkitti_dvps"] 20 | 21 | SEMKITTI_CATEGORIES = [ 22 | {"color": (245, 150, 100), "isthing": 1, "id": 10, "trainId": 0, "name": "car"}, 23 | {"color": (245, 230, 100), "isthing": 1, "id": 11, "trainId": 1, "name": "bicycle"}, 24 | {"color": (150, 60, 30), "isthing": 1, "id": 15, "trainId": 2, "name": "motorcycle"}, 25 | {"color": (180, 30, 80), "isthing": 1, "id": 18, "trainId": 3, "name": "truck"}, 26 | {"color": (255, 0, 0), "isthing": 1, "id": 20, "trainId": 4, "name": "other-vehicle"}, 27 | {"color": ( 30, 30, 255), "isthing": 1, "id": 30, "trainId": 5, "name": "person"}, 28 | {"color": (200, 40, 255), "isthing": 1, "id": 31, "trainId": 6, "name": "bicyclist"}, 29 | {"color": ( 90, 30, 150), "isthing": 1, "id": 32, "trainId": 7, "name": "motorcyclist"}, 30 | 31 | {"color": (255, 0, 255), "isthing": 0, "id": 40, "trainId": 8, "name": "road"}, 32 | {"color": (255, 150, 255), "isthing": 0, "id": 44, "trainId": 9, "name": "parking"}, 33 | {"color": ( 75, 0, 75), "isthing": 0, "id": 48, "trainId": 10, "name": "sidewalk"}, 34 | {"color": ( 75, 0, 175), "isthing": 0, "id": 49, "trainId": 11, "name": "other-ground"}, 35 | {"color": ( 0, 200, 255), "isthing": 0, "id": 50, "trainId": 12, "name": "building"}, 36 | {"color": ( 50, 120, 255), "isthing": 0, "id": 51, "trainId": 13, "name": "fence"}, 37 | {"color": ( 0, 175, 0), "isthing": 0, "id": 70, "trainId": 14, "name": "vegetation"}, 38 | {"color": ( 0, 60, 135), "isthing": 0, "id": 71, "trainId": 15, "name": "trunk"}, 39 | {"color": ( 80, 240, 150), "isthing": 0, "id": 72, "trainId": 16, "name": "terrain"}, 40 | {"color": (150, 240, 255), "isthing": 0, "id": 80, "trainId": 17, "name": "pole"}, 41 | {"color": ( 0, 0, 255), "isthing": 0, "id": 81, "trainId": 18, "name": "traffic-sign"}, 42 | ] 43 | 44 | def _get_semkitti_dvps_meta(): 45 | thing_ids = [k["trainId"] for k in SEMKITTI_CATEGORIES if k["isthing"] == 1] 46 | thing_classes = [k["name"] for k in SEMKITTI_CATEGORIES if k["isthing"] == 1] 47 | thing_colors = [k["color"] for k in SEMKITTI_CATEGORIES if k["isthing"] == 1] 48 | stuff_ids = [k["trainId"] for k in SEMKITTI_CATEGORIES if k["isthing"] == 0] 49 | stuff_classes = [k["name"] for k in SEMKITTI_CATEGORIES if k["isthing"] == 0] 50 | stuff_colors = [k["color"] for k in SEMKITTI_CATEGORIES if k["isthing"] == 0] 51 | 52 | assert len(thing_ids) == 8, len(thing_ids) 53 | assert len(stuff_ids) == 11, len(stuff_ids) 54 | 55 | # Mapping from the incontiguous SEMKITTI_DVPS category id to an id in [0, 10] 56 | thing_train_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} 57 | stuff_train_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} 58 | 59 | ret = { 60 | "thing_ids": thing_ids, 61 | "thing_classes": thing_classes, 62 | "thing_colors": thing_colors, 63 | "thing_train_id_to_contiguous_id": thing_train_id_to_contiguous_id, 64 | "stuff_ids": stuff_ids, 65 | "stuff_classes": stuff_classes, 66 | "stuff_colors": stuff_colors, 67 | "stuff_train_id_to_contiguous_id": stuff_train_id_to_contiguous_id 68 | } 69 | 70 | return ret 71 | 72 | def load_semkitti_dvps_json(gt_json, image_dir, gt_dir, meta, name): 73 | assert os.path.exists(gt_json), gt_json+" not exists" 74 | with open(gt_json) as f: 75 | file_dicts = json.load(f) 76 | 77 | dataset_dicts = [] 78 | if 'train' in name: 79 | for vid in file_dicts.keys(): 80 | for fid in file_dicts[vid].keys(): 81 | if fid == "000000": 82 | record = {} 83 | record["video_id"] = vid 84 | 
record["height"] = file_dicts[vid][fid]["height"] 85 | record["width"] = file_dicts[vid][fid]["width"] 86 | record["file_names"] = [os.path.join(image_dir, file_dicts[vid][fid]["image"])] 87 | record["class_file_names"] = [os.path.join(image_dir, file_dicts[vid][fid]["class"])] 88 | record["instance_file_names"] = [os.path.join(image_dir, file_dicts[vid][fid]["instance"])] 89 | record["depth_file_names"] = [os.path.join(image_dir, file_dicts[vid][fid]["depth"])] 90 | dataset_dicts.append(record) 91 | else: 92 | dataset_dicts[-1]["file_names"].append(os.path.join(image_dir, file_dicts[vid][fid]["image"])) 93 | dataset_dicts[-1]["class_file_names"].append(os.path.join(image_dir, file_dicts[vid][fid]["class"])) 94 | dataset_dicts[-1]["instance_file_names"].append(os.path.join(image_dir, file_dicts[vid][fid]["instance"])) 95 | dataset_dicts[-1]["depth_file_names"].append(os.path.join(image_dir, file_dicts[vid][fid]["depth"])) 96 | 97 | elif 'val' in name: 98 | len_vid = int(name.split('val')[1]) 99 | for vid in file_dicts.keys(): 100 | for fid in file_dicts[vid].keys(): 101 | if int(fid)+len_vid > len(file_dicts[vid]): 102 | continue 103 | for i in range(len_vid): 104 | if i == 0: 105 | record = {} 106 | record["video_id"] = vid 107 | record["height"] = file_dicts[vid][fid]["height"] 108 | record["width"] = file_dicts[vid][fid]["width"] 109 | record["file_names"] = [os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][fid]["image"])] 110 | record["class_file_names"] = [os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][fid]["class"])] 111 | record["instance_file_names"] = [os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][fid]["instance"])] 112 | record["depth_file_names"] = [os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][fid]["depth"])] 113 | dataset_dicts.append(record) 114 | i += 1 115 | else: 116 | next_fid = '{0:06d}'.format(int(fid)+i) 117 | dataset_dicts[-1]["file_names"].append(os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][next_fid]["image"])) 118 | dataset_dicts[-1]["class_file_names"].append(os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][next_fid]["class"])) 119 | dataset_dicts[-1]["instance_file_names"].append(os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][next_fid]["instance"])) 120 | dataset_dicts[-1]["depth_file_names"].append(os.path.join(image_dir, str(fid)+'_'+file_dicts[vid][next_fid]["depth"])) 121 | i += 1 122 | 123 | 124 | # logger.info("Loaded {} images from {}".format(len(file_dicts), image_dir)) 125 | return dataset_dicts 126 | 127 | 128 | def register_semkitti_dvps(name, meta, gt_json, image_dir, gt_dir): 129 | """ 130 | Register a dataset in Cityscapes_DVPS's json annotation format for DVPS. 131 | """ 132 | assert isinstance(name, str), name 133 | assert isinstance(gt_json, (str, os.PathLike)), gt_json 134 | assert isinstance(image_dir, (str, os.PathLike)), image_dir 135 | assert isinstance(gt_dir, (str, os.PathLike)), gt_dir 136 | 137 | DatasetCatalog.register(name, lambda: load_semkitti_dvps_json(gt_json, image_dir, gt_dir, meta, name)) 138 | MetadataCatalog.get(name).set( 139 | panoptic_root=gt_dir, 140 | image_root=image_dir, 141 | gt_dir=gt_dir, 142 | evaluator_type="semkitti_dvps", 143 | ignore_label=255, 144 | **meta, 145 | ) 146 | --------------------------------------------------------------------------------