├── util ├── __init__.py ├── box_ops.py └── stanza_utils.py ├── models ├── albef │ ├── models │ │ └── __init__.py │ ├── config_bert.json │ └── VL_Transformer_ITM.py ├── vlp_model_builder.py ├── helpers.py └── vlp_model.py ├── mask2former ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── utils │ ├── __init__.py │ └── misc.py ├── modeling │ ├── backbone │ │ └── __init__.py │ ├── meta_arch │ │ ├── __init__.py │ │ └── mask_former_head.py │ ├── pixel_decoder │ │ ├── __init__.py │ │ └── ops │ │ │ ├── make.sh │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn.py │ │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── ms_deform_attn_func.py │ │ │ ├── src │ │ │ ├── vision.cpp │ │ │ ├── cuda │ │ │ │ └── ms_deform_attn_cuda.h │ │ │ ├── cpu │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ └── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn.h │ │ │ ├── setup.py │ │ │ └── test.py │ ├── transformer_decoder │ │ ├── __init__.py │ │ ├── position_encoding.py │ │ └── maskformer_transformer_decoder.py │ └── __init__.py ├── __init__.py ├── test_time_augmentation.py └── config.py ├── image.png ├── configs ├── ade20k │ ├── semantic-segmentation │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── swin │ │ │ ├── maskformer2_swin_tiny_bs16_160k.yaml │ │ │ ├── maskformer2_swin_small_bs16_160k.yaml │ │ │ ├── maskformer2_swin_base_384_bs16_160k_res640.yaml │ │ │ ├── maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml │ │ │ └── maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ └── Base-ADE20K-SemanticSegmentation.yaml │ ├── instance-segmentation │ │ ├── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ └── Base-ADE20K-InstanceSegmentation.yaml │ └── panoptic-segmentation │ │ ├── swin │ │ └── maskformer2_swin_large_IN21k_384_bs16_160k.yaml │ │ ├── maskformer2_R50_bs16_160k.yaml │ │ └── Base-ADE20K-PanopticSegmentation.yaml ├── coco │ ├── instance-segmentation │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ ├── swin │ │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_tiny_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ ├── Base-COCO-InstanceSegmentation.yaml │ │ └── maskformer2_R50_bs16_50ep.yaml │ └── panoptic-segmentation │ │ ├── maskformer2_R101_bs16_50ep.yaml │ │ ├── swin │ │ ├── maskformer2_swin_small_bs16_50ep.yaml │ │ ├── maskformer2_swin_tiny_bs16_50ep.yaml │ │ ├── maskformer2_swin_base_384_bs16_50ep.yaml │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.yaml │ │ └── maskformer2_swin_large_IN21k_384_bs16_100ep.yaml │ │ ├── Base-COCO-PanopticSegmentation.yaml │ │ └── maskformer2_R50_bs16_50ep.yaml ├── cityscapes │ ├── instance-segmentation │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── swin │ │ │ ├── maskformer2_swin_tiny_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ └── Base-Cityscapes-InstanceSegmentation.yaml │ ├── panoptic-segmentation │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── swin │ │ │ ├── maskformer2_swin_tiny_bs16_90k.yaml │ │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ └── 
Base-Cityscapes-PanopticSegmentation.yaml │ └── semantic-segmentation │ │ ├── maskformer2_R101_bs16_90k.yaml │ │ ├── swin │ │ ├── maskformer2_swin_tiny_bs16_90k.yaml │ │ ├── maskformer2_swin_small_bs16_90k.yaml │ │ ├── maskformer2_swin_base_IN21k_384_bs16_90k.yaml │ │ └── maskformer2_swin_large_IN21k_384_bs16_90k.yaml │ │ ├── maskformer2_R50_bs16_90k.yaml │ │ └── Base-Cityscapes-SemanticSegmentation.yaml ├── youtubevis_2019 │ ├── video_maskformer2_R101_bs16_8ep.yaml │ ├── swin │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ └── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ └── video_maskformer2_R50_bs16_8ep.yaml ├── youtubevis_2021 │ ├── video_maskformer2_R101_bs16_8ep.yaml │ ├── swin │ │ ├── video_maskformer2_swin_small_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_tiny_bs16_8ep.yaml │ │ ├── video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml │ │ └── video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml │ ├── Base-YouTubeVIS-VideoInstanceSegmentation.yaml │ └── video_maskformer2_R50_bs16_8ep.yaml ├── med_config.json ├── mapillary-vistas │ ├── panoptic-segmentation │ │ ├── swin │ │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ │ ├── maskformer_R50_bs16_300k.yaml │ │ └── Base-MapillaryVistas-PanopticSegmentation.yaml │ └── semantic-segmentation │ │ ├── swin │ │ └── maskformer2_swin_large_IN21k_384_bs16_300k.yaml │ │ ├── maskformer2_R50_bs16_300k.yaml │ │ └── Base-MapillaryVistas-SemanticSegmentation.yaml └── freesolo │ ├── freesolo_30k.yaml │ └── freemask.yaml ├── dataset ├── concat_dataset.py ├── __init__.py ├── refexp_eval.py ├── categories.py ├── image_to_seq_augmenter.py ├── a2d_eval.py ├── coco.py ├── samplers.py └── jhmdb.py ├── demo ├── Logger.py ├── postproc.py └── itergradcam.py ├── evaluator.py ├── .gitignore └── README.md /util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/albef/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VoyageWang/IteRPrimE/HEAD/image.png -------------------------------------------------------------------------------- /mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .maskformer_transformer_decoder import StandardTransformerDecoder 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .backbone.swin import D2SwinTransformer 3 | from .pixel_decoder.fpn import BasePixelDecoder 4 | from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 5 | from .meta_arch.mask_former_head import MaskFormerHead 6 | from .meta_arch.per_pixel_baseline import PerPixelBaselineHead, PerPixelBaselinePlusHead 7 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- 
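Note: the maskformer2_* YAML files above (and throughout configs/) are detectron2 configs chained together via the _BASE_ key. A minimal loading sketch, mirroring the upstream Mask2Former scripts (detectron2 is assumed to be installed; add_maskformer2_config is the hook exported by this repo's mask2former/__init__.py, and the deeplab hook follows the upstream training script and may not be strictly required here):

    from detectron2.config import get_cfg
    from detectron2.modeling import build_model
    from detectron2.projects.deeplab import add_deeplab_config
    from mask2former import add_maskformer2_config  # exported in mask2former/__init__.py below

    cfg = get_cfg()
    add_deeplab_config(cfg)      # upstream Mask2Former registers the DeepLab keys first
    add_maskformer2_config(cfg)  # registers the MASK_FORMER / extra SEM_SEG_HEAD keys used in these files
    cfg.merge_from_file("configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml")
    # _BASE_ is resolved recursively: the R101 file above pulls in maskformer2_R50_bs16_90k.yaml and
    # Base-Cityscapes-InstanceSegmentation.yaml, then applies its own RESNETS overrides on top.
    model = build_model(cfg)     # instantiates the MaskFormer meta-architecture from the merged config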
/configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/video_maskformer2_R101_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_eba159.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | STEM_TYPE: "basic" # not used 7 | STEM_OUT_CHANNELS: 64 8 | STRIDE_IN_1X1: False 9 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 10 | # NORM: "SyncBN" 11 | RES5_MULTI_GRID: [1, 1, 1] # not used 12 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- 
/configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 
2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_tiny_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "swin_small_patch4_window7_224.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | 
-------------------------------------------------------------------------------- /configs/youtubevis_2019/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/swin/video_maskformer2_swin_small_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_1e7f22.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/swin/video_maskformer2_swin_tiny_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 96 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [3, 6, 12, 24] 9 | WINDOW_SIZE: 7 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | WEIGHTS: "model_final_86143f.pkl" 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.120, 57.375] 16 | INPUT: 17 | MIN_SIZE_TEST: 480 18 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /models/vlp_model_builder.py: 
-------------------------------------------------------------------------------- 1 | from typing import Any 2 | from .vlp_model import VLPModel 3 | from models.albef.engine import ALBEF 4 | 5 | class VLPModelBuilder: 6 | MODEL_ID_MAP = { 7 | 'ALBEF': ALBEF, 8 | 'TCL': ALBEF, 9 | } 10 | def __call__(self, model_id, **kwargs) -> VLPModel: 11 | if model_id not in self.MODEL_ID_MAP.keys(): 12 | raise ValueError('Meaningless model_id: {}, you have to choose from: {}'.format(model_id, self.MODEL_ID_MAP.keys())) 13 | return self.MODEL_ID_MAP[model_id](model_id, **kwargs) -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/swin/video_maskformer2_swin_base_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_83d103.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TEST: 480 19 | -------------------------------------------------------------------------------- /models/albef/config_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522, 19 | "fusion_layer": 6, 20 | "encoder_width": 768 21 | } -------------------------------------------------------------------------------- /configs/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } 22 | -------------------------------------------------------------------------------- 
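Note: models/albef/config_bert.json and configs/med_config.json above follow the HuggingFace BERT config schema (the med_config variant adds add_cross_attention for a BLIP-style multimodal encoder). A minimal sketch of how such a file is typically read, assuming the transformers package; the ALBEF/BLIP model code in this repo may wrap this differently:

    from transformers import BertConfig

    bert_config = BertConfig.from_json_file("models/albef/config_bert.json")
    # Standard fields (hidden_size, num_hidden_layers, ...) become regular BertConfig attributes;
    # ALBEF-specific extras such as fusion_layer and encoder_width are kept as plain attributes
    # on the config object for the custom fusion encoder to read.
    print(bert_config.num_hidden_layers, bert_config.fusion_layer)  # 12 6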
/configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 
103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_300k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 100 19 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | INPUT: 20 | MIN_SIZE_TEST: 480 21 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "../checkpoints/model_final_f07440.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/swin/video_maskformer2_swin_large_IN21k_384_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../video_maskformer2_R50_bs16_8ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | 
EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "model_final_e5f453.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | # OOM when using a larger test size 20 | # INPUT: 21 | # MIN_SIZE_TEST: 480 22 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | MASK_FORMER: 18 | NUM_OBJECT_QUERIES: 200 19 | SOLVER: 20 | STEPS: (655556, 710184) 21 | MAX_ITER: 737500 22 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | # Copyright (c) Facebook, Inc. and its affiliates. 11 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 12 | 13 | python setup.py build install 14 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn import MSDeformAttn 13 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from .ms_deform_attn_func import MSDeformAttnFunction 13 | 14 | -------------------------------------------------------------------------------- /mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from . import data # register all new datasets 3 | from . import modeling 4 | 5 | # config 6 | from .config import add_maskformer2_config 7 | 8 | # dataset loading 9 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 10 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 11 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 12 | MaskFormerInstanceDatasetMapper, 13 | ) 14 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 15 | MaskFormerPanopticDatasetMapper, 16 | ) 17 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 18 | MaskFormerSemanticDatasetMapper, 19 | ) 20 | 21 | # models 22 | from .maskformer_model import MaskFormer 23 | from .test_time_augmentation import SemanticSegmentorWithTTA 24 | 25 | # evaluation 26 | from .evaluation.instance_evaluation import InstanceSegEvaluator 27 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include "ms_deform_attn.h" 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 20 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 21 | } 22 | -------------------------------------------------------------------------------- /configs/freesolo/freesolo_30k.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "PseudoSOLOv2" 3 | PIXEL_MEAN: [123.675, 116.280, 103.530] 4 | PIXEL_STD: [58.395, 57.120, 57.375] 5 | WEIGHTS: "training_dir/pre-trained/DenseCL/densecl_r101_imagenet_200ep.pkl" 6 | MASK_ON: True 7 | BACKBONE: 8 | NAME: "build_resnet_fpn_backbone" 9 | FREEZE_AT: 0 10 | RESNETS: 11 | STRIDE_IN_1X1: False 12 | DEPTH: 101 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | FPN: 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | SOLOV2: 17 | NUM_CLASSES: 2 18 | LOSS: 19 | FOCAL_WEIGHT: 1.0 20 | DICE_WEIGHT: 1.0 21 | SOLVER: 22 | LR_SCHEDULER_NAME: "WarmupMultiStepLR" 23 | MAX_ITER: 30000 24 | STEPS: [] 25 | IMS_PER_BATCH: 16 26 | BASE_LR: 0.001 27 | WARMUP_FACTOR: 0.01 28 | WARMUP_ITERS: 1000 29 | CHECKPOINT_PERIOD: 5000 30 | DATASETS: 31 | TRAIN: ("coco_2017_train_unlabeled_densecl_r101",) 32 | TEST: ("coco_2017_val",) 33 | INPUT: 34 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 35 | MASK_FORMAT: "bitmask" 36 | FORMAT: "RGB" 37 | -------------------------------------------------------------------------------- /configs/freesolo/freemask.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "PseudoSOLOv2" 3 | PIXEL_MEAN: [123.675, 116.280, 103.530] 4 | PIXEL_STD: [58.395, 57.120, 57.375] 5 | WEIGHTS: "training_dir/pre-trained/DenseCL/densecl_r101_imagenet_200ep.pkl" 6 | MASK_ON: True 7 | BACKBONE: 8 | NAME: "build_resnet_backbone" 9 | FREEZE_AT: 0 10 | RESNETS: 11 | STRIDE_IN_1X1: False 12 | DEPTH: 101 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | FPN: 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | SOLOV2: 17 | IS_FREEMASK: True 18 | NUM_CLASSES: 2 19 | LOSS: 20 | FOCAL_WEIGHT: 1.0 21 | DICE_WEIGHT: 1.0 22 | SOLVER: 23 | LR_SCHEDULER_NAME: "WarmupMultiStepLR" 24 | MAX_ITER: 30000 25 | STEPS: [] 26 | IMS_PER_BATCH: 16 27 | BASE_LR: 0.001 28 | WARMUP_FACTOR: 0.01 29 | WARMUP_ITERS: 1000 30 | CHECKPOINT_PERIOD: 5000 31 | DATASETS: 32 | TRAIN: ("coco_2017_train_unlabeled_densecl_r101",) 33 | TEST: ("coco_2017_val",) 34 | INPUT: 35 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 36 | MASK_FORMAT: "bitmask" 37 | FORMAT: "RGB" 38 | -------------------------------------------------------------------------------- /dataset/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # ------------------------------------------------------------------------ 5 | 6 | from pathlib import Path 7 | 8 | import torch 9 | import torch.utils.data 10 | 11 | from torch.utils.data import Dataset, ConcatDataset 12 | from .refexp2seq import build as build_seq_refexp 13 | from .ytvos import build as build_ytvs 14 | from dataset import ytvos 15 | 16 | 17 | 18 | def build(image_set, args): 19 | concat_data = [] 20 | 21 | print('preparing coco2seq dataset ....') 22 | coco_names = ["refcoco", "refcoco+", "refcocog"] 23 | for name in coco_names: 24 | coco_seq = build_seq_refexp(name, image_set, args) 25 | concat_data.append(coco_seq) 26 | 27 | print('preparing ytvos dataset .... ') 28 | ytvos_dataset = build_ytvs(image_set, args) 29 | concat_data.append(ytvos_dataset) 30 | 31 | concat_data = ConcatDataset(concat_data) 32 | 33 | return concat_data 34 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_base_patch4_window12_384.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 192 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [6, 12, 24, 48] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- 
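Note: the two res640 ADE20K Swin configs above and the one that follows build MIN_SIZE_TRAIN from a YAML eval tag rather than a literal list. Roughly, the tagged entry expands as below (plain Python; the exact values come from evaluating the quoted expression at config-load time):

    min_size_train = [int(x * 0.1 * 640) for x in range(5, 21)]
    # multiples of 64 from 320 up to 1280, i.e. 0.5x to 2.0x of the 640 crop size.
    # A plain yaml.safe_load rejects the !!python/object/apply:eval tag, so these files need a
    # loader that permits it (detectron2's config loader handles this when merging the file).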
/configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "D2SwinTransformer" 5 | SWIN: 6 | EMBED_DIM: 128 7 | DEPTHS: [2, 2, 18, 2] 8 | NUM_HEADS: [4, 8, 16, 32] 9 | WINDOW_SIZE: 12 10 | APE: False 11 | DROP_PATH_RATE: 0.3 12 | PATCH_NORM: True 13 | PRETRAIN_IMG_SIZE: 384 14 | WEIGHTS: "/home/vegetabot/Filesys/CodeField_win/Mask2Former/weights/model_final_7e47bf.pkl" 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | INPUT: 18 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 19 | MIN_SIZE_TRAIN_SAMPLING: "choice" 20 | MIN_SIZE_TEST: 640 21 | MAX_SIZE_TRAIN: 2560 22 | MAX_SIZE_TEST: 2560 23 | CROP: 24 | ENABLED: True 25 | TYPE: "absolute" 26 | SIZE: (640, 640) 27 | SINGLE_CATEGORY_MAX_AREA: 1.0 28 | COLOR_AUG_SSD: True 29 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 30 | FORMAT: "RGB" 31 | TEST: 32 | EVAL_PERIOD: 5000 33 | AUG: 34 | ENABLED: False 35 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 36 | MAX_SIZE: 4480 37 | FLIP: True 38 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train",) 18 | TEST: ("coco_2017_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_instance_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /demo/Logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import sys 4 | import errno 5 | def mkdir_if_missing(directory): 6 | if not osp.exists(directory): 7 | try: 8 | os.makedirs(directory) 9 | except OSError as e: 10 | if e.errno != errno.EEXIST: 11 | raise 12 | class Logger(object): 13 | """ 14 | Write console output to external text file. 15 | Code imported from https://github.com/Cysu/open-reid/blob/master/reid/utils/logging.py. 
16 | """ 17 | def __init__(self, fpath=None, mode = 'w'): 18 | self.console = sys.stdout 19 | self.file = None 20 | if fpath is not None: 21 | mkdir_if_missing(osp.dirname(fpath)) 22 | self.file = open(fpath, mode) 23 | 24 | def __del__(self): 25 | self.close() 26 | 27 | def __enter__(self): 28 | pass 29 | 30 | def __exit__(self, *args): 31 | self.close() 32 | 33 | def write(self, msg): 34 | self.console.write(msg) 35 | if self.file is not None: 36 | self.file.write(msg) 37 | 38 | def flush(self): 39 | self.console.flush() 40 | if self.file is not None: 41 | self.file.flush() 42 | os.fsync(self.file.fileno()) 43 | 44 | def close(self): 45 | self.console.close() 46 | if self.file is not None: 47 | self.file.close() -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("coco_2017_train_panoptic",) 18 | TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | STEPS: (327778, 355092) 23 | MAX_ITER: 368750 24 | WARMUP_FACTOR: 1.0 25 | WARMUP_ITERS: 10 26 | WEIGHT_DECAY: 0.05 27 | OPTIMIZER: "ADAMW" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | IMAGE_SIZE: 1024 38 | MIN_SCALE: 0.1 39 | MAX_SCALE: 2.0 40 | FORMAT: "RGB" 41 | DATASET_MAPPER_NAME: "coco_panoptic_lsj" 42 | TEST: 43 | EVAL_PERIOD: 5000 44 | DATALOADER: 45 | FILTER_EMPTY_ANNOTATIONS: True 46 | NUM_WORKERS: 4 47 | VERSION: 2 48 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2019_train",) 19 | TEST: ("ytvis_2019_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (4000,) 24 | MAX_ITER: 6000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | 
CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/Base-YouTubeVIS-VideoInstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | MASK_ON: True 9 | RESNETS: 10 | DEPTH: 50 11 | STEM_TYPE: "basic" # not used 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 15 | # NORM: "SyncBN" 16 | RES5_MULTI_GRID: [1, 1, 1] # not used 17 | DATASETS: 18 | TRAIN: ("ytvis_2021_train",) 19 | TEST: ("ytvis_2021_val",) 20 | SOLVER: 21 | IMS_PER_BATCH: 16 22 | BASE_LR: 0.0001 23 | STEPS: (5500,) 24 | MAX_ITER: 8000 25 | WARMUP_FACTOR: 1.0 26 | WARMUP_ITERS: 10 27 | WEIGHT_DECAY: 0.05 28 | OPTIMIZER: "ADAMW" 29 | BACKBONE_MULTIPLIER: 0.1 30 | CLIP_GRADIENTS: 31 | ENABLED: True 32 | CLIP_TYPE: "full_model" 33 | CLIP_VALUE: 0.01 34 | NORM_TYPE: 2.0 35 | AMP: 36 | ENABLED: True 37 | INPUT: 38 | MIN_SIZE_TRAIN_SAMPLING: "choice_by_clip" 39 | RANDOM_FLIP: "flip_by_clip" 40 | AUGMENTATIONS: [] 41 | MIN_SIZE_TRAIN: (360, 480) 42 | MIN_SIZE_TEST: 360 43 | CROP: 44 | ENABLED: False 45 | TYPE: "absolute_range" 46 | SIZE: (600, 720) 47 | FORMAT: "RGB" 48 | TEST: 49 | EVAL_PERIOD: 0 50 | DATALOADER: 51 | FILTER_EMPTY_ANNOTATIONS: False 52 | NUM_WORKERS: 4 53 | VERSION: 2 54 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor ms_deform_attn_cuda_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step); 26 | 27 | std::vector ms_deform_attn_cuda_backward( 28 | const at::Tensor &value, 29 | const at::Tensor &spatial_shapes, 30 | const at::Tensor &level_start_index, 31 | const at::Tensor &sampling_loc, 32 | const at::Tensor &attn_weight, 33 | const at::Tensor &grad_output, 34 | const int im2col_step); 35 | 36 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 
13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | #include 18 | 19 | at::Tensor 20 | ms_deform_attn_cpu_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step); 27 | 28 | std::vector 29 | ms_deform_attn_cpu_backward( 30 | const at::Tensor &value, 31 | const at::Tensor &spatial_shapes, 32 | const at::Tensor &level_start_index, 33 | const at::Tensor &sampling_loc, 34 | const at::Tensor &attn_weight, 35 | const at::Tensor &grad_output, 36 | const int im2col_step); 37 | 38 | 39 | -------------------------------------------------------------------------------- /configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 100 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | 
TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: 
Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/panoptic-segmentation/maskformer_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | 
COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data 2 | import torchvision 3 | 4 | from .ytvos import build as build_ytvos 5 | from .davis import build as build_davis 6 | from .a2d import build as build_a2d 7 | from .jhmdb import build as build_jhmdb 8 | from .refexp import build as build_refexp 9 | from .concat_dataset import build as build_joint 10 | 11 | 12 | def get_coco_api_from_dataset(dataset): 13 | for _ in range(10): 14 | # if isinstance(dataset, torchvision.datasets.CocoDetection): 15 | # break 16 | if isinstance(dataset, torch.utils.data.Subset): 17 | dataset = dataset.dataset 18 | if isinstance(dataset, torchvision.datasets.CocoDetection): 19 | return dataset.coco 20 | 21 | 22 | def build_dataset(dataset_file: str, image_set: str, args): 23 | if dataset_file == 'ytvos': 24 | return build_ytvos(image_set, args) 25 | if dataset_file == 'davis': 26 | return build_davis(image_set, args) 27 | if dataset_file == 'a2d': 28 | return build_a2d(image_set, args) 29 | if dataset_file == 'jhmdb': 30 | return build_jhmdb(image_set, args) 31 | # for pretraining 32 | if dataset_file == "refcoco" or dataset_file == "refcoco+" or dataset_file == "refcocog": 33 | return build_refexp(dataset_file, image_set, args) 34 | # for joint training of refcoco and ytvos 35 | if dataset_file == 'joint': 36 | return build_joint(image_set, args) 37 | raise ValueError(f'dataset {dataset_file} not supported') 38 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/semantic-segmentation/maskformer2_R50_bs16_300k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-MapillaryVistas-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 65 7 | NUM_CLASSES: 65 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | 
ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.0 45 | -------------------------------------------------------------------------------- /configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 133 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: True 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: True 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /configs/youtubevis_2019/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | 
OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /configs/youtubevis_2021/video_maskformer2_R50_bs16_8ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-YouTubeVIS-VideoInstanceSegmentation.yaml 2 | MODEL: 3 | WEIGHTS: "model_final_3c8ec9.pkl" 4 | META_ARCHITECTURE: "VideoMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "MaskFormerHead" 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 40 9 | LOSS_WEIGHT: 1.0 10 | CONVS_DIM: 256 11 | MASK_DIM: 256 12 | NORM: "GN" 13 | # pixel decoder 14 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 15 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 16 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 17 | COMMON_STRIDE: 4 18 | TRANSFORMER_ENC_LAYERS: 6 19 | MASK_FORMER: 20 | TRANSFORMER_DECODER_NAME: "VideoMultiScaleMaskedTransformerDecoder" 21 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 22 | DEEP_SUPERVISION: True 23 | NO_OBJECT_WEIGHT: 0.1 24 | CLASS_WEIGHT: 2.0 25 | MASK_WEIGHT: 5.0 26 | DICE_WEIGHT: 5.0 27 | HIDDEN_DIM: 256 28 | NUM_OBJECT_QUERIES: 100 29 | NHEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | ENC_LAYERS: 0 33 | PRE_NORM: False 34 | ENFORCE_INPUT_PROJ: False 35 | SIZE_DIVISIBILITY: 32 36 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 37 | TRAIN_NUM_POINTS: 12544 38 | OVERSAMPLE_RATIO: 3.0 39 | IMPORTANCE_SAMPLE_RATIO: 0.75 40 | TEST: 41 | SEMANTIC_ON: False 42 | INSTANCE_ON: True 43 | PANOPTIC_ON: False 44 | OVERLAP_THRESHOLD: 0.8 45 | OBJECT_MASK_THRESHOLD: 0.8 46 | -------------------------------------------------------------------------------- /models/helpers.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from typing import Any, Callable, Dict, Generator, Sequence 3 | 4 | 5 | def chunks(l: Sequence, n: int = 5) -> Generator[Sequence, None, None]: 6 | """Yield successive n-sized chunks from l.""" 7 | for i in range(0, len(l), n): 8 | yield l[i:i + n] 9 | 10 | class LRUCache: 11 | # initialising capacity 12 | def __init__(self, capacity: int): 13 | self.cache = OrderedDict() 14 | self.capacity = capacity 15 | 16 | def has(self, key) -> bool: 17 | return key in self.cache 18 | 19 | # we return the value of the key 20 | # that is queried in O(1) and return -1 if we 21 | # don't find the key in out dict / cache. 22 | # And also move the key to the end 23 | # to show that it was recently used. 24 | def get(self, key): 25 | if key not in self.cache: 26 | return None 27 | else: 28 | self.cache.move_to_end(key) 29 | return self.cache[key] 30 | 31 | # first, we add / update the key by conventional methods. 32 | # And also move the key to the end to show that it was recently used. 33 | # But here we will also check whether the length of our 34 | # ordered dictionary has exceeded our capacity, 35 | # If so we remove the first key (least recently used) 36 | def put(self, key, value) -> None: 37 | self.cache[key] = value 38 | self.cache.move_to_end(key) 39 | if len(self.cache) > self.capacity: 40 | self.cache.popitem(last=False) 41 | 42 | def pop(self, key, value): 43 | self.cache.pop(key, None) 44 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | 22 | at::Tensor 23 | ms_deform_attn_cpu_forward( 24 | const at::Tensor &value, 25 | const at::Tensor &spatial_shapes, 26 | const at::Tensor &level_start_index, 27 | const at::Tensor &sampling_loc, 28 | const at::Tensor &attn_weight, 29 | const int im2col_step) 30 | { 31 | AT_ERROR("Not implement on cpu"); 32 | } 33 | 34 | std::vector 35 | ms_deform_attn_cpu_backward( 36 | const at::Tensor &value, 37 | const at::Tensor &spatial_shapes, 38 | const at::Tensor &level_start_index, 39 | const at::Tensor &sampling_loc, 40 | const at::Tensor &attn_weight, 41 | const at::Tensor &grad_output, 42 | const int im2col_step) 43 | { 44 | AT_ERROR("Not implement on cpu"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/panoptic-segmentation/Base-MapillaryVistas-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_panoptic_train",) 18 | TEST: ("mapillary_vistas_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /configs/mapillary-vistas/semantic-segmentation/Base-MapillaryVistas-SemanticSegmentation.yaml: 
-------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("mapillary_vistas_sem_seg_train",) 18 | TEST: ("mapillary_vistas_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 300000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 2048 40 | MAX_SIZE_TRAIN: 8192 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (1024, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 1024 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 0 53 | DATALOADER: 54 | FILTER_EMPTY_ANNOTATIONS: True 55 | NUM_WORKERS: 10 56 | VERSION: 2 57 | -------------------------------------------------------------------------------- /configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_instance_train",) 18 | TEST: ("ade20k_instance_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | 
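A detail in the base configs above that is easy to misread: entries such as `MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]` use PyYAML's python tags, which the detectron2-style config loading these files are written for accepts; the tag simply evaluates the quoted list comprehension, so the setting is nothing more than a multi-scale list of training short sides. A plain-Python sketch of what the ADE20K instance/panoptic value expands to (the other base configs do the same with their own test size: 512, 1024 or 2048):

```python
# Equivalent of the !!python/object/apply:eval tag in the ADE20K base configs:
# training short sides from 0.5x to 2.0x of the 640 test size, in 0.1 steps.
min_size_train = [int(x * 0.1 * 640) for x in range(5, 21)]
print(min_size_train)  # [320, 384, 448, ..., 1216, 1280] -- 16 scales, 64 px apart
```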
-------------------------------------------------------------------------------- /configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_panoptic_train",) 18 | TEST: ("ade20k_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 640 40 | MAX_SIZE_TRAIN: 2560 41 | MAX_SIZE_TEST: 2560 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (640, 640) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 640 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [320, 480, 640, 800, 960, 1120] 56 | MAX_SIZE: 4480 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | 
EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST: ("cityscapes_fine_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_sem_seg_train",) 18 | TEST: ("cityscapes_fine_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 
| MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: True 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | -------------------------------------------------------------------------------- /models/vlp_model.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | # from Detic.predict import get_bbox_by_detic_rec, get_bbox_by_detic_rec_general 4 | from .helpers import LRUCache 5 | # from utils.cal_utils import cal_iou 6 | from util.stanza_utils import find_main_words, find_agent_by_stanza, find_no_main_words 7 | # from utils.map_to_coco_label import map_to_coco_label 8 | 9 | class VLPModel: 10 | MAX_CACHE = 20 11 | def __init__(self, model_id, device='cuda', templates = 'there is a {}', 12 | checkpoint_dir = '../checkpoints'): 13 | self._models = LRUCache(self.MAX_CACHE) 14 | self.model_id = model_id 15 | self.device = device 16 | self.templates = templates 17 | self.checkpoint_dir = checkpoint_dir 18 | 19 | def get_bbox_for_rec(self, image_path, category, threshold=0.15, unk=False, general=False): 20 | if general: 21 | return get_bbox_by_detic_rec_general(image_path, category, threshold) 22 | else: 23 | return 
get_bbox_by_detic_rec(image_path, category, threshold, unk=unk) 24 | 25 | def cal_iou(self, box1, box2): 26 | return cal_iou(box1, box2) 27 | 28 | def find_main_words(self, sent, start_idx, tokenizer): 29 | return find_main_words(sent, start_idx, tokenizer) 30 | 31 | def find_no_main_words(self, sent, start_idx, tokenizer): 32 | return find_no_main_words(sent, start_idx, tokenizer) 33 | 34 | def find_agent(self, sent): 35 | return find_agent_by_stanza(sent) 36 | 37 | def map_to_coco_label(self, agent): 38 | return map_to_coco_label([agent])[0] 39 | 40 | def get_results_for_rec( 41 | self, 42 | image_path: str, 43 | texts: List[str], 44 | gt_bbox: List[int], 45 | block_num: int = 8, 46 | category='', 47 | mapped_categories=None 48 | ): 49 | pass 50 | 51 | def cal_score( 52 | self, 53 | gradcam, 54 | gt_bbox, 55 | boxes_category, 56 | od_scores 57 | ): 58 | pass 59 | 60 | 61 | -------------------------------------------------------------------------------- /evaluator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | class Evaluator(): 5 | def __init__(self): 6 | self.counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} 7 | self.total_intersection_area = 0 8 | self.total_union_area = 0 9 | self.ious_list = [] 10 | pass 11 | def compute_mask_iou(self, outputs: torch.Tensor, labels: torch.Tensor, EPS=1e-6): 12 | assert outputs.shape[0] == 1; assert outputs.shape == labels.shape; assert len(outputs.shape) == 3 13 | outputs = outputs.int(); labels = labels.int() 14 | intersection = (outputs & labels).float().sum((1, 2)) # Will be zero if Truth=0 or Prediction=0 15 | union = (outputs | labels).float().sum((1, 2)) # Will be zero if both are 0 16 | iou = (intersection + EPS) / (union + EPS) # EPS is used to avoid division by zero 17 | 18 | iou, intersection, union = iou.item(), intersection.item(), union.item() 19 | 20 | for iou_threshold in self.counters_by_iou.keys(): 21 | if iou > iou_threshold: 22 | self.counters_by_iou[iou_threshold] += 1 23 | 24 | self.total_intersection_area += intersection 25 | self.total_union_area += union 26 | self.ious_list.append(iou) 27 | 28 | return iou, intersection, union 29 | 30 | def evaluate(self): 31 | num_samples = len(self.ious_list) 32 | 33 | if num_samples == 0: 34 | print("No samples to evaluate.") 35 | return 36 | 37 | precision_at_k = np.array(list(self.counters_by_iou.values())) / num_samples 38 | overall_iou = self.total_intersection_area / self.total_union_area 39 | mean_iou = np.mean(self.ious_list) 40 | 41 | print("Evaluation Result") 42 | iou_thresholds = [0.5, 0.6, 0.7, 0.8, 0.9] 43 | for iou, prec in zip(iou_thresholds, precision_at_k): 44 | print(f"IoU: {iou:.2f}, Precision: {prec:.4f}") 45 | print("========================================") 46 | print(f"Overall IoU: {overall_iou:.4f}") 47 | print(f"Mean IoU: {mean_iou:.4f}") -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | /*! 12 | * Copyright (c) Facebook, Inc. and its affiliates. 13 | * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 14 | */ 15 | 16 | #pragma once 17 | 18 | #include "cpu/ms_deform_attn_cpu.h" 19 | 20 | #ifdef WITH_CUDA 21 | #include "cuda/ms_deform_attn_cuda.h" 22 | #endif 23 | 24 | 25 | at::Tensor 26 | ms_deform_attn_forward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const int im2col_step) 33 | { 34 | if (value.type().is_cuda()) 35 | { 36 | #ifdef WITH_CUDA 37 | return ms_deform_attn_cuda_forward( 38 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | 46 | std::vector 47 | ms_deform_attn_backward( 48 | const at::Tensor &value, 49 | const at::Tensor &spatial_shapes, 50 | const at::Tensor &level_start_index, 51 | const at::Tensor &sampling_loc, 52 | const at::Tensor &attn_weight, 53 | const at::Tensor &grad_output, 54 | const int im2col_step) 55 | { 56 | if (value.type().is_cuda()) 57 | { 58 | #ifdef WITH_CUDA 59 | return ms_deform_attn_cuda_backward( 60 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 61 | #else 62 | AT_ERROR("Not compiled with GPU support"); 63 | #endif 64 | } 65 | AT_ERROR("Not implemented on the CPU"); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 
16 | """ 17 | 18 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 19 | super().__init__() 20 | self.num_pos_feats = num_pos_feats 21 | self.temperature = temperature 22 | self.normalize = normalize 23 | if scale is not None and normalize is False: 24 | raise ValueError("normalize should be True if scale is passed") 25 | if scale is None: 26 | scale = 2 * math.pi 27 | self.scale = scale 28 | 29 | def forward(self, x, mask=None): 30 | if mask is None: 31 | mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) 32 | not_mask = ~mask 33 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 34 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 35 | if self.normalize: 36 | eps = 1e-6 37 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 38 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 39 | 40 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 41 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 42 | 43 | pos_x = x_embed[:, :, :, None] / dim_t 44 | pos_y = y_embed[:, :, :, None] / dim_t 45 | pos_x = torch.stack( 46 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 47 | ).flatten(3) 48 | pos_y = torch.stack( 49 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 50 | ).flatten(3) 51 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 52 | return pos 53 | 54 | def __repr__(self, _repr_indent=4): 55 | head = "Positional encoding " + self.__class__.__name__ 56 | body = [ 57 | "num_pos_feats: {}".format(self.num_pos_feats), 58 | "temperature: {}".format(self.temperature), 59 | "normalize: {}".format(self.normalize), 60 | "scale: {}".format(self.scale), 61 | ] 62 | # _repr_indent = 4 63 | lines = [head] + [" " * _repr_indent + line for line in body] 64 | return "\n".join(lines) 65 | -------------------------------------------------------------------------------- /demo/postproc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import scipy.ndimage 3 | 4 | def mySig(x, a = 20, b = 0.5): 5 | sig_x = 1 / (1 + torch.exp(a*(-x+b))) 6 | sig_x = sig_x / sig_x.max() 7 | return x 8 | 9 | def post_process(bin_mask, gradcam, caption, select_idx, alpha = 1.00): 10 | """ 11 | 后处理操作 12 | 13 | Args: 14 | bin_mask (torch.Tensor): 候选mask tensor, shape=(1, 100, H, W) 15 | gradcam (torch.Tensor): GradCAM注意力图tensor, shape=(H, W) 16 | 17 | Returns: 18 | torch.Tensor: 最终mask, shape=(1, 1, H, W) 19 | """ 20 | B, N, H, W = bin_mask.size() 21 | 22 | device = bin_mask.device 23 | max_val = torch.max(gradcam) 24 | max_coord = torch.nonzero(gradcam >= max_val * alpha) 25 | # 先去做一个判断初筛,看最大的这个点的位置是否是白色像素,这样就能够有效的取出 26 | bin_mask_at_seed = torch.zeros((B,N), dtype=torch.bool).to(device) 27 | for coord in max_coord: 28 | max_x, max_y = coord[0].item(), coord[1].item() 29 | bin_mask_at_seed_tmp = bin_mask[:, :, max_x, max_y] 30 | bin_mask_at_seed = torch.logical_or(bin_mask_at_seed, bin_mask_at_seed_tmp != 0.0) 31 | 32 | # 检查每个候选mask是否在seed位置有白色像素 33 | 34 | # 初筛2 35 | bin_mask_tmp = bin_mask.squeeze(0) 36 | res = [] 37 | for mask in bin_mask_tmp: 38 | mask_np = mask.cpu().numpy() 39 | labeled_array, num_features = scipy.ndimage.label(mask_np) 40 | res.append(num_features<=12) 41 | res = torch.tensor(res).float().unsqueeze(0).to(device) 42 | bin_mask_at_seed = torch.logical_and(bin_mask_at_seed, res) 43 | 44 | bin_mask_at_seed = bin_mask_at_seed.float() 45 | # 对于初筛1,如果没有,将对应的mask置零, 然后后面进行初筛2 46 | 
bin_mask_tmp = bin_mask * bin_mask_at_seed.unsqueeze(-1).unsqueeze(-1) 47 | 48 | 49 | # gradcam = gradcam_reweight(caption, gradcam) 50 | 51 | # gradcam = torch.exp(gradcam) 52 | # 1. 计算每个候选mask和gradcam的点乘分数 53 | # nw = 1; gradcam = torch.exp(gradcam * nw) / torch.exp(torch.tensor(nw)) # 对gradcam重新加权一下 54 | # thresh = 0.5; 55 | # gradcam = (gradcam >= thresh).float() 56 | # gradcam = mySig(gradcam) 57 | scores = (bin_mask_tmp + bin_mask_tmp*gradcam.unsqueeze(0).unsqueeze(0)).sum((2, 3)) # (1, 100) 58 | 59 | w_area = bin_mask_tmp.sum((2,3)) + 1e-5 60 | 61 | scores /= w_area 62 | 63 | # 2. 选择分数最高的候选mask 64 | sorted_tensor, indices = torch.sort(scores,descending=True) 65 | for indice in indices[0]: 66 | if indice.item() in select_idx: continue 67 | max_idx = indice.item() 68 | break 69 | # max_score, max_idx = scores.view(200).max(0) 70 | print(max_idx) 71 | candidate_mask = bin_mask_tmp[0, max_idx] 72 | 73 | # 3. 找到分数最高点作为种子点, 进行区域提取 74 | 75 | 76 | # seed_point = (max_x, max_y) 77 | # final_mask = extract_region(candidate_mask, seed_point).unsqueeze(0).to(device) 78 | 79 | return candidate_mask, max_idx 80 | 81 | if __name__ == "__main__": 82 | bin_mask = torch.randn(1, 200, 480, 640) 83 | gradcam = torch.randn(480, 640) 84 | select_idx = [] 85 | caption = '' 86 | 87 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | import os 13 | import glob 14 | 15 | import torch 16 | 17 | from torch.utils.cpp_extension import CUDA_HOME 18 | from torch.utils.cpp_extension import CppExtension 19 | from torch.utils.cpp_extension import CUDAExtension 20 | 21 | from setuptools import find_packages 22 | from setuptools import setup 23 | 24 | requirements = ["torch", "torchvision"] 25 | 26 | def get_extensions(): 27 | this_dir = os.path.dirname(os.path.abspath(__file__)) 28 | extensions_dir = os.path.join(this_dir, "src") 29 | 30 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 31 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 32 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 33 | 34 | sources = main_file + source_cpu 35 | extension = CppExtension 36 | extra_compile_args = {"cxx": []} 37 | define_macros = [] 38 | 39 | # Force cuda since torch ask for a device, not if cuda is in fact available. 
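# (Descriptive comment added for clarity; not part of the original setup.py.) The branch
# below picks the build type: it switches to CUDAExtension, adds the src/cuda/*.cu sources
# and defines WITH_CUDA whenever FORCE_CUDA is set or torch.cuda.is_available() is True,
# provided CUDA_HOME points at a CUDA toolkit; otherwise it raises NotImplementedError
# naming whichever prerequisite (CUDA_HOME or a visible CUDA runtime) is missing, since
# the deformable-attention op has no CPU implementation in this repository.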
40 | if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: 41 | extension = CUDAExtension 42 | sources += source_cuda 43 | define_macros += [("WITH_CUDA", None)] 44 | extra_compile_args["nvcc"] = [ 45 | "-DCUDA_HAS_FP16=1", 46 | "-D__CUDA_NO_HALF_OPERATORS__", 47 | "-D__CUDA_NO_HALF_CONVERSIONS__", 48 | "-D__CUDA_NO_HALF2_OPERATORS__", 49 | ] 50 | else: 51 | if CUDA_HOME is None: 52 | raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') 53 | else: 54 | raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') 55 | 56 | sources = [os.path.join(extensions_dir, s) for s in sources] 57 | include_dirs = [extensions_dir] 58 | ext_modules = [ 59 | extension( 60 | "MultiScaleDeformableAttention", 61 | sources, 62 | include_dirs=include_dirs, 63 | define_macros=define_macros, 64 | extra_compile_args=extra_compile_args, 65 | ) 66 | ] 67 | return ext_modules 68 | 69 | setup( 70 | name="MultiScaleDeformableAttention", 71 | version="1.0", 72 | author="Weijie Su", 73 | url="https://github.com/fundamentalvision/Deformable-DETR", 74 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 75 | packages=find_packages(exclude=("configs", "tests",)), 76 | ext_modules=get_extensions(), 77 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 78 | ) 79 | -------------------------------------------------------------------------------- /dataset/refexp_eval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved 2 | import copy 3 | from collections import defaultdict 4 | from pathlib import Path 5 | 6 | import torch 7 | import torch.utils.data 8 | 9 | import util.misc as utils 10 | from util.box_ops import generalized_box_iou 11 | 12 | 13 | class RefExpEvaluator(object): 14 | def __init__(self, refexp_gt, iou_types, k=(1, 5, 10), thresh_iou=0.5): 15 | assert isinstance(k, (list, tuple)) 16 | refexp_gt = copy.deepcopy(refexp_gt) 17 | self.refexp_gt = refexp_gt 18 | self.iou_types = iou_types 19 | self.img_ids = self.refexp_gt.imgs.keys() 20 | self.predictions = {} 21 | self.k = k 22 | self.thresh_iou = thresh_iou 23 | 24 | def accumulate(self): 25 | pass 26 | 27 | def update(self, predictions): 28 | self.predictions.update(predictions) 29 | 30 | def synchronize_between_processes(self): 31 | all_predictions = utils.all_gather(self.predictions) 32 | merged_predictions = {} 33 | for p in all_predictions: 34 | merged_predictions.update(p) 35 | self.predictions = merged_predictions 36 | 37 | def summarize(self): 38 | if utils.is_main_process(): 39 | dataset2score = { 40 | "refcoco": {k: 0.0 for k in self.k}, 41 | "refcoco+": {k: 0.0 for k in self.k}, 42 | "refcocog": {k: 0.0 for k in self.k}, 43 | } 44 | dataset2count = {"refcoco": 0.0, "refcoco+": 0.0, "refcocog": 0.0} 45 | for image_id in self.img_ids: 46 | ann_ids = self.refexp_gt.getAnnIds(imgIds=image_id) 47 | assert len(ann_ids) == 1 48 | img_info = self.refexp_gt.loadImgs(image_id)[0] 49 | 50 | target = self.refexp_gt.loadAnns(ann_ids[0]) 51 | prediction = self.predictions[image_id] 52 | assert prediction is not None 53 | sorted_scores_boxes = sorted( 54 | zip(prediction["scores"].tolist(), prediction["boxes"].tolist()), reverse=True 55 | ) 56 | sorted_scores, sorted_boxes = zip(*sorted_scores_boxes) 57 | sorted_boxes = 
torch.cat([torch.as_tensor(x).view(1, 4) for x in sorted_boxes]) 58 | target_bbox = target[0]["bbox"] 59 | converted_bbox = [ 60 | target_bbox[0], 61 | target_bbox[1], 62 | target_bbox[2] + target_bbox[0], 63 | target_bbox[3] + target_bbox[1], 64 | ] 65 | giou = generalized_box_iou(sorted_boxes, torch.as_tensor(converted_bbox).view(-1, 4)) 66 | for k in self.k: 67 | if max(giou[:k]) >= self.thresh_iou: 68 | dataset2score[img_info["dataset_name"]][k] += 1.0 69 | dataset2count[img_info["dataset_name"]] += 1.0 70 | 71 | for key, value in dataset2score.items(): 72 | for k in self.k: 73 | try: 74 | value[k] /= dataset2count[key] 75 | except: 76 | pass 77 | results = {} 78 | for key, value in dataset2score.items(): 79 | results[key] = sorted([v for k, v in value.items()]) 80 | print(f" Dataset: {key} - Precision @ 1, 5, 10: {results[key]} \n") 81 | 82 | return results 83 | return None 84 | 85 | 86 | -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for bounding box manipulation and GIoU. 3 | """ 4 | import torch 5 | from torchvision.ops.boxes import box_area 6 | 7 | def clip_iou(boxes1,boxes2): 8 | area1 = box_area(boxes1) 9 | area2 = box_area(boxes2) 10 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) 11 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) 12 | wh = (rb - lt).clamp(min=0) 13 | inter = wh[:,0] * wh[:,1] 14 | union = area1 + area2 - inter 15 | iou = (inter + 1e-6) / (union+1e-6) 16 | return iou 17 | 18 | def multi_iou(boxes1, boxes2): 19 | lt = torch.max(boxes1[...,:2], boxes2[...,:2]) 20 | rb = torch.min(boxes1[...,2:], boxes2[...,2:]) 21 | wh = (rb - lt).clamp(min=0) 22 | wh_1 = boxes1[...,2:] - boxes1[...,:2] 23 | wh_2 = boxes2[...,2:] - boxes2[...,:2] 24 | inter = wh[...,0] * wh[...,1] 25 | union = wh_1[...,0] * wh_1[...,1] + wh_2[...,0] * wh_2[...,1] - inter 26 | iou = (inter + 1e-6) / (union + 1e-6) 27 | return iou 28 | 29 | def box_cxcywh_to_xyxy(x): 30 | x_c, y_c, w, h = x.unbind(-1) 31 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 32 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 33 | return torch.stack(b, dim=-1) 34 | 35 | 36 | def box_xyxy_to_cxcywh(x): 37 | x0, y0, x1, y1 = x.unbind(-1) 38 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 39 | (x1 - x0), (y1 - y0)] 40 | return torch.stack(b, dim=-1) 41 | 42 | 43 | # modified from torchvision to also return the union 44 | def box_iou(boxes1, boxes2): 45 | area1 = box_area(boxes1) 46 | area2 = box_area(boxes2) 47 | 48 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 49 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 50 | 51 | wh = (rb - lt).clamp(min=0) # [N,M,2] 52 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 53 | 54 | union = area1[:, None] + area2 - inter 55 | 56 | iou = (inter+1e-6) / (union+1e-6) 57 | return iou, union 58 | 59 | 60 | def generalized_box_iou(boxes1, boxes2): 61 | """ 62 | Generalized IoU from https://giou.stanford.edu/ 63 | 64 | The boxes should be in [x0, y0, x1, y1] format 65 | 66 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 67 | and M = len(boxes2) 68 | """ 69 | # degenerate boxes gives inf / nan results 70 | # so do an early check 71 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 72 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 73 | iou, union = box_iou(boxes1, boxes2) 74 | 75 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 76 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 77 | 78 | wh = (rb - lt).clamp(min=0) # [N,M,2] 79 | 
area = wh[:, :, 0] * wh[:, :, 1] 80 | 81 | return iou - ((area - union) + 1e-6) / (area + 1e-6) 82 | 83 | 84 | def masks_to_boxes(masks): 85 | """Compute the bounding boxes around the provided masks 86 | 87 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 88 | 89 | Returns a [N, 4] tensors, with the boxes in xyxy format 90 | """ 91 | if masks.numel() == 0: 92 | return torch.zeros((0, 4), device=masks.device) 93 | 94 | h, w = masks.shape[-2:] 95 | 96 | y = torch.arange(0, h, dtype=torch.float) 97 | x = torch.arange(0, w, dtype=torch.float) 98 | y, x = torch.meshgrid(y, x) 99 | 100 | x_mask = (masks * x.unsqueeze(0)) 101 | x_max = x_mask.flatten(1).max(-1)[0] 102 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 103 | 104 | y_mask = (masks * y.unsqueeze(0)) 105 | y_max = y_mask.flatten(1).max(-1)[0] 106 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 107 | 108 | return torch.stack([x_min, y_min, x_max, y_max], 1) 109 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch.autograd import Function 19 | from torch.autograd.function import once_differentiable 20 | 21 | try: 22 | import MultiScaleDeformableAttention as MSDA 23 | except ModuleNotFoundError as e: 24 | info_string = ( 25 | "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" 26 | "\t`cd mask2former/modeling/pixel_decoder/ops`\n" 27 | "\t`sh make.sh`\n" 28 | ) 29 | raise ModuleNotFoundError(info_string) 30 | 31 | 32 | class MSDeformAttnFunction(Function): 33 | @staticmethod 34 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 35 | ctx.im2col_step = im2col_step 36 | output = MSDA.ms_deform_attn_forward( 37 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 38 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 39 | return output 40 | 41 | @staticmethod 42 | @once_differentiable 43 | def backward(ctx, grad_output): 44 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 45 | grad_value, grad_sampling_loc, grad_attn_weight = \ 46 | MSDA.ms_deform_attn_backward( 47 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 48 | 49 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 50 | 51 | 52 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): 53 | # for debug and test only, 54 | # need to use cuda version instead 55 | N_, S_, M_, D_ = value.shape 56 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 57 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 58 | sampling_grids = 2 * sampling_locations - 1 59 | sampling_value_list = [] 60 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 61 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 62 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 63 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 64 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 65 | # N_*M_, D_, Lq_, P_ 66 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 67 | mode='bilinear', padding_mode='zeros', align_corners=False) 68 | sampling_value_list.append(sampling_value_l_) 69 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 70 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 71 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 72 | return output.transpose(1, 2).contiguous() 73 | -------------------------------------------------------------------------------- /dataset/categories.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------------------------- 2 | # 1. 
Ref-Youtube-VOS 3 | ytvos_category_dict = { 4 | 'airplane': 0, 'ape': 1, 'bear': 2, 'bike': 3, 'bird': 4, 'boat': 5, 'bucket': 6, 'bus': 7, 'camel': 8, 'cat': 9, 5 | 'cow': 10, 'crocodile': 11, 'deer': 12, 'dog': 13, 'dolphin': 14, 'duck': 15, 'eagle': 16, 'earless_seal': 17, 6 | 'elephant': 18, 'fish': 19, 'fox': 20, 'frisbee': 21, 'frog': 22, 'giant_panda': 23, 'giraffe': 24, 'hand': 25, 7 | 'hat': 26, 'hedgehog': 27, 'horse': 28, 'knife': 29, 'leopard': 30, 'lion': 31, 'lizard': 32, 'monkey': 33, 8 | 'motorbike': 34, 'mouse': 35, 'others': 36, 'owl': 37, 'paddle': 38, 'parachute': 39, 'parrot': 40, 'penguin': 41, 9 | 'person': 42, 'plant': 43, 'rabbit': 44, 'raccoon': 45, 'sedan': 46, 'shark': 47, 'sheep': 48, 'sign': 49, 10 | 'skateboard': 50, 'snail': 51, 'snake': 52, 'snowboard': 53, 'squirrel': 54, 'surfboard': 55, 'tennis_racket': 56, 11 | 'tiger': 57, 'toilet': 58, 'train': 59, 'truck': 60, 'turtle': 61, 'umbrella': 62, 'whale': 63, 'zebra': 64 12 | } 13 | 14 | ytvos_category_list = [ 15 | 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bucket', 'bus', 'camel', 'cat', 'cow', 'crocodile', 16 | 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frisbee', 'frog', 17 | 'giant_panda', 'giraffe', 'hand', 'hat', 'hedgehog', 'horse', 'knife', 'leopard', 'lion', 'lizard', 18 | 'monkey', 'motorbike', 'mouse', 'others', 'owl', 'paddle', 'parachute', 'parrot', 'penguin', 'person', 19 | 'plant', 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'sign', 'skateboard', 'snail', 'snake', 'snowboard', 20 | 'squirrel', 'surfboard', 'tennis_racket', 'tiger', 'toilet', 'train', 'truck', 'turtle', 'umbrella', 'whale', 'zebra' 21 | ] 22 | 23 | # ------------------------------------------------------------------------------------------------------------------- 24 | # 2. 
Ref-DAVIS17 25 | davis_category_dict = { 26 | 'airplane': 0, 'backpack': 1, 'ball': 2, 'bear': 3, 'bicycle': 4, 'bird': 5, 'boat': 6, 'bottle': 7, 'box': 8, 'bus': 9, 27 | 'camel': 10, 'car': 11, 'carriage': 12, 'cat': 13, 'cellphone': 14, 'chamaleon': 15, 'cow': 16, 'deer': 17, 'dog': 18, 28 | 'dolphin': 19, 'drone': 20, 'elephant': 21, 'excavator': 22, 'fish': 23, 'goat': 24, 'golf cart': 25, 'golf club': 26, 29 | 'grass': 27, 'guitar': 28, 'gun': 29, 'helicopter': 30, 'horse': 31, 'hoverboard': 32, 'kart': 33, 'key': 34, 'kite': 35, 30 | 'koala': 36, 'leash': 37, 'lion': 38, 'lock': 39, 'mask': 40, 'microphone': 41, 'monkey': 42, 'motorcycle': 43, 'oar': 44, 31 | 'paper': 45, 'paraglide': 46, 'person': 47, 'pig': 48, 'pole': 49, 'potted plant': 50, 'puck': 51, 'rack': 52, 'rhino': 53, 32 | 'rope': 54, 'sail': 55, 'scale': 56, 'scooter': 57, 'selfie stick': 58, 'sheep': 59, 'skateboard': 60, 'ski': 61, 'ski poles': 62, 33 | 'snake': 63, 'snowboard': 64, 'stick': 65, 'stroller': 66, 'surfboard': 67, 'swing': 68, 'tennis racket': 69, 'tractor': 70, 34 | 'trailer': 71, 'train': 72, 'truck': 73, 'turtle': 74, 'varanus': 75, 'violin': 76, 'wheelchair': 77 35 | } 36 | 37 | davis_category_list = [ 38 | 'airplane', 'backpack', 'ball', 'bear', 'bicycle', 'bird', 'boat', 'bottle', 'box', 'bus', 'camel', 'car', 'carriage', 39 | 'cat', 'cellphone', 'chamaleon', 'cow', 'deer', 'dog', 'dolphin', 'drone', 'elephant', 'excavator', 'fish', 'goat', 40 | 'golf cart', 'golf club', 'grass', 'guitar', 'gun', 'helicopter', 'horse', 'hoverboard', 'kart', 'key', 'kite', 'koala', 41 | 'leash', 'lion', 'lock', 'mask', 'microphone', 'monkey', 'motorcycle', 'oar', 'paper', 'paraglide', 'person', 'pig', 42 | 'pole', 'potted plant', 'puck', 'rack', 'rhino', 'rope', 'sail', 'scale', 'scooter', 'selfie stick', 'sheep', 'skateboard', 43 | 'ski', 'ski poles', 'snake', 'snowboard', 'stick', 'stroller', 'surfboard', 'swing', 'tennis racket', 'tractor', 'trailer', 44 | 'train', 'truck', 'turtle', 'varanus', 'violin', 'wheelchair' 45 | ] -------------------------------------------------------------------------------- /mask2former/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | from itertools import count 5 | 6 | import numpy as np 7 | import torch 8 | from fvcore.transforms import HFlipTransform 9 | from torch import nn 10 | from torch.nn.parallel import DistributedDataParallel 11 | 12 | from detectron2.data.detection_utils import read_image 13 | from detectron2.modeling import DatasetMapperTTA 14 | 15 | 16 | __all__ = [ 17 | "SemanticSegmentorWithTTA", 18 | ] 19 | 20 | 21 | class SemanticSegmentorWithTTA(nn.Module): 22 | """ 23 | A SemanticSegmentor with test-time augmentation enabled. 24 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 25 | """ 26 | 27 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 28 | """ 29 | Args: 30 | cfg (CfgNode): 31 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 32 | tta_mapper (callable): takes a dataset dict and returns a list of 33 | augmented versions of the dataset dict. Defaults to 34 | `DatasetMapperTTA(cfg)`. 35 | batch_size (int): batch the augmented images into this batch size for inference. 
36 | """ 37 | super().__init__() 38 | if isinstance(model, DistributedDataParallel): 39 | model = model.module 40 | self.cfg = cfg.clone() 41 | 42 | self.model = model 43 | 44 | if tta_mapper is None: 45 | tta_mapper = DatasetMapperTTA(cfg) 46 | self.tta_mapper = tta_mapper 47 | self.batch_size = batch_size 48 | 49 | def __call__(self, batched_inputs): 50 | """ 51 | Same input/output format as :meth:`SemanticSegmentor.forward` 52 | """ 53 | 54 | def _maybe_read_image(dataset_dict): 55 | ret = copy.copy(dataset_dict) 56 | if "image" not in ret: 57 | image = read_image(ret.pop("file_name"), self.model.input_format) 58 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 59 | ret["image"] = image 60 | if "height" not in ret and "width" not in ret: 61 | ret["height"] = image.shape[1] 62 | ret["width"] = image.shape[2] 63 | return ret 64 | 65 | processed_results = [] 66 | for x in batched_inputs: 67 | result = self._inference_one_image(_maybe_read_image(x)) 68 | processed_results.append(result) 69 | return processed_results 70 | 71 | def _inference_one_image(self, input): 72 | """ 73 | Args: 74 | input (dict): one dataset dict with "image" field being a CHW tensor 75 | Returns: 76 | dict: one output dict 77 | """ 78 | orig_shape = (input["height"], input["width"]) 79 | augmented_inputs, tfms = self._get_augmented_inputs(input) 80 | 81 | final_predictions = None 82 | count_predictions = 0 83 | for input, tfm in zip(augmented_inputs, tfms): 84 | count_predictions += 1 85 | with torch.no_grad(): 86 | if final_predictions is None: 87 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 88 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 89 | else: 90 | final_predictions = self.model([input])[0].pop("sem_seg") 91 | else: 92 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 93 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 94 | else: 95 | final_predictions += self.model([input])[0].pop("sem_seg") 96 | 97 | final_predictions = final_predictions / count_predictions 98 | return {"sem_seg": final_predictions} 99 | 100 | def _get_augmented_inputs(self, input): 101 | augmented_inputs = self.tta_mapper(input) 102 | tfms = [x.pop("transforms") for x in augmented_inputs] 103 | return augmented_inputs, tfms 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # PyPI configuration file 174 | .pypirc -------------------------------------------------------------------------------- /mask2former/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 
10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import time 17 | import torch 18 | import torch.nn as nn 19 | from torch.autograd import gradcheck 20 | 21 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | N, M, D = 1, 2, 2 25 | Lq, L, P = 2, 2, 2 26 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 27 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 28 | S = sum([(H*W).item() for H, W in shapes]) 29 | 30 | 31 | torch.manual_seed(3) 32 | 33 | 34 | @torch.no_grad() 35 | def check_forward_equal_with_pytorch_double(): 36 | value = torch.rand(N, S, M, D).cuda() * 0.01 37 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 38 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 39 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 40 | im2col_step = 2 41 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 42 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 43 | fwdok = torch.allclose(output_cuda, output_pytorch) 44 | max_abs_err = (output_cuda - output_pytorch).abs().max() 45 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 46 | 47 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 48 | 49 | 50 | @torch.no_grad() 51 | def check_forward_equal_with_pytorch_float(): 52 | value = torch.rand(N, S, M, D).cuda() * 0.01 53 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 54 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 55 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 56 | im2col_step = 2 57 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 58 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 59 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 60 | max_abs_err = (output_cuda - output_pytorch).abs().max() 61 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 62 | 63 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 64 | 65 | 66 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 67 | 68 | value = torch.rand(N, S, M, channels).cuda() * 0.01 69 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 70 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 71 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 72 | im2col_step = 2 73 | func = MSDeformAttnFunction.apply 74 | 75 | value.requires_grad = grad_value 76 | sampling_locations.requires_grad = grad_sampling_loc 77 | attention_weights.requires_grad = grad_attn_weight 78 | 79 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 80 | 81 | print(f'* 
{gradok} check_gradient_numerical(D={channels})') 82 | 83 | 84 | if __name__ == '__main__': 85 | check_forward_equal_with_pytorch_double() 86 | check_forward_equal_with_pytorch_float() 87 | 88 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 89 | check_gradient_numerical(channels, True, True, True) 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /dataset/image_to_seq_augmenter.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from SeqFormer (https://github.com/wjf5203/SeqFormer) 3 | # ------------------------------------------------------------------------ 4 | # Modified from STEm-Seg (https://github.com/sabarim/STEm-Seg) 5 | # ------------------------------------------------------------------------ 6 | 7 | 8 | import imgaug 9 | import imgaug.augmenters as iaa 10 | import numpy as np 11 | 12 | from datetime import datetime 13 | 14 | from imgaug.augmentables.segmaps import SegmentationMapsOnImage 15 | from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage 16 | 17 | 18 | class ImageToSeqAugmenter(object): 19 | def __init__(self, perspective=True, affine=True, motion_blur=True, 20 | brightness_range=(-50, 50), hue_saturation_range=(-15, 15), perspective_magnitude=0.12, 21 | scale_range=1.0, translate_range={"x": (-0.15, 0.15), "y": (-0.15, 0.15)}, rotation_range=(-20, 20), 22 | motion_blur_kernel_sizes=(7, 9), motion_blur_prob=0.5): 23 | 24 | self.basic_augmenter = iaa.SomeOf((1, None), [ 25 | iaa.Add(brightness_range), 26 | iaa.AddToHueAndSaturation(hue_saturation_range) 27 | ] 28 | ) 29 | 30 | transforms = [] 31 | if perspective: 32 | transforms.append(iaa.PerspectiveTransform(perspective_magnitude)) 33 | if affine: 34 | transforms.append(iaa.Affine(scale=scale_range, 35 | translate_percent=translate_range, 36 | rotate=rotation_range, 37 | order=1, # cv2.INTER_LINEAR 38 | backend='auto')) 39 | transforms = iaa.Sequential(transforms) 40 | transforms = [transforms] 41 | 42 | if motion_blur: 43 | blur = iaa.Sometimes(motion_blur_prob, iaa.OneOf( 44 | [ 45 | iaa.MotionBlur(ksize) 46 | for ksize in motion_blur_kernel_sizes 47 | ] 48 | )) 49 | transforms.append(blur) 50 | 51 | self.frame_shift_augmenter = iaa.Sequential(transforms) 52 | 53 | @staticmethod 54 | def condense_masks(instance_masks): 55 | condensed_mask = np.zeros_like(instance_masks[0], dtype=np.int8) 56 | for instance_id, mask in enumerate(instance_masks, 1): 57 | condensed_mask = np.where(mask, instance_id, condensed_mask) 58 | 59 | return condensed_mask 60 | 61 | @staticmethod 62 | def expand_masks(condensed_mask, num_instances): 63 | return [(condensed_mask == instance_id).astype(np.uint8) for instance_id in range(1, num_instances + 1)] 64 | 65 | def __call__(self, image, masks=None, boxes=None): 66 | det_augmenter = self.frame_shift_augmenter.to_deterministic() 67 | 68 | 69 | if masks is not None: 70 | masks_np, is_binary_mask = [], [] 71 | boxs_np = [] 72 | 73 | for mask in masks: 74 | 75 | if isinstance(mask, np.ndarray): 76 | masks_np.append(mask.astype(np.bool)) 77 | is_binary_mask.append(False) 78 | else: 79 | raise ValueError("Invalid mask type: {}".format(type(mask))) 80 | 81 | num_instances = len(masks_np) 82 | masks_np = SegmentationMapsOnImage(self.condense_masks(masks_np), shape=image.shape[:2]) 83 | # boxs_np = BoundingBoxesOnImage(boxs_np, shape=image.shape[:2]) 84 | 85 | seed = 
int(datetime.now().strftime('%M%S%f')[-8:]) 86 | imgaug.seed(seed) 87 | aug_image, aug_masks = det_augmenter(image=self.basic_augmenter(image=image) , segmentation_maps=masks_np) 88 | imgaug.seed(seed) 89 | invalid_pts_mask = det_augmenter(image=np.ones(image.shape[:2] + (1,), np.uint8)).squeeze(2) 90 | aug_masks = self.expand_masks(aug_masks.get_arr(), num_instances) 91 | # aug_boxes = aug_boxes.remove_out_of_image().clip_out_of_image() 92 | aug_masks = [mask for mask, is_bm in zip(aug_masks, is_binary_mask)] 93 | return aug_image, aug_masks #, aug_boxes.to_xyxy_array() 94 | 95 | else: 96 | masks = [SegmentationMapsOnImage(np.ones(image.shape[:2], np.bool), shape=image.shape[:2])] 97 | aug_image, invalid_pts_mask = det_augmenter(image=image, segmentation_maps=masks) 98 | return aug_image, invalid_pts_mask.get_arr() == 0 99 | -------------------------------------------------------------------------------- /dataset/a2d_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains implementations for the precision@k and IoU (mean, overall) evaluation metrics. 3 | copy-paste from https://github.com/mttr2021/MTTR/blob/main/metrics.py 4 | """ 5 | import torch 6 | from tqdm import tqdm 7 | from pycocotools.coco import COCO 8 | from pycocotools.mask import decode 9 | import numpy as np 10 | 11 | from torchvision.ops.boxes import box_area 12 | 13 | def compute_bbox_iou(boxes1: torch.Tensor, boxes2: torch.Tensor): 14 | # both boxes: xyxy 15 | area1 = box_area(boxes1) 16 | area2 = box_area(boxes2) 17 | 18 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 19 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 20 | 21 | wh = (rb - lt).clamp(min=0) # [N,M,2] 22 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 23 | 24 | union = area1[:, None] + area2 - inter 25 | 26 | iou = (inter+1e-6) / (union+1e-6) 27 | return iou, inter, union 28 | 29 | def compute_mask_iou(outputs: torch.Tensor, labels: torch.Tensor, EPS=1e-6): 30 | outputs = outputs.int() 31 | intersection = (outputs & labels).float().sum((1, 2)) # Will be zero if Truth=0 or Prediction=0 32 | union = (outputs | labels).float().sum((1, 2)) # Will be zero if both are 0 33 | iou = (intersection + EPS) / (union + EPS) # EPS is used to avoid division by zero 34 | return iou, intersection, union 35 | 36 | # mask 37 | def calculate_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO): 38 | print('evaluating mask precision@k & iou metrics...') 39 | counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} 40 | total_intersection_area = 0 41 | total_union_area = 0 42 | ious_list = [] 43 | for instance in tqdm(coco_gt.imgs.keys()): # each image_id contains exactly one instance 44 | gt_annot = coco_gt.imgToAnns[instance][0] 45 | gt_mask = decode(gt_annot['segmentation']) 46 | pred_annots = coco_pred.imgToAnns[instance] 47 | pred_annot = sorted(pred_annots, key=lambda a: a['score'])[-1] # choose pred with highest score 48 | pred_mask = decode(pred_annot['segmentation']) 49 | iou, intersection, union = compute_mask_iou(torch.tensor(pred_mask).unsqueeze(0), 50 | torch.tensor(gt_mask).unsqueeze(0)) 51 | iou, intersection, union = iou.item(), intersection.item(), union.item() 52 | for iou_threshold in counters_by_iou.keys(): 53 | if iou > iou_threshold: 54 | counters_by_iou[iou_threshold] += 1 55 | total_intersection_area += intersection 56 | total_union_area += union 57 | ious_list.append(iou) 58 | num_samples = len(ious_list) 59 | precision_at_k = 
np.array(list(counters_by_iou.values())) / num_samples 60 | overall_iou = total_intersection_area / total_union_area 61 | mean_iou = np.mean(ious_list) 62 | return precision_at_k, overall_iou, mean_iou 63 | 64 | # bbox 65 | def calculate_bbox_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO): 66 | print('evaluating bbox precision@k & iou metrics...') 67 | counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]} 68 | total_intersection_area = 0 69 | total_union_area = 0 70 | ious_list = [] 71 | for instance in tqdm(coco_gt.imgs.keys()): # each image_id contains exactly one instance 72 | gt_annot = coco_gt.imgToAnns[instance][0] 73 | gt_bbox = gt_annot['bbox'] # xywh 74 | gt_bbox = [ 75 | gt_bbox[0], 76 | gt_bbox[1], 77 | gt_bbox[2] + gt_bbox[0], 78 | gt_bbox[3] + gt_bbox[1], 79 | ] 80 | pred_annots = coco_pred.imgToAnns[instance] 81 | pred_annot = sorted(pred_annots, key=lambda a: a['score'])[-1] # choose pred with highest score 82 | pred_bbox = pred_annot['bbox'] # xyxy 83 | iou, intersection, union = compute_bbox_iou(torch.tensor(pred_bbox).unsqueeze(0), 84 | torch.tensor(gt_bbox).unsqueeze(0)) 85 | iou, intersection, union = iou.item(), intersection.item(), union.item() 86 | for iou_threshold in counters_by_iou.keys(): 87 | if iou > iou_threshold: 88 | counters_by_iou[iou_threshold] += 1 89 | total_intersection_area += intersection 90 | total_union_area += union 91 | ious_list.append(iou) 92 | num_samples = len(ious_list) 93 | precision_at_k = np.array(list(counters_by_iou.values())) / num_samples 94 | overall_iou = total_intersection_area / total_union_area 95 | mean_iou = np.mean(ious_list) 96 | return precision_at_k, overall_iou, mean_iou 97 | -------------------------------------------------------------------------------- /mask2former/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | from detectron2.config import CfgNode as CN 4 | 5 | 6 | def add_maskformer2_config(cfg): 7 | """ 8 | Add config for MASK_FORMER. 9 | """ 10 | # NOTE: configs from original maskformer 11 | # data config 12 | # select the dataset mapper 13 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" 14 | # Color augmentation 15 | cfg.INPUT.COLOR_AUG_SSD = False 16 | # We retry random cropping until no single category in semantic segmentation GT occupies more 17 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 18 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 19 | # Pad image and segmentation GT in dataset mapper. 
20 | cfg.INPUT.SIZE_DIVISIBILITY = -1 21 | 22 | # solver config 23 | # weight decay on embedding 24 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 25 | # optimizer 26 | cfg.SOLVER.OPTIMIZER = "ADAMW" 27 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 28 | 29 | # mask_former model config 30 | cfg.MODEL.MASK_FORMER = CN() 31 | 32 | # loss 33 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True 34 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 35 | cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0 36 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 37 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 38 | 39 | # transformer config 40 | cfg.MODEL.MASK_FORMER.NHEADS = 8 41 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 42 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 43 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 44 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 45 | cfg.MODEL.MASK_FORMER.PRE_NORM = False 46 | 47 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 48 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 49 | 50 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" 51 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False 52 | 53 | # mask_former inference config 54 | cfg.MODEL.MASK_FORMER.TEST = CN() 55 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 56 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False 57 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False 58 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 59 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 60 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 61 | 62 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) 63 | # you can use this config to override 64 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 65 | 66 | # pixel decoder config 67 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 68 | # adding transformer in pixel decoder 69 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 70 | # pixel decoder 71 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" 72 | 73 | # swin transformer backbone 74 | cfg.MODEL.SWIN = CN() 75 | cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 76 | cfg.MODEL.SWIN.PATCH_SIZE = 4 77 | cfg.MODEL.SWIN.EMBED_DIM = 96 78 | cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] 79 | cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] 80 | cfg.MODEL.SWIN.WINDOW_SIZE = 7 81 | cfg.MODEL.SWIN.MLP_RATIO = 4.0 82 | cfg.MODEL.SWIN.QKV_BIAS = True 83 | cfg.MODEL.SWIN.QK_SCALE = None 84 | cfg.MODEL.SWIN.DROP_RATE = 0.0 85 | cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 86 | cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 87 | cfg.MODEL.SWIN.APE = False 88 | cfg.MODEL.SWIN.PATCH_NORM = True 89 | cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 90 | cfg.MODEL.SWIN.USE_CHECKPOINT = False 91 | 92 | # NOTE: maskformer2 extra configs 93 | # transformer module 94 | cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder" 95 | 96 | # LSJ aug 97 | cfg.INPUT.IMAGE_SIZE = 1024 98 | cfg.INPUT.MIN_SCALE = 0.1 99 | cfg.INPUT.MAX_SCALE = 2.0 100 | 101 | # MSDeformAttn encoder configs 102 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 103 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 104 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 105 | 106 | # point loss configs 107 | # Number of points sampled during training for a mask point head. 108 | cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 109 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 110 | # original paper. 
111 | cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0 112 | # Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in 113 | # the original paper. 114 | cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 115 | -------------------------------------------------------------------------------- /util/stanza_utils.py: -------------------------------------------------------------------------------- 1 | import stanza 2 | import torch 3 | import os 4 | # from predict import get_clip_embeddings 5 | from torch.nn import functional as F 6 | 7 | 8 | nlp = stanza.Pipeline('en', dir = "../checkpoints/pweight", download_method=None) 9 | 10 | MAIN_POS = ['NOUN', 'VERB', 'ADJ', 'PROPN', 'NUM'] 11 | 12 | def find_no_main_words(sent, start_idx, tokenizer): 13 | # id_map = {} 14 | # doc = nlp(sent) 15 | # cursor = start_idx + 1 16 | # main_words_ids = [] 17 | # for word in doc.sentences[0].words: 18 | # temp = cursor; cursor += 1 19 | # main_words_ids.append(word.id) 20 | # id_map[word.id] = list(range(temp, cursor)) 21 | # return main_words_ids, id_map 22 | 23 | doc = nlp(sent) 24 | id_map = {} 25 | cursor = start_idx + 1 26 | main_words_ids = [] 27 | sent_split = sent.split() 28 | cnt = 0; rword = ""; flag = False 29 | for word in doc.sentences[0].words: 30 | if word.text != sent_split[cnt]: 31 | if word.pos: flag = True 32 | rword += word.text 33 | if rword != sent_split[cnt]: 34 | continue 35 | else: 36 | word.text = rword 37 | rword = "" 38 | if flag: word.pos = MAIN_POS[0] 39 | flag = False 40 | cnt += 1 41 | temp = cursor 42 | cursor += len(tokenizer(word.text, return_tensors="pt").input_ids[0]) - 1 43 | id_map[word.id] = list(range(temp, cursor)) 44 | if word.pos: 45 | main_words_ids.append(word.id) 46 | return main_words_ids, id_map 47 | 48 | def find_main_words(sent, start_idx, tokenizer): 49 | doc = nlp(sent) 50 | id_map = {} 51 | cursor = start_idx + 1 52 | main_words_ids = [] 53 | sent_split = sent.split() 54 | cnt = 0; rword = ""; flag = False 55 | for word in doc.sentences[0].words: 56 | if word.text != sent_split[cnt]: # the current whitespace token was split into several words by stanza 57 | if word.pos in MAIN_POS: flag = True 58 | rword += word.text 59 | if rword != sent_split[cnt]: 60 | continue 61 | else: 62 | word.text = rword 63 | rword = "" 64 | if flag: word.pos = MAIN_POS[0] 65 | flag = False 66 | cnt += 1 67 | temp = cursor 68 | cursor += len(tokenizer(word.text, return_tensors="pt").input_ids[0]) - 1 69 | id_map[word.id] = list(range(temp, cursor)) 70 | if word.pos in MAIN_POS: 71 | main_words_ids.append(word.id) 72 | return main_words_ids, id_map 73 | 74 | def DFS_right(node, variables: list): 75 | if node.is_leaf() or variables[1] != '': 76 | return 77 | if node.label == 'NN': 78 | variables[1] = str(node.children[0]) 79 | return 80 | for i in range(len(node.children)): 81 | idx = len(node.children) - i - 1 82 | DFS_right(node.children[idx], variables) 83 | 84 | 85 | def DFS_left(node, variables: list): 86 | if node.is_leaf() or variables[1] != '': 87 | return 88 | if node.label == 'NP': 89 | variables[0] += 1 90 | now_find_np = variables[0] 91 | for child in node.children: 92 | DFS_left(child, variables) 93 | if node.label == 'NP' and variables[0] == now_find_np: 94 | # find rightmost NN 95 | DFS_right(node, variables) 96 | 97 | 98 | def find_agent_by_stanza(sent): 99 | doc = nlp(sent) 100 | ''' 101 | int and str arguments cannot be modified in place by a callee, while the contents of a list can, 102 | so to let DFS_left update these values we pack them into a list: 103 |
variables[0] means the number of occurrence of NP, while variables[1] records the agent 104 | ''' 105 | variables = [0, ''] 106 | DFS_left(doc.sentences[0].constituency, variables) 107 | agent = variables[1] 108 | 109 | if agent == '': 110 | for i in range(len(doc.sentences[0].words)): 111 | idx = len(doc.sentences[0].words) - i - 1 112 | if doc.sentences[0].words[idx].pos == 'NOUN': 113 | agent = doc.sentences[0].words[idx].text 114 | break 115 | # if agent == '': 116 | # for i in range(len(doc.sentences[0].words)): 117 | # if doc.sentences[0].words[i].deprel == 'root': 118 | # agent = doc.sentences[0].words[i].text 119 | # break 120 | if agent == '': 121 | agent = '[UNK]' 122 | agent_id = -1 123 | for word in doc.sentences[0].words: 124 | if agent == word.text: 125 | agent_id = word.id 126 | return agent, agent_id, len(doc.sentences[0].words) 127 | 128 | if __name__ == '__main__': 129 | text = "a red and white checkered table with two wooden chairs" 130 | print(find_agent_by_stanza(text)[0]) 131 | 132 | -------------------------------------------------------------------------------- /mask2former/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import contextlib 3 | import copy 4 | import io 5 | import itertools 6 | import json 7 | import logging 8 | import numpy as np 9 | import os 10 | import pickle 11 | from collections import OrderedDict 12 | import pycocotools.mask as mask_util 13 | import torch 14 | from pycocotools.coco import COCO 15 | from pycocotools.cocoeval import COCOeval 16 | from tabulate import tabulate 17 | 18 | import detectron2.utils.comm as comm 19 | from detectron2.config import CfgNode 20 | from detectron2.data import MetadataCatalog 21 | from detectron2.data.datasets.coco import convert_to_coco_json 22 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 23 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 24 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 25 | from detectron2.utils.file_io import PathManager 26 | from detectron2.utils.logger import create_small_table 27 | 28 | 29 | # modified from COCOEvaluator for instance segmetnat 30 | class InstanceSegEvaluator(COCOEvaluator): 31 | """ 32 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 33 | for keypoint detection outputs using COCO's metrics. 34 | See http://cocodataset.org/#detection-eval and 35 | http://cocodataset.org/#keypoints-eval to understand its metrics. 36 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 37 | the metric cannot be computed (e.g. due to no predictions made). 38 | 39 | In addition to COCO, this evaluator is able to support any bounding box detection, 40 | instance segmentation, or keypoint detection dataset. 41 | """ 42 | 43 | def _eval_predictions(self, predictions, img_ids=None): 44 | """ 45 | Evaluate predictions. Fill self._results with the metrics of the tasks. 
46 | """ 47 | self._logger.info("Preparing results for COCO format ...") 48 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 49 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 50 | 51 | # unmap the category ids for COCO 52 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 53 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 54 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 55 | # num_classes = len(all_contiguous_ids) 56 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 57 | 58 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 59 | for result in coco_results: 60 | category_id = result["category_id"] 61 | # assert category_id < num_classes, ( 62 | # f"A prediction has class={category_id}, " 63 | # f"but the dataset only has {num_classes} classes and " 64 | # f"predicted class id should be in [0, {num_classes - 1}]." 65 | # ) 66 | assert category_id in reverse_id_mapping, ( 67 | f"A prediction has class={category_id}, " 68 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 69 | ) 70 | result["category_id"] = reverse_id_mapping[category_id] 71 | 72 | if self._output_dir: 73 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 74 | self._logger.info("Saving results to {}".format(file_path)) 75 | with PathManager.open(file_path, "w") as f: 76 | f.write(json.dumps(coco_results)) 77 | f.flush() 78 | 79 | if not self._do_evaluation: 80 | self._logger.info("Annotations are not available for evaluation.") 81 | return 82 | 83 | self._logger.info( 84 | "Evaluating predictions with {} COCO API...".format( 85 | "unofficial" if self._use_fast_impl else "official" 86 | ) 87 | ) 88 | for task in sorted(tasks): 89 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 
90 | coco_eval = ( 91 | _evaluate_predictions_on_coco( 92 | self._coco_api, 93 | coco_results, 94 | task, 95 | kpt_oks_sigmas=self._kpt_oks_sigmas, 96 | use_fast_impl=self._use_fast_impl, 97 | img_ids=img_ids, 98 | max_dets_per_image=self._max_dets_per_image, 99 | ) 100 | if len(coco_results) > 0 101 | else None # cocoapi does not handle empty results very well 102 | ) 103 | 104 | res = self._derive_coco_results( 105 | coco_eval, task, class_names=self._metadata.get("thing_classes") 106 | ) 107 | self._results[task] = res 108 | -------------------------------------------------------------------------------- /demo/itergradcam.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import cv2 4 | import torch 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from skimage import transform as skimage_transform 8 | from scipy.ndimage import filters 9 | from PIL import Image 10 | import cv2 11 | 12 | def mySig(x, a = 20, b = 0.5): 13 | sig_x = 1 / (1 + torch.exp(a*(-x+b))) 14 | sig_x = sig_x / sig_x.max() 15 | return sig_x 16 | 17 | def getAttMap(img, attMap, blur = False, overlap = True): 18 | attMap -= attMap.min() 19 | if attMap.max() > 0: 20 | attMap /= attMap.max() 21 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order = 3, mode = 'constant') 22 | # attMap = skimage_transform.resize(attMap, (attMap.shape), order = 3, mode = 'constant') 23 | 24 | if blur: 25 | attMap = filters.gaussian_filter(attMap, 0.02*max(img.shape[:2])) 26 | attMap -= attMap.min() 27 | attMap /= attMap.max() 28 | cmap = plt.get_cmap('jet') 29 | attMapV = cmap(attMap) 30 | attMapV = np.delete(attMapV, 3, 2) 31 | if overlap: 32 | attMap = 1*(1-attMap**0.7).reshape(attMap.shape + (1,))*img + (attMap**0.7).reshape(attMap.shape+(1,)) * attMapV 33 | return attMap 34 | 35 | def show_groundvlp(image_path,query,gradcam,save_path = None): 36 | # category, 37 | # boxes_category, 38 | # od_scores 39 | num_image = 2 40 | fig, ax = plt.subplots(num_image, 1, figsize=(15, 5 * num_image)) 41 | 42 | bgr_image = cv2.imread(image_path) 43 | ax[0].imshow(bgr_image[:, :, ::-1]) 44 | ax[0].set_yticks([]) 45 | ax[0].set_xticks([]) 46 | ax[0].set_xlabel(query, fontsize=15) 47 | 48 | rgb_image = cv2.imread(image_path)[:, :, ::-1] 49 | rgb_image = np.float32(rgb_image) / 255 50 | 51 | gradcam_image = getAttMap(rgb_image, gradcam) 52 | np.clip(gradcam_image, 0., 1., out=gradcam_image) 53 | # print(np.max(gradcam_image)) 54 | ax[1].imshow(gradcam_image) 55 | ax[1].set_yticks([]) 56 | ax[1].set_xticks([]) 57 | ax[1].set_xlabel(query, fontsize=20) 58 | # plt.show() 59 | if save_path: 60 | plt.savefig(save_path) 61 | 62 | 63 | def IterGradCAM(engine,img_path,query,iter_num,path = './erase_imgs',vis = False,lmbda = 0.8): 64 | # Iterative机制 65 | gradcam = None 66 | if os.path.exists(path): 67 | shutil.rmtree(path) 68 | os.makedirs(path, exist_ok=True) 69 | 70 | itm_scores = [] 71 | pre_gradcam = None 72 | flag = True 73 | for i in range(iter_num): 74 | # gradcam, itm_score = engine.visualize_groundvlp(image_path=img_path,query=query,epoch=i) 75 | if i != 0: 76 | gradcam, itm_score = engine.visualize_groundvlp(image_path=img_path,query=query,epoch=i) 77 | # tmp = torch.exp(tmp) / torch.exp(torch.tensor(1.0)) 78 | # tmp = mySig(tmp) 79 | if grad_rev_sum.item()*itm_score.item() < itm_scores[-1]: 80 | # if itm_score.item() < itm_scores[-1]: 81 | flag = False 82 | break 83 | # gradcam = gradcam*lmbda + tmp*(1-lmbda) 84 | # gradcam = gradcam*(1-lmbda) + tmp*lmbda 85 
| # gradcam = gradcam + tmp 86 | # gradcam = gradcam.clamp(0.0, 1.0) 87 | # gradcam -= gradcam.min() 88 | # if gradcam.max() > 0: 89 | # gradcam /= gradcam.max() 90 | else: 91 | gradcam, itm_score = engine.visualize_groundvlp(image_path=img_path,query=query,epoch=i) 92 | # gradcam = torch.exp(gradcam) / torch.exp(torch.tensor(1.0)) 93 | gradcam = mySig(gradcam) 94 | Image.open(img_path).save(f"{path}/raw.jpg") 95 | # img_path = f"{path}/raw.jpg" 96 | pre_gradcam = torch.zeros_like(gradcam) 97 | 98 | 99 | grad_rev_sum = torch.sum(torch.ones_like(pre_gradcam)-pre_gradcam) / pre_gradcam.numel() 100 | 101 | itm_scores.append(itm_score.item() * grad_rev_sum.item()) 102 | print(f"Iteration {i}") 103 | 104 | pre_gradcam = gradcam 105 | 106 | if vis: show_groundvlp(f"{path}/raw.jpg",query,gradcam,f"{path}/{i}_grad.jpg") 107 | # show_groundvlp(f"{path}/raw.jpg",query,gradcam,f"{path}/result.jpg") 108 | if flag: i = iter_num  # no early stop was triggered: report the full iteration count 109 | return gradcam, i 110 | def seed_torch(seed): 111 | os.environ['PYTHONHASHSEED'] = str(seed) 112 | np.random.seed(seed) 113 | torch.manual_seed(seed) 114 | torch.cuda.manual_seed(seed) 115 | torch.backends.cudnn.benchmark = False 116 | torch.backends.cudnn.deterministic = True 117 | 118 | if __name__ == "__main__": 119 | # Set up the image path and the text query 120 | import sys 121 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 122 | from models.albef.engine import ALBEF 123 | seed = 1234 124 | seed_torch(seed) 125 | # torch.manual_seed(seed) 126 | # torch.cuda.manual_seed(seed) 127 | img_path = "/home/vegetabot/Filesys/CodeField_win/referformer_modify/data/coco/train2014/COCO_train2014_000000274266.jpg" 128 | query = "a man standing next to a young girl on a grassy hillside" # "a lady pouring wine in a glass" 129 | 130 | 131 | 132 | engine = ALBEF(model_id='ALBEF', device='cuda', templates='there is a {}') 133 | IterGradCAM(engine,img_path,query,3, vis = True) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # IteRPrimE: Zero-shot Referring Image Segmentation with Iterative Grad-CAM Refinement and Primary Word Emphasis 3 | 4 | ## 📝 Overview 5 | 6 | Architecture 7 | 8 | IteRPrimE is a novel framework for zero-shot referring image segmentation that leverages an Iterative Grad-CAM Refinement Strategy (IGRS) and a Primary Word Emphasis Module (PWEM) to improve localization accuracy—especially in cases with complex spatial descriptions. By using pre-trained vision-language models, IteRPrimE eliminates the need for further training or fine-tuning, achieving state-of-the-art performance on benchmarks such as RefCOCO, RefCOCO+, RefCOCOg, and PhraseCut. 9 | 10 | ## ✨ News 11 | 12 | - [2025.04.05]🔥 v1 of the code is now officially open-sourced! 🎉 13 | - [2024.12.10]🔥 IteRPrimE is accepted by AAAI-2025! 🥳 14 | 15 | ## 🛠 Installation 16 | 17 | 1. Create and activate a Conda environment: 18 | 19 | ``` 20 | conda create -n iterprime python=3.8.19 21 | conda activate iterprime 22 | ``` 23 | 24 | 2.
Install the dependencies: 25 | 26 | ``` 27 | pip install -r requirements.txt 28 | ``` 29 | 30 | If you encounter any issues with the environment setup, you can refer to the configurations in the following two open-source projects for assistance: 31 | 32 | - [GroundVLP](https://github.com/om-ai-lab/GroundVLP) 33 | 34 | - [Mask2Former](https://github.com/facebookresearch/Mask2Former) 35 | 36 | ## 🏋️ Preparing Pretrained Model Weights 37 | 38 | Before you can run the code, you'll need to prepare the pretrained model weights and dataset. Please follow the steps below: 39 | 40 | ### 1. Create the necessary directories 41 | 42 | In the root directory of the project, create two directories: `checkpoints` and `data`. 43 | 44 | ```bash 45 | mkdir checkpoints 46 | mkdir data 47 | ``` 48 | 49 | ### 2. Download the pretrained model weights 50 | 51 | Download the following pretrained model weights and place them in the `checkpoints` folder. You can download them directly from the given links. 52 | 53 | - **Mask2Former weights:** [model_final_f07440.pkl](https://pan.baidu.com/s/1phxTevS--Pe-3p611Pt75g) [pt6z] 54 | - **ALBEF weights:** [ALBEF.pth](https://pan.baidu.com/s/1GlO1LIjjvp_G5fb7z49VJQ) [8u5y] 55 | - **Stanza weights:** [pweight.zip](https://pan.baidu.com/s/1P1bHVgiRUGsm1bWzcCy5-w) [6f84] 56 | - **BERT-base-uncased weights:** [bert-base-uncased.zip](https://pan.baidu.com/s/1l9afrG0tgAHDArh6ON3ieA) [bggq] 57 | 58 | After downloading, extract the weight files (if in a compressed format) into the `checkpoints` folder. Your folder structure should look like this: 59 | 60 | ```shell 61 | / 62 | │ 63 | ├── checkpoints/ 64 | │ ├── model_final_f07440.pkl 65 | │ ├── ALBEF.pth 66 | │ ├── pweight/ 67 | │ └── bert-base-uncased/ 68 | └── data/ 69 | ``` 70 | 71 | ### 3. Download the COCO dataset 72 | 73 | Download the COCO dataset from the provided link and place it in the `data` folder. 74 | 75 | - **COCO dataset:** [coco.zip](https://pan.baidu.com/s/1MxNHMRmlkqLqsd9MWFWHpQ) [egus] 76 | 77 | After downloading, extract the coco.zip file to the `data` folder. The final directory structure should look like this: 78 | 79 | ```shell 80 | / 81 | │ 82 | ├── data/ 83 | │ └── coco/ 84 | └── checkpoints/ 85 | ``` 86 | 87 | Once the weights and dataset are in place, you’re ready to run the code and start testing the model. 88 | 89 | 90 | ## 🚀 Running the Demo 91 | 92 | To quickly verify your setup and see IteRPrimE in action, follow these steps: 93 | 94 | 1. **Navigate to the `demo` directory**: 95 | 96 | ```bash 97 | cd demo 98 | ``` 99 | 100 | 2. **Run the main script**: 101 | 102 | Run the following command to evaluate IteRPrimE on the **refcoco testA** split: 103 | 104 | ```bash 105 | python IteRPrimE.py --data-set refcoco --image-set testA 106 | ``` 107 | 108 | This script will: 109 | 110 | - Load the necessary configurations and pretrained model weights. 111 | 112 | - Perform zero-shot referring image segmentation on a sample image or dataset. 113 | 114 | - Output the segmentation results for you to inspect. 115 | 116 | 117 | 118 | Depending on your configuration, the output images or logs may be saved to a specific folder (e.g., `outputs/`). Check the script and your config settings for details. 119 | 120 | If you encounter any issues or have questions, feel free to open an issue or start a discussion.
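If you prefer to sanity-check a single image from Python rather than running the full evaluation, the snippet below mirrors the `__main__` block of `demo/itergradcam.py`. Treat it as a minimal sketch: the image path and query are placeholders to replace with your own, and the ALBEF engine arguments are taken directly from that script.

```python
# Minimal single-image sketch based on demo/itergradcam.py (run it from the demo/ directory).
import os
import sys

sys.path.insert(1, os.path.join(sys.path[0], '..'))

from models.albef.engine import ALBEF           # the VLP engine used by the demo
from itergradcam import IterGradCAM, seed_torch

seed_torch(1234)  # fix the random seeds for reproducibility

# Placeholder inputs: point these at your own image and referring expression.
img_path = "path/to/your/image.jpg"
query = "a man standing next to a young girl on a grassy hillside"

engine = ALBEF(model_id='ALBEF', device='cuda', templates='there is a {}')

# Run up to 3 refinement iterations; intermediate heatmaps go to ./erase_imgs when vis=True.
gradcam, num_iters = IterGradCAM(engine, img_path, query, iter_num=3, vis=True)
print(f"stopped after {num_iters} iterations, Grad-CAM map shape: {tuple(gradcam.shape)}")
```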
121 | 122 | 123 | 124 | 125 | ## 📖 Citation 126 | 127 | If you use IteRPrimE in your research, please cite our paper: 128 | 129 | ``` 130 | @article{wang2025iterprime, 131 | title={IteRPrimE: Zero-shot Referring Image Segmentation with Iterative Grad-CAM Refinement and Primary Word Emphasis}, 132 | author={Wang, Yuji and Ni, Jingchen and Liu, Yong and Yuan, Chun and Tang, Yansong}, 133 | journal={arXiv preprint arXiv:2503.00936}, 134 | year={2025} 135 | } 136 | ``` 137 | 138 | ## 🤝 Acknowledgements 139 | 140 | We appreciate the support from our collaborators and funding agencies. Stay tuned for updates; the full code release is coming soon! 141 | 142 | We would also like to express our gratitude to the authors of [GroundVLP](https://github.com/om-ai-lab/GroundVLP) and [Mask2Former](https://github.com/facebookresearch/Mask2Former) for their inspiring work and open-source contributions, which served as valuable references for this project. 143 | -------------------------------------------------------------------------------- /models/albef/VL_Transformer_ITM.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import numpy as np 4 | from matplotlib import pyplot as plt 5 | from scipy.ndimage import filters 6 | 7 | from models.albef.models.tokenization_bert import BertTokenizer 8 | from models.albef.models.vit import VisionTransformer 9 | from models.albef.models.xbert import BertConfig, BertModel 10 | from skimage import transform as skimage_transform 11 | 12 | import torch 13 | from torch import nn 14 | from torchvision import transforms 15 | import re 16 | from PIL import Image 17 | 18 | 19 | 20 | import math 21 | 22 | import torch.nn.functional as F 23 | 24 | # Spatial Pyramid Pooling (SPP) layer 25 | class SPPLayer(torch.nn.Module): 26 | 27 | def __init__(self, num_levels, pool_type='max_pool'): 28 | super(SPPLayer, self).__init__() 29 | 30 | self.num_levels = num_levels 31 | self.pool_type = pool_type 32 | 33 | def forward(self, x): 34 | num, n, c = x.size() # num: batch size, n: number of tokens, c: channel dimension 35 | 36 | h = 16 37 | w = 16 38 | # extract the leading [CLS] token 39 | cls_token = x[:, 0, :].view(num, 1, c) 40 | 41 | # apply SPP over the remaining image tokens 42 | x = x[:, 1:, :].view(num, c, h, w) 43 | original_num_tokens = 256 44 | 45 | for i in range(self.num_levels): 46 | level = i + 1 47 | kernel_size = (math.ceil(h / level), math.ceil(w / level)) 48 | stride = (math.ceil(h / level), math.ceil(w / level)) 49 | pooling = (math.floor((kernel_size[0]*level-h+1)/2), math.floor((kernel_size[1]*level-w+1)/2)) 50 | 51 | # choose the pooling type 52 | if self.pool_type == 'max_pool': 53 | tensor = F.max_pool2d(x, kernel_size=kernel_size, stride=stride, padding=pooling) 54 | else: 55 | tensor = F.avg_pool2d(x, kernel_size=kernel_size, stride=stride, padding=pooling) 56 | 57 | # flatten and concatenate the pooled tokens 58 | if (i == 0): 59 | x_flatten = tensor.view(num, -1, c) 60 | else: 61 | x_flatten = torch.cat((x_flatten, tensor.view(num, -1, c)), 1) 62 | 63 | # concatenate the original [CLS] token back (disabled) 64 | # x_flatten = torch.cat((cls_token, x_flatten), 1) 65 | 66 | # replicate the [CLS] token so the total token count matches the original 257 67 | cls_tokens = cls_token.expand(-1, original_num_tokens-x_flatten.shape[1]+2, -1) 68 | x_flatten = torch.cat((cls_tokens, x_flatten[:, 1:, :]), 1) 69 | 70 | 71 | return x_flatten 72 | 73 | 74 | class VL_Transformer_ITM(nn.Module): 75 | def __init__(self, 76 | text_encoder=None, 77 | config_bert='' 78 | ): 79 | super().__init__() 80 | 81 | bert_config = BertConfig.from_json_file(config_bert) 82 |
self.visual_encoder = VisionTransformer( 84 | img_size=256, patch_size=16, embed_dim=768, depth=12, num_heads=12, 85 | mlp_ratio=4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6)) 86 | 87 | self.text_encoder = BertModel.from_pretrained(text_encoder, config=bert_config, add_pooling_layer=False) 88 | 89 | self.itm_head = nn.Linear(768, 2) 90 | 91 | self.spp = SPPLayer(6) 92 | 93 | def forward(self, image, text, gradcam=None): 94 | image_embeds = self.visual_encoder(image) 95 | # experiment: see what happens if fine-grained (SPP) features are injected here 96 | # image_embeds = image_embeds.reshape(1,-1,16,16) 97 | # image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) 98 | # image_embeds = self.spp(image_embeds) 99 | # when a Grad-CAM map is provided, use it as the attention mask over the image tokens (the [CLS] slot stays on) 100 | if gradcam is None: image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) 101 | else: 102 | B = image_embeds.shape[0] 103 | ones = torch.ones(B, 1, dtype=torch.long, device=image.device) 104 | image_atts = torch.cat((ones, gradcam.to(image.device)), dim=1) 105 | 106 | output = self.text_encoder(text.input_ids.to(image.device), 107 | attention_mask=text.attention_mask.to(image.device), 108 | encoder_hidden_states=image_embeds, 109 | encoder_attention_mask=image_atts.to(image.device), 110 | return_dict=True, 111 | ) 112 | vl_embeddings = output.last_hidden_state[:, 0, :] 113 | 114 | # print(vl_embeddings) 115 | vl_output = self.itm_head(vl_embeddings) 116 | # _,pred = torch.max(vl_output,1) 117 | # print(type(vl_output)) 118 | # return image_embeds, output.last_hidden_state, vl_output 119 | return vl_output 120 | 121 | 122 | 123 | def getAttMap(img, attMap, blur = True, overlap = True): 124 | attMap -= attMap.min() 125 | if attMap.max() > 0: 126 | attMap /= attMap.max() 127 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order = 3, mode = 'constant') 128 | # attMap = skimage_transform.resize(attMap, (attMap.shape), order = 3, mode = 'constant') 129 | 130 | if blur: 131 | attMap = filters.gaussian_filter(attMap, 0.02*max(img.shape[:2])) 132 | attMap -= attMap.min() 133 | attMap /= attMap.max() 134 | cmap = plt.get_cmap('jet') 135 | attMapV = cmap(attMap) 136 | attMapV = np.delete(attMapV, 3, 2) 137 | if overlap: 138 | attMap = 1*(1-attMap**0.7).reshape(attMap.shape + (1,))*img + (attMap**0.7).reshape(attMap.shape+(1,)) * attMapV 139 | return attMap 140 | 141 | normalize = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)) 142 | 143 | transform = transforms.Compose([ 144 | transforms.Resize((256, 256), interpolation=Image.BICUBIC), 145 | transforms.ToTensor(), 146 | normalize, 147 | ]) 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /dataset/coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | COCO dataset which returns image_id for evaluation.
3 | 4 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 5 | """ 6 | from pathlib import Path 7 | 8 | import torch 9 | import torch.utils.data 10 | import torchvision 11 | from pycocotools import mask as coco_mask 12 | 13 | import datasets.transforms as T 14 | 15 | 16 | class CocoDetection(torchvision.datasets.CocoDetection): 17 | def __init__(self, img_folder, ann_file, transforms, return_masks): 18 | super(CocoDetection, self).__init__(img_folder, ann_file) 19 | self._transforms = transforms 20 | self.prepare = ConvertCocoPolysToMask(return_masks) 21 | 22 | def __getitem__(self, idx): 23 | img, target = super(CocoDetection, self).__getitem__(idx) 24 | image_id = self.ids[idx] 25 | target = {'image_id': image_id, 'annotations': target} 26 | 27 | img, target = self.prepare(img, target) 28 | if self._transforms is not None: 29 | img, target = self._transforms(img, target) 30 | return img, target 31 | 32 | 33 | def convert_coco_poly_to_mask(segmentations, height, width): 34 | masks = [] 35 | for polygons in segmentations: 36 | rles = coco_mask.frPyObjects(polygons, height, width) 37 | mask = coco_mask.decode(rles) 38 | if len(mask.shape) < 3: 39 | mask = mask[..., None] 40 | mask = torch.as_tensor(mask, dtype=torch.uint8) 41 | mask = mask.any(dim=2) 42 | masks.append(mask) 43 | if masks: 44 | masks = torch.stack(masks, dim=0) 45 | else: 46 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 47 | return masks 48 | 49 | 50 | class ConvertCocoPolysToMask(object): 51 | def __init__(self, return_masks=False): 52 | self.return_masks = return_masks 53 | 54 | def __call__(self, image, target): 55 | w, h = image.size 56 | 57 | image_id = target["image_id"] 58 | image_id = torch.tensor([image_id]) 59 | 60 | anno = target["annotations"] 61 | 62 | anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] 63 | 64 | boxes = [obj["bbox"] for obj in anno] 65 | # guard against no boxes via resizing 66 | boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) 67 | boxes[:, 2:] += boxes[:, :2] 68 | boxes[:, 0::2].clamp_(min=0, max=w) 69 | boxes[:, 1::2].clamp_(min=0, max=h) 70 | 71 | classes = [obj["category_id"] for obj in anno] 72 | classes = torch.tensor(classes, dtype=torch.int64) 73 | 74 | if self.return_masks: 75 | segmentations = [obj["segmentation"] for obj in anno] 76 | masks = convert_coco_poly_to_mask(segmentations, h, w) 77 | 78 | keypoints = None 79 | if anno and "keypoints" in anno[0]: 80 | keypoints = [obj["keypoints"] for obj in anno] 81 | keypoints = torch.as_tensor(keypoints, dtype=torch.float32) 82 | num_keypoints = keypoints.shape[0] 83 | if num_keypoints: 84 | keypoints = keypoints.view(num_keypoints, -1, 3) 85 | 86 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 87 | boxes = boxes[keep] 88 | classes = classes[keep] 89 | if self.return_masks: 90 | masks = masks[keep] 91 | if keypoints is not None: 92 | keypoints = keypoints[keep] 93 | 94 | target = {} 95 | target["boxes"] = boxes 96 | target["labels"] = classes 97 | if self.return_masks: 98 | target["masks"] = masks 99 | target["image_id"] = image_id 100 | if keypoints is not None: 101 | target["keypoints"] = keypoints 102 | 103 | # for conversion to coco api 104 | area = torch.tensor([obj["area"] for obj in anno]) 105 | iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) 106 | target["area"] = area[keep] 107 | target["iscrowd"] = iscrowd[keep] 108 | 109 | target["orig_size"] = 
torch.as_tensor([int(h), int(w)]) 110 | target["size"] = torch.as_tensor([int(h), int(w)]) 111 | 112 | return image, target 113 | 114 | 115 | def make_coco_transforms(image_set): 116 | 117 | normalize = T.Compose([ 118 | T.ToTensor(), 119 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 120 | ]) 121 | 122 | scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] 123 | 124 | if image_set == 'train': 125 | return T.Compose([ 126 | T.RandomHorizontalFlip(), 127 | T.RandomSelect( 128 | T.RandomResize(scales, max_size=1333), 129 | T.Compose([ 130 | T.RandomResize([400, 500, 600]), 131 | T.RandomSizeCrop(384, 600), 132 | T.RandomResize(scales, max_size=1333), 133 | ]) 134 | ), 135 | normalize, 136 | ]) 137 | 138 | if image_set == 'val': 139 | return T.Compose([ 140 | T.RandomResize([800], max_size=1333), 141 | normalize, 142 | ]) 143 | 144 | raise ValueError(f'unknown {image_set}') 145 | 146 | 147 | def build(image_set, args): 148 | root = Path(args.coco_path) 149 | assert root.exists(), f'provided COCO path {root} does not exist' 150 | mode = 'instances' 151 | PATHS = { 152 | "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'), 153 | "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'), 154 | } 155 | img_folder, ann_file = PATHS[image_set] 156 | dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks) 157 | return dataset 158 | -------------------------------------------------------------------------------- /dataset/samplers.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from codes in torch.utils.data.distributed 7 | # ------------------------------------------------------------------------ 8 | 9 | import os 10 | import math 11 | import torch 12 | import torch.distributed as dist 13 | from torch.utils.data.sampler import Sampler 14 | 15 | 16 | class DistributedSampler(Sampler): 17 | """Sampler that restricts data loading to a subset of the dataset. 18 | It is especially useful in conjunction with 19 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 20 | process can pass a DistributedSampler instance as a DataLoader sampler, 21 | and load a subset of the original dataset that is exclusive to it. 22 | .. note:: 23 | Dataset is assumed to be of constant size. 24 | Arguments: 25 | dataset: Dataset used for sampling. 26 | num_replicas (optional): Number of processes participating in 27 | distributed training. 28 | rank (optional): Rank of the current process within num_replicas. 
29 | """ 30 | 31 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 32 | if num_replicas is None: 33 | if not dist.is_available(): 34 | raise RuntimeError("Requires distributed package to be available") 35 | num_replicas = dist.get_world_size() 36 | if rank is None: 37 | if not dist.is_available(): 38 | raise RuntimeError("Requires distributed package to be available") 39 | rank = dist.get_rank() 40 | self.dataset = dataset 41 | self.num_replicas = num_replicas 42 | self.rank = rank 43 | self.epoch = 0 44 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 45 | self.total_size = self.num_samples * self.num_replicas 46 | self.shuffle = shuffle 47 | 48 | def __iter__(self): 49 | if self.shuffle: 50 | # deterministically shuffle based on epoch 51 | g = torch.Generator() 52 | g.manual_seed(self.epoch) 53 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 54 | else: 55 | indices = torch.arange(len(self.dataset)).tolist() 56 | 57 | # add extra samples to make it evenly divisible 58 | indices += indices[: (self.total_size - len(indices))] 59 | assert len(indices) == self.total_size 60 | 61 | # subsample 62 | offset = self.num_samples * self.rank 63 | indices = indices[offset : offset + self.num_samples] 64 | assert len(indices) == self.num_samples 65 | 66 | return iter(indices) 67 | 68 | def __len__(self): 69 | return self.num_samples 70 | 71 | def set_epoch(self, epoch): 72 | self.epoch = epoch 73 | 74 | 75 | class NodeDistributedSampler(Sampler): 76 | """Sampler that restricts data loading to a subset of the dataset. 77 | It is especially useful in conjunction with 78 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 79 | process can pass a DistributedSampler instance as a DataLoader sampler, 80 | and load a subset of the original dataset that is exclusive to it. 81 | .. note:: 82 | Dataset is assumed to be of constant size. 83 | Arguments: 84 | dataset: Dataset used for sampling. 85 | num_replicas (optional): Number of processes participating in 86 | distributed training. 87 | rank (optional): Rank of the current process within num_replicas. 
88 | """ 89 | 90 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 91 | if num_replicas is None: 92 | if not dist.is_available(): 93 | raise RuntimeError("Requires distributed package to be available") 94 | num_replicas = dist.get_world_size() 95 | if rank is None: 96 | if not dist.is_available(): 97 | raise RuntimeError("Requires distributed package to be available") 98 | rank = dist.get_rank() 99 | if local_rank is None: 100 | local_rank = int(os.environ.get('LOCAL_RANK', 0)) 101 | if local_size is None: 102 | local_size = int(os.environ.get('LOCAL_SIZE', 1)) 103 | self.dataset = dataset 104 | self.shuffle = shuffle 105 | self.num_replicas = num_replicas 106 | self.num_parts = local_size 107 | self.rank = rank 108 | self.local_rank = local_rank 109 | self.epoch = 0 110 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 111 | self.total_size = self.num_samples * self.num_replicas 112 | 113 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts 114 | 115 | def __iter__(self): 116 | if self.shuffle: 117 | # deterministically shuffle based on epoch 118 | g = torch.Generator() 119 | g.manual_seed(self.epoch) 120 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 121 | else: 122 | indices = torch.arange(len(self.dataset)).tolist() 123 | indices = [i for i in indices if i % self.num_parts == self.local_rank] 124 | 125 | # add extra samples to make it evenly divisible 126 | indices += indices[:(self.total_size_parts - len(indices))] 127 | assert len(indices) == self.total_size_parts 128 | 129 | # subsample 130 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts] 131 | assert len(indices) == self.num_samples 132 | 133 | return iter(indices) 134 | 135 | def __len__(self): 136 | return self.num_samples 137 | 138 | def set_epoch(self, epoch): 139 | self.epoch = epoch 140 | -------------------------------------------------------------------------------- /mask2former/modeling/meta_arch/mask_former_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from ..transformer_decoder.maskformer_transformer_decoder import build_transformer_decoder 15 | from ..pixel_decoder.fpn import build_pixel_decoder 16 | 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class MaskFormerHead(nn.Module): 20 | 21 | _version = 2 22 | 23 | def _load_from_state_dict( 24 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 25 | ): 26 | version = local_metadata.get("version", None) 27 | if version is None or version < 2: 28 | # Do not warn if train from scratch 29 | scratch = True 30 | logger = logging.getLogger(__name__) 31 | for k in list(state_dict.keys()): 32 | newk = k 33 | if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): 34 | newk = k.replace(prefix, prefix + "pixel_decoder.") 35 | # logger.debug(f"{k} ==> {newk}") 36 | if newk != k: 37 | state_dict[newk] = state_dict[k] 38 | del state_dict[k] 39 | scratch = False 40 | 41 | if not scratch: 42 | logger.warning( 43 | f"Weight format of {self.__class__.__name__} have changed! " 44 | "Please upgrade your models. Applying automatic conversion now ..." 45 | ) 46 | 47 | @configurable 48 | def __init__( 49 | self, 50 | input_shape: Dict[str, ShapeSpec], 51 | *, 52 | num_classes: int, 53 | pixel_decoder: nn.Module, 54 | loss_weight: float = 1.0, 55 | ignore_value: int = -1, 56 | # extra parameters 57 | transformer_predictor: nn.Module, 58 | transformer_in_feature: str, 59 | ): 60 | """ 61 | NOTE: this interface is experimental. 62 | Args: 63 | input_shape: shapes (channels and stride) of the input features 64 | num_classes: number of classes to predict 65 | pixel_decoder: the pixel decoder module 66 | loss_weight: loss weight 67 | ignore_value: category id to be ignored during training. 
68 | transformer_predictor: the transformer decoder that makes prediction 69 | transformer_in_feature: input feature name to the transformer_predictor 70 | """ 71 | super().__init__() 72 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 73 | self.in_features = [k for k, v in input_shape] 74 | feature_strides = [v.stride for k, v in input_shape] 75 | feature_channels = [v.channels for k, v in input_shape] 76 | 77 | self.ignore_value = ignore_value 78 | self.common_stride = 4 79 | self.loss_weight = loss_weight 80 | 81 | self.pixel_decoder = pixel_decoder 82 | self.predictor = transformer_predictor 83 | self.transformer_in_feature = transformer_in_feature 84 | 85 | self.num_classes = num_classes 86 | 87 | @classmethod 88 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 89 | # figure out in_channels to transformer predictor 90 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 91 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 92 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 93 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 94 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 95 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 96 | else: 97 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels 98 | 99 | return { 100 | "input_shape": { 101 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 102 | }, 103 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 104 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 105 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 106 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 107 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 108 | "transformer_predictor": build_transformer_decoder( 109 | cfg, 110 | transformer_predictor_in_channels, 111 | mask_classification=True, 112 | ), 113 | } 114 | 115 | def forward(self, features, mask=None): 116 | return self.layers(features, mask) 117 | 118 | def layers(self, features, mask=None): 119 | mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(features) 120 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 121 | predictions = self.predictor(multi_scale_features, mask_features, mask) 122 | else: 123 | if self.transformer_in_feature == "transformer_encoder": 124 | assert ( 125 | transformer_encoder_features is not None 126 | ), "Please use the TransformerEncoderPixelDecoder." 127 | predictions = self.predictor(transformer_encoder_features, mask_features, mask) 128 | elif self.transformer_in_feature == "pixel_embedding": 129 | predictions = self.predictor(mask_features, mask_features, mask) 130 | else: 131 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) 132 | return predictions 133 | -------------------------------------------------------------------------------- /mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | # Copyright (c) Facebook, Inc. and its affiliates. 10 | # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR 11 | 12 | from __future__ import absolute_import 13 | from __future__ import print_function 14 | from __future__ import division 15 | 16 | import warnings 17 | import math 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | from torch.nn.init import xavier_uniform_, constant_ 23 | 24 | from ..functions import MSDeformAttnFunction 25 | from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch 26 | 27 | 28 | def _is_power_of_2(n): 29 | if (not isinstance(n, int)) or (n < 0): 30 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 31 | return (n & (n-1) == 0) and n != 0 32 | 33 | 34 | class MSDeformAttn(nn.Module): 35 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 36 | """ 37 | Multi-Scale Deformable Attention Module 38 | :param d_model hidden dimension 39 | :param n_levels number of feature levels 40 | :param n_heads number of attention heads 41 | :param n_points number of sampling points per attention head per feature level 42 | """ 43 | super().__init__() 44 | if d_model % n_heads != 0: 45 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 46 | _d_per_head = d_model // n_heads 47 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 48 | if not _is_power_of_2(_d_per_head): 49 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 50 | "which is more efficient in our CUDA implementation.") 51 | 52 | self.im2col_step = 128 53 | 54 | self.d_model = d_model 55 | self.n_levels = n_levels 56 | self.n_heads = n_heads 57 | self.n_points = n_points 58 | 59 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) 60 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 61 | self.value_proj = nn.Linear(d_model, d_model) 62 | self.output_proj = nn.Linear(d_model, d_model) 63 | 64 | self._reset_parameters() 65 | 66 | def _reset_parameters(self): 67 | constant_(self.sampling_offsets.weight.data, 0.) 68 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 69 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 70 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) 71 | for i in range(self.n_points): 72 | grid_init[:, :, i, :] *= i + 1 73 | with torch.no_grad(): 74 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 75 | constant_(self.attention_weights.weight.data, 0.) 76 | constant_(self.attention_weights.bias.data, 0.) 77 | xavier_uniform_(self.value_proj.weight.data) 78 | constant_(self.value_proj.bias.data, 0.) 79 | xavier_uniform_(self.output_proj.weight.data) 80 | constant_(self.output_proj.bias.data, 0.) 
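    # On the initialization above: `sampling_offsets.weight` is zeroed, so the initial
    # offsets come entirely from its bias. That bias places the k-th sampling point of each
    # head (k+1) unit steps along a head-specific direction (the `thetas` angles are spread
    # evenly around the circle), while `attention_weights` starts at zero, which after the
    # softmax in `forward` corresponds to uniform attention over levels and points.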
81 | 82 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 83 | """ 84 | :param query (N, Length_{query}, C) 85 | :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area 86 | or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes 87 | :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) 88 | :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 89 | :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] 90 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 91 | 92 | :return output (N, Length_{query}, C) 93 | """ 94 | N, Len_q, _ = query.shape 95 | N, Len_in, _ = input_flatten.shape 96 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 97 | 98 | value = self.value_proj(input_flatten) 99 | if input_padding_mask is not None: 100 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 101 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 102 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 103 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 104 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 105 | # N, Len_q, n_heads, n_levels, n_points, 2 106 | if reference_points.shape[-1] == 2: 107 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 108 | sampling_locations = reference_points[:, :, None, :, None, :] \ 109 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 110 | elif reference_points.shape[-1] == 4: 111 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 112 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 113 | else: 114 | raise ValueError( 115 | 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) 116 | try: 117 | output = MSDeformAttnFunction.apply( 118 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) 119 | except: 120 | # CPU 121 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 122 | # # For FLOPs calculation only 123 | # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 124 | output = self.output_proj(output) 125 | return output 126 | -------------------------------------------------------------------------------- /dataset/jhmdb.py: -------------------------------------------------------------------------------- 1 | """ 2 | JHMDB-Sentences data loader 3 | modified from https://github.com/mttr2021/MTTR/blob/main/datasets/jhmdb_sentences/jhmdb_sentences_dataset.py 4 | """ 5 | from pathlib import Path 6 | 7 | import torch 8 | from torchvision.io import read_video 9 | import torchvision.transforms.functional as F 10 | 11 | from torch.utils.data import Dataset 12 | import dataset.transforms_video as T 13 | 14 | import os 15 | from PIL import Image 16 | import json 17 | import numpy as np 18 | import 
random 19 | 20 | import scipy.io 21 | 22 | def get_image_id(video_id, frame_idx): 23 | image_id = f'v_{video_id}_f_{frame_idx}' 24 | return image_id 25 | 26 | class JHMDBSentencesDataset(Dataset): 27 | """ 28 | A Torch dataset for JHMDB-Sentences. 29 | For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at: 30 | https://arxiv.org/abs/1803.07485 31 | """ 32 | def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool, 33 | num_frames: int, max_skip: int, subset): 34 | super(JHMDBSentencesDataset, self).__init__() 35 | self.dataset_path = 'data' 36 | self.ann_file = ann_file 37 | self.samples_metadata = self.get_samples_metadata() 38 | 39 | self._transforms = transforms 40 | self.return_masks = return_masks # not used 41 | self.num_frames = num_frames 42 | self.max_skip = max_skip 43 | self.subset = subset 44 | 45 | print(f'\n {subset} sample num: ', len(self.samples_metadata)) 46 | print('\n') 47 | 48 | def get_samples_metadata(self): 49 | with open(str(self.ann_file), 'r') as f: 50 | samples_metadata = [tuple(a) for a in json.load(f)] 51 | return samples_metadata 52 | 53 | @staticmethod 54 | def bounding_box(img): 55 | rows = np.any(img, axis=1) 56 | cols = np.any(img, axis=0) 57 | rmin, rmax = np.where(rows)[0][[0, -1]] 58 | cmin, cmax = np.where(cols)[0][[0, -1]] 59 | return rmin, rmax, cmin, cmax # y1, y2, x1, x2 60 | 61 | def __len__(self): 62 | return len(self.samples_metadata) 63 | 64 | def __getitem__(self, idx): 65 | # only support for evaluation 66 | video_id, chosen_frame_path, video_masks_path, video_total_frames, text_query = self.samples_metadata[idx] 67 | text_query = " ".join(text_query.lower().split()) # clean up the text query 68 | 69 | # read the source window frames: 70 | chosen_frame_idx = int(chosen_frame_path.split('/')[-1].split('.')[0]) 71 | # get a window of window_size frames with frame chosen_frame_idx in the middle. 
72 | start_idx, end_idx = chosen_frame_idx - self.num_frames // 2, chosen_frame_idx + (self.num_frames + 1) // 2 73 | frame_indices = list(range(start_idx, end_idx)) # note that jhmdb-sentences frames are 1-indexed 74 | # extract the window source frames: 75 | sample_indx = [] 76 | for i in frame_indices: 77 | i = min(max(i, 1), video_total_frames) # pad out of range indices with edge frames 78 | sample_indx.append(i) 79 | sample_indx.sort() 80 | # find the valid frame index in sampled frame list, there is only one valid frame 81 | valid_indices = sample_indx.index(chosen_frame_idx) 82 | 83 | # read frames 84 | imgs, boxes, masks, valid = [], [], [], [] 85 | for i in sample_indx: 86 | p = '/'.join(chosen_frame_path.split('/')[:-1]) + f'/{i:05d}.png' 87 | frame_path = os.path.join(self.dataset_path, p) 88 | imgs.append(Image.open(frame_path).convert('RGB')) 89 | 90 | # read the instance masks: 91 | video_masks_path = os.path.join(self.dataset_path, video_masks_path) 92 | all_video_masks = scipy.io.loadmat(video_masks_path)['part_mask'].transpose(2, 0, 1) # [T, H, W] 93 | # note that to take the center-frame corresponding mask we switch to 0-indexing: 94 | instance_mask = torch.tensor(all_video_masks[chosen_frame_idx - 1]) # [H, W] 95 | mask = instance_mask.numpy() 96 | if (mask > 0).any(): 97 | y1, y2, x1, x2 = self.bounding_box(mask) 98 | box = torch.tensor([x1, y1, x2, y2]).to(torch.float) 99 | valid.append(1) 100 | else: # some frame didn't contain the instance 101 | box = torch.tensor([0, 0, 0, 0]).to(torch.float) 102 | valid.append(0) 103 | mask = torch.from_numpy(mask) 104 | boxes.append(box) 105 | masks.append(mask) 106 | 107 | # transform 108 | h, w = instance_mask.shape[-2:] 109 | boxes = torch.stack(boxes, dim=0) 110 | boxes[:, 0::2].clamp_(min=0, max=w) 111 | boxes[:, 1::2].clamp_(min=0, max=h) 112 | masks = torch.stack(masks, dim=0) 113 | # there is only one valid frame 114 | target = { 115 | 'frames_idx': torch.tensor(sample_indx), # [T,] 116 | 'valid_indices': torch.tensor([valid_indices]), 117 | 'boxes': boxes, # [1, 4], xyxy 118 | 'masks': masks, # [1, H, W] 119 | 'valid': torch.tensor(valid), # [1,] 120 | 'caption': text_query, 121 | 'orig_size': torch.as_tensor([int(h), int(w)]), 122 | 'size': torch.as_tensor([int(h), int(w)]), 123 | 'image_id': get_image_id(video_id, chosen_frame_idx) 124 | } 125 | 126 | # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform 127 | imgs, target = self._transforms(imgs, target) 128 | imgs = torch.stack(imgs, dim=0) # [T, 3, H, W] 129 | 130 | # in 'val', valid always satisfies 131 | return imgs, target 132 | 133 | 134 | def make_coco_transforms(image_set, max_size=640): 135 | normalize = T.Compose([ 136 | T.ToTensor(), 137 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 138 | ]) 139 | 140 | scales = [288, 320, 352, 392, 416, 448, 480, 512] 141 | 142 | if image_set == 'train': 143 | return T.Compose([ 144 | T.RandomHorizontalFlip(), 145 | T.PhotometricDistort(), 146 | T.RandomSelect( 147 | T.Compose([ 148 | T.RandomResize(scales, max_size=max_size), 149 | T.Check(), 150 | ]), 151 | T.Compose([ 152 | T.RandomResize([400, 500, 600]), 153 | T.RandomSizeCrop(384, 600), 154 | T.RandomResize(scales, max_size=max_size), 155 | T.Check(), 156 | ]) 157 | ), 158 | normalize, 159 | ]) 160 | 161 | # we do not use the 'val' set since the annotations are inaccessible 162 | if image_set == 'val': 163 | return T.Compose([ 164 | T.RandomResize([360], max_size=640), 165 | normalize, 166 | ]) 167 | 168 | raise 
ValueError(f'unknown {image_set}') 169 | 170 | 171 | def build(image_set, args): 172 | root = Path(args.jhmdb_path) 173 | assert root.exists(), f'provided JHMDB-Sentences path {root} does not exist' 174 | PATHS = { 175 | "train": (root, root / "jhmdb_sentences_samples_metadata.json"), # not used 176 | "val": (root, root / "jhmdb_sentences_samples_metadata.json"), 177 | } 178 | img_folder, ann_file = PATHS[image_set] 179 | dataset = JHMDBSentencesDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), 180 | return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set) 181 | return dataset -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/maskformer_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | import fvcore.nn.weight_init as weight_init 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from detectron2.config import configurable 9 | from detectron2.layers import Conv2d 10 | from detectron2.utils.registry import Registry 11 | 12 | from .position_encoding import PositionEmbeddingSine 13 | from .transformer import Transformer 14 | 15 | 16 | TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") 17 | TRANSFORMER_DECODER_REGISTRY.__doc__ = """ 18 | Registry for transformer module in MaskFormer. 19 | """ 20 | 21 | 22 | def build_transformer_decoder(cfg, in_channels, mask_classification=True): 23 | """ 24 | Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. 25 | """ 26 | name = cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME 27 | return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) 28 | 29 | 30 | @TRANSFORMER_DECODER_REGISTRY.register() 31 | class StandardTransformerDecoder(nn.Module): 32 | @configurable 33 | def __init__( 34 | self, 35 | in_channels, 36 | mask_classification=True, 37 | *, 38 | num_classes: int, 39 | hidden_dim: int, 40 | num_queries: int, 41 | nheads: int, 42 | dropout: float, 43 | dim_feedforward: int, 44 | enc_layers: int, 45 | dec_layers: int, 46 | pre_norm: bool, 47 | deep_supervision: bool, 48 | mask_dim: int, 49 | enforce_input_project: bool, 50 | ): 51 | """ 52 | NOTE: this interface is experimental. 
53 | Args: 54 | in_channels: channels of the input features 55 | mask_classification: whether to add mask classifier or not 56 | num_classes: number of classes 57 | hidden_dim: Transformer feature dimension 58 | num_queries: number of queries 59 | nheads: number of heads 60 | dropout: dropout in Transformer 61 | dim_feedforward: feature dimension in feedforward network 62 | enc_layers: number of Transformer encoder layers 63 | dec_layers: number of Transformer decoder layers 64 | pre_norm: whether to use pre-LayerNorm or not 65 | deep_supervision: whether to add supervision to every decoder layers 66 | mask_dim: mask feature dimension 67 | enforce_input_project: add input project 1x1 conv even if input 68 | channels and hidden dim is identical 69 | """ 70 | super().__init__() 71 | 72 | self.mask_classification = mask_classification 73 | 74 | # positional encoding 75 | N_steps = hidden_dim // 2 76 | self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) 77 | 78 | transformer = Transformer( 79 | d_model=hidden_dim, 80 | dropout=dropout, 81 | nhead=nheads, 82 | dim_feedforward=dim_feedforward, 83 | num_encoder_layers=enc_layers, 84 | num_decoder_layers=dec_layers, 85 | normalize_before=pre_norm, 86 | return_intermediate_dec=deep_supervision, 87 | ) 88 | 89 | self.num_queries = num_queries 90 | self.transformer = transformer 91 | hidden_dim = transformer.d_model 92 | 93 | self.query_embed = nn.Embedding(num_queries, hidden_dim) 94 | 95 | if in_channels != hidden_dim or enforce_input_project: 96 | self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) 97 | weight_init.c2_xavier_fill(self.input_proj) 98 | else: 99 | self.input_proj = nn.Sequential() 100 | self.aux_loss = deep_supervision 101 | 102 | # output FFNs 103 | if self.mask_classification: 104 | self.class_embed = nn.Linear(hidden_dim, num_classes + 1) 105 | self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) 106 | 107 | @classmethod 108 | def from_config(cls, cfg, in_channels, mask_classification): 109 | ret = {} 110 | ret["in_channels"] = in_channels 111 | ret["mask_classification"] = mask_classification 112 | 113 | ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES 114 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM 115 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES 116 | # Transformer parameters: 117 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS 118 | ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT 119 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD 120 | ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS 121 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS 122 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM 123 | ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION 124 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ 125 | 126 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 127 | 128 | return ret 129 | 130 | def forward(self, x, mask_features, mask=None): 131 | if mask is not None: 132 | mask = F.interpolate(mask[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 133 | pos = self.pe_layer(x, mask) 134 | 135 | src = x 136 | hs, memory = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos) 137 | 138 | if self.mask_classification: 139 | outputs_class = self.class_embed(hs) 140 | out = {"pred_logits": outputs_class[-1]} 141 | else: 142 | out = {} 143 | 144 | if self.aux_loss: 145 | # [l, bs, queries, embed] 146 | mask_embed = self.mask_embed(hs) 147 | outputs_seg_masks = 
torch.einsum("lbqc,bchw->lbqhw", mask_embed, mask_features) 148 | out["pred_masks"] = outputs_seg_masks[-1] 149 | out["aux_outputs"] = self._set_aux_loss( 150 | outputs_class if self.mask_classification else None, outputs_seg_masks 151 | ) 152 | else: 153 | # FIXME h_boxes takes the last one computed, keep this in mind 154 | # [bs, queries, embed] 155 | mask_embed = self.mask_embed(hs[-1]) 156 | outputs_seg_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) 157 | out["pred_masks"] = outputs_seg_masks 158 | return out 159 | 160 | @torch.jit.unused 161 | def _set_aux_loss(self, outputs_class, outputs_seg_masks): 162 | # this is a workaround to make torchscript happy, as torchscript 163 | # doesn't support dictionary with non-homogeneous values, such 164 | # as a dict having both a Tensor and a list. 165 | if self.mask_classification: 166 | return [ 167 | {"pred_logits": a, "pred_masks": b} 168 | for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) 169 | ] 170 | else: 171 | return [{"pred_masks": b} for b in outputs_seg_masks[:-1]] 172 | 173 | 174 | class MLP(nn.Module): 175 | """Very simple multi-layer perceptron (also called FFN)""" 176 | 177 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 178 | super().__init__() 179 | self.num_layers = num_layers 180 | h = [hidden_dim] * (num_layers - 1) 181 | self.layers = nn.ModuleList( 182 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) 183 | ) 184 | 185 | def forward(self, x): 186 | for i, layer in enumerate(self.layers): 187 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 188 | return x 189 | --------------------------------------------------------------------------------
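The mask prediction step at the heart of `StandardTransformerDecoder.forward` is the `torch.einsum("bqc,bchw->bqhw", ...)` contraction: each query's mask embedding is dotted with every per-pixel mask feature to produce that query's mask logits. The self-contained sketch below illustrates just that step with made-up tensor sizes that do not correspond to any particular config:

```python
import torch

# Toy sizes for illustration only: batch of 2, 100 queries, 256-dim embeddings,
# and a 32x32 mask-feature map.
bs, q, c, h, w = 2, 100, 256, 32, 32

mask_embed = torch.randn(bs, q, c)         # per-query mask embeddings from the MLP head, [B, Q, C]
mask_features = torch.randn(bs, c, h, w)   # per-pixel features from the pixel decoder, [B, C, H, W]

# Each query's mask logit at a pixel is the dot product between its embedding and
# that pixel's feature vector, the same "bqc,bchw->bqhw" contraction used in the decoder.
pred_masks = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features)
print(pred_masks.shape)  # torch.Size([2, 100, 32, 32])
```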